diff --git a/Cargo.toml b/Cargo.toml index 8c5b10c581..79f9ee9fa3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.1.15" +version = "0.1.16" edition = "2021" description = "Fast and easy LLM serving." homepage = "https://github.com/EricLBuehler/mistral.rs" diff --git a/README.md b/README.md index 77e8a4adb5..b82b61588d 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis - φ³ 📷 Run the Phi 3 vision model: [documentation and guide here](docs/PHI3V.md) - Mount Everest + Mount Washington *After following installation instructions* @@ -322,7 +322,8 @@ The following tokenizer model types are currently supported. If you would like o please consider using the method demonstrated in examples below, where the tokenizer is sourced from Hugging Face. **Supported GGUF tokenizer types** -- `llama` +- `llama` (sentencepiece) +- `gpt2` (BPE) ## Run with the CLI diff --git a/docs/PHI3V.md b/docs/PHI3V.md index b3a1f66a0b..0d8e5628a1 100644 --- a/docs/PHI3V.md +++ b/docs/PHI3V.md @@ -1,6 +1,6 @@ # Phi 3 Vision Support: `microsoft/Phi-3-vision-128k-instruct` -The Phi 3 Vision Model has support in the Rust, Python, and HTTP APIs. +The Phi 3 Vision Model has support in the Rust, Python, and HTTP APIs. The Phi 3 Vision Model supports ISQ for increased performance. > Note: The Phi 3 Vision model works best with one image although it is supported to send multiple images. @@ -17,7 +17,7 @@ We support an OpenAI compatible HTTP API for vision models. This example demonst --- **Image:** -Mount Everest +Mount Washington **Prompt:** ``` @@ -26,7 +26,7 @@ We support an OpenAI compatible HTTP API for vision models. This example demonst **Output:** ``` -The image shows a large, snow-covered mountain with a clear blue sky. There are no visible clouds or precipitation, and the mountain appears to be quite steep with visible crevices and ridges. The surrounding landscape includes rocky terrain at the base of the mountain. +The image shows a snow-covered mountain with a clear sky above and trees at the base. There appears to be a trail or path leading up the mountain, and some structures can be seen on the peak. ``` --- @@ -53,7 +53,7 @@ completion = openai.chat.completions.create( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg" + "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" }, }, { diff --git a/examples/http.md b/examples/http.md index 0643b7c9e8..0f0ea964ab 100644 --- a/examples/http.md +++ b/examples/http.md @@ -125,158 +125,3 @@ Example with `curl`: ```bash curl http://localhost:/re_isq -H "Content-Type: application/json" -H "Authorization: Bearer EMPTY" -d '{"ggml_type":"Q4K"}' ``` - - -Streaming requests are not supported. - -## Request -### `ChatCompletionRequest` -OpenAI compatible request. 
-```rust -pub struct ChatCompletionRequest { - pub messages: Either, String>, - pub model: String, - pub logit_bias: Option>, - // Default false - pub logprobs: bool, - pub top_logprobs: Option, - pub max_tokens: Option, - // Default 1 - pub n: usize, - pub presence_penalty: Option, - pub frequency_penalty: Option, - pub stop: Option, - // Default 1 - pub temperature: Option, - // Default 1 - pub top_p: Option, - // Default -1 to consider all - pub top_k: Option, - pub stream: bool, - pub adapters: Option>, -} -``` - -### `CompletionRequest` -```rust -pub struct CompletionRequest { - pub model: String, - pub prompt: String, - pub best_of: usize, - pub echo_prompt: bool, - pub presence_penalty: Option, - pub frequency_penalty: Option, - pub logit_bias: Option>, - // Default false - pub logprobs: Option, - pub max_tokens: Option, - // Default 1 - pub n: usize, - pub stop_seqs: Option, - pub temperature: Option, - pub top_p: Option, - pub suffix: Option, - - // mistral.rs additional - pub top_k: Option, - pub grammar: Option, - pub adapters: Option>, -} -``` - -### `Message` -Message with role of either `user`, `system` or `assistant`. -```rust -pub struct Message { - pub content: String, - pub role: String, - pub name: Option, -} -``` - -### `StopTokens` -Stop tokens. Each item in a `Multi` variant should represent one token. -```rust -pub enum StopTokens { - Multi(Vec), - Single(String), - MultiId(Vec), - SingleId(u32), -} -``` - -## Response - -### `ChatCompletionResponse` -The OpenAI compatible chat completion response. -```rust -pub struct ChatCompletionResponse { - pub id: String, - pub choices: Vec, - pub created: u64, - pub model: &'static str, - pub system_fingerprint: String, - pub object: String, - pub usage: Usage, -} -``` - - -### `Choice` -An individual choice, containing a `ResponseMessage` and maybe `Logprobs`. -```rust -pub struct Choice { - pub finish_reason: String, - pub index: usize, - pub message: ResponseMessage, - pub logprobs: Option, -} -``` - -### `ResponseMessage` -```rust -pub struct ResponseMessage { - pub content: String, - pub role: String, -} -``` - -### `Logprobs` -Logprobs and top logprobs for each token. -```rust -pub struct Logprobs { - pub content: Option>, -} -``` - -### `ResponseLogprob` -Logprobs and top logprobs for each token, with corresponding bytes. Top logprobs are ordered in descending probability. 
-```rust -pub struct ResponseLogprob { - pub token: String, - pub logprob: f32, - pub bytes: Vec, - pub top_logprobs: Vec, -} -``` - -### `TopLogprob` -```rust -pub struct TopLogprob { - pub token: u32, - pub logprob: f32, - pub bytes: String, -} -``` - -### `Usage` -```rust -pub struct Usage { - pub completion_tokens: usize, - pub prompt_tokens: usize, - pub total_tokens: usize, - pub avg_tok_per_sec: f32, - pub avg_prompt_tok_per_sec: f32, - pub avg_compl_tok_per_sec: f32, -} -``` \ No newline at end of file diff --git a/examples/python/phi3v.py b/examples/python/phi3v.py index 0561d52ad2..de66424003 100644 --- a/examples/python/phi3v.py +++ b/examples/python/phi3v.py @@ -19,7 +19,7 @@ { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg" + "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" }, }, { diff --git a/examples/server/phi3v.py b/examples/server/phi3v.py index 4aca8dba81..ee84a593d2 100644 --- a/examples/server/phi3v.py +++ b/examples/server/phi3v.py @@ -44,7 +44,7 @@ def log_response(response: httpx.Response): { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg" + "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" }, }, { diff --git a/mistralrs-bench/Cargo.toml b/mistralrs-bench/Cargo.toml index 0c4ebdb80c..82ae2df12c 100644 --- a/mistralrs-bench/Cargo.toml +++ b/mistralrs-bench/Cargo.toml @@ -17,7 +17,7 @@ candle-core.workspace = true serde.workspace = true serde_json.workspace = true clap.workspace = true -mistralrs-core = { version = "0.1.15", path = "../mistralrs-core" } +mistralrs-core = { version = "0.1.16", path = "../mistralrs-core" } tracing.workspace = true either.workspace = true tokio.workspace = true diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml index 04f224972c..2dac553c62 100644 --- a/mistralrs-core/Cargo.toml +++ b/mistralrs-core/Cargo.toml @@ -17,7 +17,6 @@ candle-core.workspace = true candle-nn.workspace = true serde.workspace = true serde_json.workspace = true -candle-transformers = { git = "https://github.com/EricLBuehler/candle.git", version = "0.5.0" } candle-flash-attn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.5.0", optional = true } dirs = "5.0.1" hf-hub = "0.3.2" @@ -60,18 +59,17 @@ akin = "0.4.0" variantly = "0.4.0" buildstructor = "0.5.4" tracing-subscriber.workspace = true -reqwest.workspace = true derive-new = "0.6.0" itertools = "0.13.0" mistralrs-vision = { version = "0.1.13", path = "../mistralrs-vision" } [features] pyo3_macros = ["pyo3"] -cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"] +cuda = ["candle-core/cuda", "candle-nn/cuda"] cudnn = ["candle-core/cudnn"] -metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"] -flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"] -accelerate = ["candle-core/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"] -mkl = ["candle-core/mkl", "candle-nn/mkl", "candle-transformers/mkl"] +metal = ["candle-core/metal", "candle-nn/metal"] +flash-attn = ["cuda", "dep:candle-flash-attn"] +accelerate = ["candle-core/accelerate", "candle-nn/accelerate"] +mkl = ["candle-core/mkl", "candle-nn/mkl"] profile = [] diff --git a/mistralrs-core/README.md 
b/mistralrs-core/README.md index e69de29bb2..78fedabdb8 100644 --- a/mistralrs-core/README.md +++ b/mistralrs-core/README.md @@ -0,0 +1,5 @@ +# `mistralrs-core` + +Core crate of `mistral.rs` including the models and associated executors. + +Documentation: https://ericlbuehler.github.io/mistral.rs/mistralrs/ diff --git a/mistralrs-core/src/device_map.rs b/mistralrs-core/src/device_map.rs index 23062ca7e1..6c99d89346 100644 --- a/mistralrs-core/src/device_map.rs +++ b/mistralrs-core/src/device_map.rs @@ -6,6 +6,7 @@ use serde::Deserialize; use tracing::info; #[derive(Debug, Default, Deserialize, Clone)] +/// Metadata to initialize the device mapper. pub struct DeviceMapMetadata { device_layers: Option, host_layers: Option, @@ -80,6 +81,7 @@ pub trait DeviceMapper: Debug { } #[derive(Debug)] +/// A device mapper which does device mapping per hidden layer. pub struct LayerDeviceMapper { mappings: Vec, nm_device: Device, diff --git a/mistralrs-core/src/gguf/gguf_tokenizer.rs b/mistralrs-core/src/gguf/gguf_tokenizer.rs index e2ed29b5df..2209626060 100644 --- a/mistralrs-core/src/gguf/gguf_tokenizer.rs +++ b/mistralrs-core/src/gguf/gguf_tokenizer.rs @@ -1,11 +1,21 @@ -use std::sync::atomic::Ordering; +// https://github.com/huggingface/transformers/blob/8685b3c5d2dd2550527773d2a02499495a759e31/src/transformers/convert_slow_tokenizer.py + +use std::{collections::HashMap, sync::atomic::Ordering}; use anyhow::Result; use candle_core::quantized::gguf_file::Content; +use itertools::Itertools; use tokenizers::{ - decoders::{self, byte_fallback::ByteFallback, fuse::Fuse, strip::Strip}, - models::unigram::Unigram, + decoders::{ + self, byte_fallback::ByteFallback, byte_level::ByteLevel, fuse::Fuse, strip::Strip, + }, + models::{bpe::BpeBuilder, unigram::Unigram}, normalizers::{self, Prepend, Replace}, + pre_tokenizers, + processors::{ + self, + template::{self, TemplateProcessing}, + }, AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer, }; use tracing::info; @@ -13,7 +23,7 @@ use tracing::info; use crate::utils::gguf_metadata::ContentMetadata; use crate::DEBUG; -pub struct GgufTokenizerConversion { +pub(crate) struct GgufTokenizerConversion { pub tokenizer: Tokenizer, pub bos: Option, pub eos: Option, @@ -29,6 +39,7 @@ struct PropsGGUF { unk: Option, eos: u32, bos: u32, + add_bos_token: Option, } impl TryFrom> for PropsGGUF { @@ -47,12 +58,19 @@ impl TryFrom> for PropsGGUF { unk: c.get_value("unknown_token_id").ok(), eos: c.get_value("eos_token_id")?, bos: c.get_value("bos_token_id")?, + add_bos_token: c.get_value("add_bos_token").ok(), }; Ok(props) } } +struct AddedTokensCollection { + bos: String, + eos: String, + unk: Option, +} + pub fn convert_gguf_to_hf_tokenizer(content: &Content) -> Result { let metadata = ContentMetadata { path_prefix: "tokenizer.ggml", @@ -62,6 +80,7 @@ pub fn convert_gguf_to_hf_tokenizer(content: &Content) -> Result unigram_tokenizer(&props)?, + "gpt2" => bpe_tokenizer(&props)?, other => { anyhow::bail!("Tokenizer model `{other}` not supported."); } @@ -79,26 +98,55 @@ pub fn convert_gguf_to_hf_tokenizer(content: &Content) -> Result, +) -> AddedTokensCollection { + // Add special tokens (bos, eos, unk): + let mut special_tokens: [Option; 3] = Default::default(); + + // A little bit awkward here since eos/bos are assumed not options so we need to handle an Option + for (i, token_id) in [Some(bos), Some(eos), unk].into_iter().enumerate() { + if let Some(token_id) = token_id { + let token = p.tokens[token_id as usize].as_str(); + special_tokens[i] = 
Some(token.to_string()); + tokenizer.add_special_tokens(&[AddedToken::from(token.to_string(), true)]); + } + } + + // Destructure array of options: + let [bos_str, eos_str, unk_str] = special_tokens; + // Would need to unwrap bos/eos here, or change the struct types + AddedTokensCollection { + bos: bos_str.unwrap(), + eos: eos_str.unwrap(), + unk: unk_str, + } } -fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, Vec)> { +fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> { let PropsGGUF { unk, eos, bos, .. } = *p; // Unigram (SentencePiece) default UNK is 0 let unk = unk.unwrap_or(0); @@ -140,15 +188,84 @@ fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, Vec::new(); - for token_id in [bos, eos, unk] { - let token = p.tokens[token_id as usize].as_str(); + let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, Some(unk)); + + Ok((tokenizer, TokenizerKind::Unigram, special_tokens)) +} - special_tokens.push(token.to_owned()); - tokenizer.add_special_tokens(&[AddedToken::from(token.to_owned(), true)]); +fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> { + // BPE merges have each string item as a space-delimited pair: + // https://github.com/EricLBuehler/mistral.rs/pull/397#discussion_r1631988370 + let merges = p + .merges + .as_ref() + .ok_or(anyhow::Error::msg("BPE tokenizer must include merges"))? + .iter() + .map(|merge| { + let split: (&str, &str) = merge + .splitn(2, ' ') + .collect_tuple() + .expect("Failed to convert split into 2-tuple"); + (split.0.to_string(), split.1.to_string()) + }) + .collect::>(); + + let mut vocab = HashMap::new(); + for (i, token) in p.tokens.iter().enumerate() { + #[allow(clippy::cast_possible_truncation)] + vocab.insert(token.clone(), i as u32); } - Ok((tokenizer, TokenizerKind::Unigram, special_tokens)) + let PropsGGUF { + eos, + bos, + unk, + add_bos_token, + .. + } = *p; + + let mut bpe = BpeBuilder::new().vocab_and_merges(vocab, merges); + if let Some(unk) = unk { + bpe = bpe.unk_token(p.tokens[unk as usize].to_string()); + }; + + let bpe = bpe.build().map_err(anyhow::Error::msg)?; + + let mut tokenizer = TokenizerX::try_builder() + .with_model(bpe) + .with_decoder(Decoder::ByteLevel(true, true, true)) + .build()?; + tokenizer.with_pre_tokenizer(pre_tokenizers::byte_level::ByteLevel::new( + false, true, true, + )); + if add_bos_token.is_some_and(|x| x) { + let mut special_toks = HashMap::new(); + special_toks.insert( + p.tokens[bos as usize].clone(), + template::SpecialToken::new( + p.tokens[bos as usize].clone(), + vec![bos], + vec![p.tokens[bos as usize].clone()], + ) + .unwrap(), + ); + tokenizer.with_post_processor( + TemplateProcessing::builder() + .try_single(format!("{}:0 $A:0", p.tokens[bos as usize])) + .unwrap() + .try_pair(format!("{}:0 $A:0 $B:1", p.tokens[bos as usize])) + .unwrap() + .special_tokens(special_toks) + .build() + .unwrap(), + ); + } else { + tokenizer.with_post_processor(processors::byte_level::ByteLevel::new(true, false, true)); + } + + let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, unk); + + Ok((tokenizer, TokenizerKind::Bpe, special_tokens)) } // This is a workaround to have a better builder API. 
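A minimal, runnable sketch of the merge handling in `bpe_tokenizer` above. Each GGUF merge entry is a single space-delimited string pair; `splitn(2, ' ')` splits on the first space only, so any further spaces stay in the right-hand piece, and `collect_tuple` rejects entries with no space at all. The sample entries are from the standard GPT-2 merge list (`Ġ` marks a leading space); everything else mirrors the code above.

```rust
use itertools::Itertools;

/// Split one GGUF merge entry ("left right") into the 2-tuple that
/// `BpeBuilder::vocab_and_merges` expects.
fn parse_merge(merge: &str) -> (String, String) {
    let (left, right): (&str, &str) = merge
        .splitn(2, ' ')
        .collect_tuple()
        .expect("merge entry must be a space-delimited pair");
    (left.to_string(), right.to_string())
}

fn main() {
    // "Ġ t" is the first merge of the GPT-2 vocabulary.
    assert_eq!(parse_merge("Ġ t"), ("Ġ".to_string(), "t".to_string()));
    assert_eq!(parse_merge("h e"), ("h".to_string(), "e".to_string()));
}
```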
@@ -187,6 +304,7 @@ enum Decoder<'a> { Replace(&'a str, &'a str), Strip(char, usize, usize), Sequence(Vec), + ByteLevel(bool, bool, bool), } // Convert into upstream type wrapped enum variants: @@ -209,6 +327,9 @@ impl TryFrom> for DecoderWrapper { decoders::sequence::Sequence::new(seq).into() } + Decoder::ByteLevel(add_prefix_space, trim_offsets, use_regex) => { + ByteLevel::new(add_prefix_space, trim_offsets, use_regex).into() + } }; Ok(value) @@ -285,6 +406,24 @@ mod tests { .map_err(anyhow::Error::msg) .map(|res| res.tokenizer) } + TokenizerType::Gpt2 => { + let api = ApiBuilder::new().with_progress(true).build().unwrap(); + let api = api.repo(Repo::with_revision( + "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF".to_string(), + RepoType::Model, + "main".to_string(), + )); + + let filename = api.get("Meta-Llama-3-8B-Instruct.Q2_K.gguf").unwrap(); + let mut file = std::fs::File::open(&filename)?; + convert_gguf_to_hf_tokenizer( + &Content::read(&mut file) + .map_err(|e| e.with_path(filename)) + .map_err(anyhow::Error::msg)?, + ) + .map_err(anyhow::Error::msg) + .map(|res| res.tokenizer) + } other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"), } } @@ -302,6 +441,17 @@ mod tests { let tokenizer_filename = api.get("tokenizer.json").unwrap(); Ok(Tokenizer::from_file(tokenizer_filename).unwrap()) } + TokenizerType::Gpt2 => { + let api = ApiBuilder::new().with_progress(true).build().unwrap(); + let api = api.repo(Repo::with_revision( + "EricB/mistralrs_tests".to_string(), + RepoType::Model, + "main".to_string(), + )); + + let tokenizer_filename = api.get("tokenizer_gpt2.json").unwrap(); + Ok(Tokenizer::from_file(tokenizer_filename).unwrap()) + } other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"), } } @@ -362,6 +512,31 @@ mod tests { Ok(()) } + #[test] + fn test_encode_gpt2() -> Result<()> { + let passage = get_test_passage(); + let hf_tokenizer = get_hf_tokenizer(TokenizerType::Gpt2)?; + let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Gpt2)?; + + // Without adding special tokens + let hf_decoded = codec_roundtrip(&hf_tokenizer, passage.as_str(), false)?; + let gguf_decoded = codec_roundtrip(&gguf_tokenizer, passage.as_str(), false)?; + assert_eq!(hf_decoded, gguf_decoded); + assert_eq!(passage, gguf_decoded); + + // With special tokens added + // SKIPPED: + // - Bugged the GGUF tokenizer does not prepend ` ` + // - Due to HF tokenizer using BPE (tokenizer.json) while GGUF tokenizer uses Unigram (metadata)? 
+ /* + let hf_decoded = codec_roundtrip(&hf_tokenizer, passage.as_str(), true)?; + let gguf_decoded = codec_roundtrip(&gguf_tokenizer, passage.as_str(), true)?; + assert_eq!(hf_decoded, gguf_decoded); + */ + + Ok(()) + } + #[test] fn test_decode_llama() -> Result<()> { use rand::seq::SliceRandom; @@ -386,4 +561,29 @@ mod tests { Ok(()) } + + #[test] + fn test_decode_gpt2() -> Result<()> { + use rand::seq::SliceRandom; + use rand::thread_rng; + + let hf_tokenizer = get_hf_tokenizer(TokenizerType::Gpt2)?; + let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Gpt2)?; + + #[allow(clippy::cast_possible_truncation)] + let mut tokens = (0..hf_tokenizer.get_vocab_size(false) as u32).collect::>(); + tokens.shuffle(&mut thread_rng()); + + // Without skipping special tokens + let hf_decoded = decode(&hf_tokenizer, &tokens, false)?; + let gguf_decoded = decode(&gguf_tokenizer, &tokens, false)?; + assert_eq!(hf_decoded, gguf_decoded); + + // With skipping special tokens + let hf_decoded = decode(&hf_tokenizer, &tokens, true)?; + let gguf_decoded = decode(&gguf_tokenizer, &tokens, true)?; + assert_eq!(hf_decoded, gguf_decoded); + + Ok(()) + } } diff --git a/mistralrs-core/src/gguf/mod.rs b/mistralrs-core/src/gguf/mod.rs index fedf5fe15a..8d2120c57c 100644 --- a/mistralrs-core/src/gguf/mod.rs +++ b/mistralrs-core/src/gguf/mod.rs @@ -1,3 +1,3 @@ mod gguf_tokenizer; -pub use gguf_tokenizer::{convert_gguf_to_hf_tokenizer, GgufTokenizerConversion}; +pub(crate) use gguf_tokenizer::{convert_gguf_to_hf_tokenizer, GgufTokenizerConversion}; diff --git a/mistralrs-core/src/layers_masker.rs b/mistralrs-core/src/layers_masker.rs index 71f21081d3..586d67b9f8 100644 --- a/mistralrs-core/src/layers_masker.rs +++ b/mistralrs-core/src/layers_masker.rs @@ -1,13 +1,8 @@ #![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)] -use std::{collections::HashMap, ops::Add, sync::Mutex}; +use std::ops::Add; use candle_core::{DType, Device, Result, Tensor, WithDType}; -use once_cell::sync::Lazy; - -// (bs, tgt_len, past_kv_len) -type MaskKey = (usize, usize, usize); -static MASKS: Lazy>> = Lazy::new(|| Mutex::new(HashMap::new())); // https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_attn_mask_utils.py pub struct CausalMasker; @@ -93,14 +88,8 @@ impl CausalMasker { if tgt_len == 1 { return Ok(None); } - let res = MASKS - .lock() - .unwrap() - .get(&(b_sz, tgt_len, past_kv_len)) - .cloned(); - let causal_mask = if let Some(mask) = res { - return Ok(Some(mask)); - } else { + + let causal_mask = { let mask = self.make_mask(tgt_len, past_kv_len, input_ids.device())?; let mask = mask .expand((b_sz, 1, tgt_len, tgt_len + past_kv_len))? 
@@ -119,10 +108,6 @@ impl CausalMasker { f32::NEG_INFINITY, )?; - MASKS - .lock() - .unwrap() - .insert((b_sz, tgt_len, past_kv_len), mask.clone()); Ok(mask) }); let mask: Option = if let Some(mask) = causal_mask { @@ -150,14 +135,8 @@ impl CausalMasker { if tgt_len == 1 { return Ok(None); } - let res = MASKS - .lock() - .unwrap() - .get(&(b_sz, tgt_len, past_kv_len)) - .cloned(); - let causal_mask = if let Some(mask) = res { - return Ok(Some(mask)); - } else { + + let causal_mask = { let mask = self.make_mask(tgt_len, past_kv_len, input_ids.device())?; let diagonal = past_kv_len as isize - sliding_window as isize - 1; let context_mask = apply_tril(&mask.ones_like()?, diagonal)?; @@ -180,10 +159,6 @@ impl CausalMasker { f32::NEG_INFINITY, )?; - MASKS - .lock() - .unwrap() - .insert((b_sz, tgt_len, past_kv_len), mask.clone()); Ok(mask) }); let mask: Option = if let Some(mask) = causal_mask { @@ -209,25 +184,13 @@ impl CausalMasker { if tgt_len == 1 { return Ok(None); } - let res = MASKS - .lock() - .unwrap() - .get(&(b_sz, tgt_len, past_kv_len)) - .cloned(); - if let Some(mask) = res { - Ok(Some(mask)) - } else { - let mask = self.make_mask(tgt_len, past_kv_len, input_ids.device())?; - let mask = mask - .expand((b_sz, 1, tgt_len, tgt_len + past_kv_len))? - .to_dtype(DType::U8)?; - MASKS - .lock() - .unwrap() - .insert((b_sz, tgt_len, past_kv_len), mask.clone()); - Ok(Some(mask)) - } + let mask = self.make_mask(tgt_len, past_kv_len, input_ids.device())?; + let mask = mask + .expand((b_sz, 1, tgt_len, tgt_len + past_kv_len))? + .to_dtype(DType::U8)?; + + Ok(Some(mask)) } #[deprecated( @@ -251,28 +214,16 @@ impl CausalMasker { if tgt_len == 1 { return Ok(None); } - let res = MASKS - .lock() - .unwrap() - .get(&(b_sz, tgt_len, past_kv_len)) - .cloned(); - if let Some(mask) = res { - Ok(Some(mask)) - } else { - let mask = self.make_mask(tgt_len, past_kv_len, input_ids.device())?; - let diagonal = past_kv_len as isize - sliding_window as isize - 1; - let context_mask = apply_tril(&mask.ones_like()?, diagonal)?; - let mask = masked_fill(&mask.to_dtype(DType::F32)?, &context_mask, f32::MIN)?; - let mask = mask - .expand((b_sz, 1, tgt_len, tgt_len + past_kv_len))? - .to_dtype(DType::U8)?; - MASKS - .lock() - .unwrap() - .insert((b_sz, tgt_len, past_kv_len), mask.clone()); - Ok(Some(mask)) - } + let mask = self.make_mask(tgt_len, past_kv_len, input_ids.device())?; + let diagonal = past_kv_len as isize - sliding_window as isize - 1; + let context_mask = apply_tril(&mask.ones_like()?, diagonal)?; + let mask = masked_fill(&mask.to_dtype(DType::F32)?, &context_mask, f32::MIN)?; + let mask = mask + .expand((b_sz, 1, tgt_len, tgt_len + past_kv_len))? 
+ .to_dtype(DType::U8)?; + + Ok(Some(mask)) } pub fn apply_mask_one_and_zero( diff --git a/mistralrs-core/src/lib.rs b/mistralrs-core/src/lib.rs index 73c37baa24..59e543c85a 100644 --- a/mistralrs-core/src/lib.rs +++ b/mistralrs-core/src/lib.rs @@ -27,7 +27,7 @@ mod model_selected; pub use model_selected::ModelSelected; mod cublaslt; -pub mod gguf; +mod gguf; pub mod layers; mod layers_masker; mod layers_utils; @@ -45,14 +45,14 @@ mod vision_models; mod xlora_models; pub use device_map::{DeviceMapMetadata, LayerDeviceMapper}; -pub use gguf::{convert_gguf_to_hf_tokenizer, GgufTokenizerConversion}; pub use pipeline::{ chat_template::ChatTemplate, GGMLLoader, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFArchitecture, GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig, GemmaLoader, LlamaLoader, Loader, LocalModelPaths, MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader, NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader, - Qwen2Loader, SpeculativeConfig, SpeculativeLoader, SpeculativePipeline, TokenSource, - VisionLoader, VisionLoaderBuilder, VisionLoaderType, VisionModelLoader, VisionSpecificConfig, + Phi3VLoader, Qwen2Loader, SpeculativeConfig, SpeculativeLoader, SpeculativePipeline, + TokenSource, VisionLoader, VisionLoaderBuilder, VisionLoaderType, VisionModelLoader, + VisionSpecificConfig, }; pub use request::{Constraint, MessageContent, NormalRequest, Request, RequestMessage}; pub use response::Response; @@ -61,7 +61,7 @@ pub use sampler::{SamplingParams, StopTokens, TopLogprob}; pub use scheduler::SchedulerMethod; use serde::Serialize; use tokio::runtime::Runtime; -pub use toml_selector::{TomlLoaderArgs, TomlSelector}; +use toml_selector::{TomlLoaderArgs, TomlSelector}; /// `true` if `MISTRALRS_DEBUG=1` pub(crate) static DEBUG: AtomicBool = AtomicBool::new(false); diff --git a/mistralrs-core/src/lora/mod.rs b/mistralrs-core/src/lora/mod.rs index cc8510da4a..ed2c4557c7 100644 --- a/mistralrs-core/src/lora/mod.rs +++ b/mistralrs-core/src/lora/mod.rs @@ -23,6 +23,7 @@ pub struct PreloadAdapter { } #[derive(Clone, Debug, Deserialize)] +/// Adapter model ordering information. pub struct Ordering { #[serde(rename = "order")] pub adapters: Option>, diff --git a/mistralrs-core/src/model_loader.rs b/mistralrs-core/src/model_loader.rs index 9321a1c5e3..852c79d405 100644 --- a/mistralrs-core/src/model_loader.rs +++ b/mistralrs-core/src/model_loader.rs @@ -9,6 +9,7 @@ use crate::{ VisionSpecificConfig, }; +/// A builder for a loader using the selected model. 
pub struct LoaderBuilder { model: ModelSelected, no_kv_cache: bool, diff --git a/mistralrs-core/src/models/quantized_llama.rs b/mistralrs-core/src/models/quantized_llama.rs index 635fda38bf..496fc35383 100644 --- a/mistralrs-core/src/models/quantized_llama.rs +++ b/mistralrs-core/src/models/quantized_llama.rs @@ -1,7 +1,7 @@ #![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)] -use candle_core::quantized::QMatMul; use candle_core::quantized::{ggml_file, gguf_file}; +use candle_core::quantized::{QMatMul, QTensor}; use candle_core::{DType, Device, Result, Tensor}; use candle_nn::{Embedding, Module, RotaryEmbedding}; @@ -380,18 +380,69 @@ impl ModelConfig::FromGGUF for ModelWeights { let feed_forward_gate_inp = ct.tensor(reader, &format!("{prefix}.ffn_gate_inp.weight"), device)?; let mut experts = Vec::with_capacity(n_expert); - for i in 0..n_expert { - let feed_forward_w1 = - ct.tensor(reader, &format!("{prefix}.ffn_gate.{i}.weight"), device)?; - let feed_forward_w2 = - ct.tensor(reader, &format!("{prefix}.ffn_down.{i}.weight"), device)?; - let feed_forward_w3 = - ct.tensor(reader, &format!("{prefix}.ffn_up.{i}.weight"), device)?; - experts.push(Mlp { - feed_forward_w1: QMatMul::from_qtensor(feed_forward_w1)?, - feed_forward_w2: QMatMul::from_qtensor(feed_forward_w2)?, - feed_forward_w3: QMatMul::from_qtensor(feed_forward_w3)?, - }) + match ct.tensor(reader, &format!("{prefix}.ffn_gate_exps.weight"), device) { + Ok(feed_forward_gate_exps) => { + let feed_forward_down_exps = + ct.tensor(reader, &format!("{prefix}.ffn_down_exps.weight"), device)?; + let feed_forward_up_exps = + ct.tensor(reader, &format!("{prefix}.ffn_up_exps.weight"), device)?; + + let dequant_ffn_gate = feed_forward_gate_exps + .dequantize(device)? + .chunk(n_expert, 0)?; + let dequant_ffn_down = feed_forward_down_exps + .dequantize(device)? + .chunk(n_expert, 0)?; + let dequant_ffn_up = feed_forward_up_exps + .dequantize(device)? 
+ .chunk(n_expert, 0)?; + + assert_eq!(dequant_ffn_up.len(), dequant_ffn_down.len()); + assert_eq!(dequant_ffn_gate.len(), dequant_ffn_down.len()); + assert_eq!(dequant_ffn_gate.len(), n_expert); + + let gate_type = feed_forward_gate_exps.dtype(); + let down_type = feed_forward_down_exps.dtype(); + let up_type = feed_forward_up_exps.dtype(); + + for (ff_w1, (ff_w2, ff_w3)) in dequant_ffn_gate + .into_iter() + .zip(dequant_ffn_down.into_iter().zip(dequant_ffn_up)) + { + experts.push(Mlp { + feed_forward_w1: QMatMul::from_qtensor(QTensor::quantize( + &ff_w1, gate_type, + )?)?, + feed_forward_w2: QMatMul::from_qtensor(QTensor::quantize( + &ff_w2, down_type, + )?)?, + feed_forward_w3: QMatMul::from_qtensor(QTensor::quantize( + &ff_w3, up_type, + )?)?, + }) + } + } + Err(_) => { + for i in 0..n_expert { + let feed_forward_w1 = ct.tensor( + reader, + &format!("{prefix}.ffn_gate.{i}.weight"), + device, + )?; + let feed_forward_w2 = ct.tensor( + reader, + &format!("{prefix}.ffn_down.{i}.weight"), + device, + )?; + let feed_forward_w3 = + ct.tensor(reader, &format!("{prefix}.ffn_up.{i}.weight"), device)?; + experts.push(Mlp { + feed_forward_w1: QMatMul::from_qtensor(feed_forward_w1)?, + feed_forward_w2: QMatMul::from_qtensor(feed_forward_w2)?, + feed_forward_w3: QMatMul::from_qtensor(feed_forward_w3)?, + }) + } + } } MlpOrMoe::MoE { n_expert_used, diff --git a/mistralrs-core/src/models/quantized_phi3.rs b/mistralrs-core/src/models/quantized_phi3.rs index bfefeea71c..02fc53a74b 100644 --- a/mistralrs-core/src/models/quantized_phi3.rs +++ b/mistralrs-core/src/models/quantized_phi3.rs @@ -88,7 +88,7 @@ impl LayerWeights { .reshape((b_sz, seq_len, self.n_head, self.head_dim))? .transpose(1, 2)?; let k = k - .reshape((b_sz, seq_len, self.n_head, self.head_dim))? + .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))? .transpose(1, 2)?; let v = v .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))? @@ -282,8 +282,8 @@ impl ModelConfig::FromGGUF for ModelWeights { n_head: head_count, n_kv_head: head_count_kv, head_dim: embedding_length / head_count, - cos: cos.clone(), - sin: sin.clone(), + cos: cos.to_device(device)?, + sin: sin.to_device(device)?, sliding_window: context_window, }) } diff --git a/mistralrs-core/src/pipeline/chat_template.rs b/mistralrs-core/src/pipeline/chat_template.rs index a4e55ddddb..6f53b2a463 100644 --- a/mistralrs-core/src/pipeline/chat_template.rs +++ b/mistralrs-core/src/pipeline/chat_template.rs @@ -38,6 +38,7 @@ pub struct BeginEndUnkTok( #[allow(dead_code)] #[derive(Debug, Deserialize)] +/// Template for chat models including bos/eos/unk as well as the chat template. pub struct ChatTemplate { add_bos_token: Option, add_eos_token: Option, diff --git a/mistralrs-core/src/pipeline/ggml.rs b/mistralrs-core/src/pipeline/ggml.rs index 4eb9ea22b5..b1cfb3ae0a 100644 --- a/mistralrs-core/src/pipeline/ggml.rs +++ b/mistralrs-core/src/pipeline/ggml.rs @@ -57,6 +57,7 @@ pub struct GGMLPipeline { metadata: GeneralMetadata, } +/// A loader for a GGML model. pub struct GGMLLoader { model_id: String, config: GGMLSpecificConfig, diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs index 6378d7dc77..ccfa9af732 100644 --- a/mistralrs-core/src/pipeline/gguf.rs +++ b/mistralrs-core/src/pipeline/gguf.rs @@ -67,6 +67,7 @@ pub struct GGUFPipeline { metadata: GeneralMetadata, } +/// Loader for a GGUF model. 
pub struct GGUFLoader { model_id: Option, config: GGUFSpecificConfig, diff --git a/mistralrs-core/src/pipeline/macros.rs b/mistralrs-core/src/pipeline/macros.rs index 4792dc0e85..28efe62917 100644 --- a/mistralrs-core/src/pipeline/macros.rs +++ b/mistralrs-core/src/pipeline/macros.rs @@ -1,3 +1,4 @@ +#[doc(hidden)] #[macro_export] macro_rules! api_dir_list { ($api:expr, $model_id:expr) => { @@ -44,6 +45,7 @@ macro_rules! api_dir_list { }; } +#[doc(hidden)] #[macro_export] macro_rules! api_get_file { ($api:expr, $file:expr, $model_id:expr) => { @@ -73,6 +75,7 @@ macro_rules! api_get_file { }; } +#[doc(hidden)] #[macro_export] macro_rules! get_paths { ($path_name:ident, $token_source:expr, $revision:expr, $this:expr, $quantized_model_id:expr, $quantized_filename:expr, $silent:expr) => {{ @@ -186,6 +189,7 @@ macro_rules! get_paths { }}; } +#[doc(hidden)] #[macro_export] macro_rules! get_paths_gguf { ($path_name:ident, $token_source:expr, $revision:expr, $this:expr, $quantized_model_id:expr, $quantized_filename:expr, $silent:expr) => {{ @@ -309,6 +313,7 @@ macro_rules! get_paths_gguf { }}; } +#[doc(hidden)] #[macro_export] macro_rules! normal_model_loader { ($paths:expr, $dtype:expr, $default_dtype:expr, $device:expr, $config:expr, $loader:expr, $use_flash_attn:expr, $silent:expr, $mapper:expr, $loading_isq:expr, $real_device:expr) => {{ @@ -333,6 +338,7 @@ macro_rules! normal_model_loader { }}; } +#[doc(hidden)] #[macro_export] macro_rules! vision_normal_model_loader { ($paths:expr, $dtype:expr, $default_dtype:expr, $device:expr, $config:expr, $loader:expr, $use_flash_attn:expr, $silent:expr, $mapper:expr, $loading_isq:expr, $real_device:expr) => {{ @@ -348,13 +354,16 @@ macro_rules! vision_normal_model_loader { &$config, $use_flash_attn, vb, - $mapper, - $loading_isq, - $real_device, + $crate::pipeline::NormalLoadingMetadata { + mapper: $mapper, + loading_isq: $loading_isq, + real_device: $real_device, + }, )? }}; } +#[doc(hidden)] #[macro_export] macro_rules! xlora_model_loader { ($paths:expr, $dtype:expr, $default_dtype:expr, $device:expr, $config:expr, $loader:expr, $use_flash_attn:expr, $silent:expr, $mapper:expr, $loading_isq:expr, $real_device:expr) => {{ @@ -399,6 +408,7 @@ macro_rules! xlora_model_loader { }}; } +#[doc(hidden)] #[macro_export] macro_rules! lora_model_loader { ($paths:expr, $dtype:expr, $default_dtype:expr, $device:expr, $config:expr, $loader:expr, $use_flash_attn:expr, $silent:expr, $mapper:expr, $loading_isq:expr, $real_device:expr) => {{ diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index 93a6201c75..18c2299f63 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -40,7 +40,7 @@ use std::{collections::HashMap, path::PathBuf, str::FromStr}; use tokenizers::Tokenizer; use tokio::sync::Mutex; pub use vision::{VisionLoader, VisionLoaderBuilder, VisionSpecificConfig}; -pub use vision_loaders::{VisionLoaderType, VisionModelLoader}; +pub use vision_loaders::{Phi3VLoader, VisionLoaderType, VisionModelLoader}; use anyhow::Result; use candle_core::{DType, Device, Tensor}; @@ -103,6 +103,7 @@ pub trait ModelPaths { } #[derive(Clone)] +/// All local paths and metadata necessary to load a model. pub struct LocalModelPaths
<P>
{ tokenizer_filename: P, config_filename: P, diff --git a/mistralrs-core/src/pipeline/normal_loaders.rs b/mistralrs-core/src/pipeline/normal_loaders.rs index ce49be5c21..31eb23a677 100644 --- a/mistralrs-core/src/pipeline/normal_loaders.rs +++ b/mistralrs-core/src/pipeline/normal_loaders.rs @@ -223,6 +223,9 @@ impl GemmaBasicConfig { } } +/// [`NormalLoader`] for a Gemma model. +/// +/// [`NormalLoader`]: https://ericlbuehler.github.io/mistral.rs/mistralrs/struct.NormalLoader.html pub struct GemmaLoader; impl NormalModelLoader for GemmaLoader { @@ -313,6 +316,9 @@ impl LlamaBasicConfig { } } +/// [`NormalLoader`] for a Llama model. +/// +/// [`NormalLoader`]: https://ericlbuehler.github.io/mistral.rs/mistralrs/struct.NormalLoader.html pub struct LlamaLoader; impl NormalModelLoader for LlamaLoader { @@ -495,6 +501,9 @@ impl Phi2BasicConfig { } } +/// [`NormalLoader`] for a Phi 2 model. +/// +/// [`NormalLoader`]: https://ericlbuehler.github.io/mistral.rs/mistralrs/struct.NormalLoader.html pub struct Phi2Loader; impl NormalModelLoader for Phi2Loader { @@ -593,6 +602,9 @@ impl Phi3BasicConfig { } } +/// [`NormalLoader`] for a Phi 3 model. +/// +/// [`NormalLoader`]: https://ericlbuehler.github.io/mistral.rs/mistralrs/struct.NormalLoader.html pub struct Phi3Loader; impl NormalModelLoader for Phi3Loader { @@ -686,6 +698,9 @@ impl Qwen2BasicConfig { } } +/// [`NormalLoader`] for a Qwen 2 model. +/// +/// [`NormalLoader`]: https://ericlbuehler.github.io/mistral.rs/mistralrs/struct.NormalLoader.html pub struct Qwen2Loader; impl NormalModelLoader for Qwen2Loader { diff --git a/mistralrs-core/src/pipeline/sampling_pipeline.rs b/mistralrs-core/src/pipeline/sampling_pipeline.rs index d569defae6..b628a99489 100644 --- a/mistralrs-core/src/pipeline/sampling_pipeline.rs +++ b/mistralrs-core/src/pipeline/sampling_pipeline.rs @@ -1,3 +1,4 @@ +#[doc(hidden)] #[macro_export] macro_rules! finish_and_add_tokens_to_seq { ($this:expr, $prefix_cacher:expr, $seq:expr, $logprobs:expr, $eos_tok:expr, $use_prefix_cacher:expr) => {{ @@ -177,6 +178,7 @@ macro_rules! finish_and_add_tokens_to_seq { } /// Sample and add to the prefix cache. +#[doc(hidden)] #[macro_export] macro_rules! do_sample { ($this:expr, $seqs:expr, $logits:expr, $prefix_cacher:expr, $disable_eos_stop:expr, $rng:expr) => {{ diff --git a/mistralrs-core/src/pipeline/speculative.rs b/mistralrs-core/src/pipeline/speculative.rs index 373bc060f4..18fbaf0a01 100644 --- a/mistralrs-core/src/pipeline/speculative.rs +++ b/mistralrs-core/src/pipeline/speculative.rs @@ -26,6 +26,7 @@ use super::{ IsqPipelineMixin, MetadataMixin, ModelCategory, ModelPaths, PreProcessingMixin, }; +/// A loader for a speculative pipeline using 2 [`Loader`]s. 
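+///
+/// The `draft` model proposes tokens which the `target` model then verifies;
+/// `SpeculativeConfig::gamma` (below) sets how many draft tokens are run per step.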
pub struct SpeculativeLoader { pub target: Box, pub draft: Box, @@ -138,6 +139,7 @@ pub struct SpeculativePipeline { } #[derive(Copy, Clone)] +/// Metadata for a speculative pipeline pub struct SpeculativeConfig { /// γ completions to run of the draft model pub gamma: usize, diff --git a/mistralrs-core/src/pipeline/vision.rs b/mistralrs-core/src/pipeline/vision.rs index d009e7a433..d586a377b5 100644 --- a/mistralrs-core/src/pipeline/vision.rs +++ b/mistralrs-core/src/pipeline/vision.rs @@ -1,5 +1,5 @@ use super::cache_manager::DefaultCacheManager; -use super::vision_loaders::{Phi3Loader, VisionLoaderType}; +use super::vision_loaders::{Phi3VLoader, VisionLoaderType}; use super::{ get_model_paths, get_xlora_paths, AdapterActivationMixin, Cache, CacheManager, CacheManagerMixin, GeneralMetadata, IsqPipelineMixin, Loader, MetadataMixin, ModelCategory, @@ -95,7 +95,7 @@ impl VisionLoaderBuilder { setup_logger_and_debug(); let loader: Box = match loader { - VisionLoaderType::Phi3V => Box::new(Phi3Loader), + VisionLoaderType::Phi3V => Box::new(Phi3VLoader), }; Box::new(VisionLoader { inner: loader, diff --git a/mistralrs-core/src/pipeline/vision_loaders.rs b/mistralrs-core/src/pipeline/vision_loaders.rs index 191fcf47d6..de5fb3b438 100644 --- a/mistralrs-core/src/pipeline/vision_loaders.rs +++ b/mistralrs-core/src/pipeline/vision_loaders.rs @@ -2,7 +2,6 @@ use std::sync::Arc; use std::{fmt::Debug, str::FromStr}; use anyhow::Result; -use candle_core::Device; use candle_nn::VarBuilder; #[cfg(feature = "pyo3_macros")] @@ -10,12 +9,11 @@ use pyo3::pyclass; use serde::Deserialize; -use super::{Processor, ProcessorCreator, VisionModel}; +use super::{NormalLoadingMetadata, Processor, ProcessorCreator, VisionModel}; use crate::vision_models::phi3::{Config as Phi3Config, Model as Phi3}; use crate::vision_models::phi3_inputs_processor::Phi3Processor; use crate::vision_models::preprocessor_config::PreProcessorConfig; use crate::vision_models::processor_config::ProcessorConfig; -use crate::DeviceMapMetadata; pub trait VisionModelLoader { fn load( @@ -23,9 +21,7 @@ pub trait VisionModelLoader { config: &str, use_flash_attn: bool, vb: VarBuilder, - mapper: DeviceMapMetadata, - loading_isq: bool, - device: Device, + normal_loading_metadata: NormalLoadingMetadata, ) -> Result>; fn is_gptx(&self) -> bool; fn get_config_repr(&self, config: &str, use_flash_attn: bool) -> Result>; @@ -56,17 +52,18 @@ impl FromStr for VisionLoaderType { // ======================== Phi 3 loader -pub struct Phi3Loader; +/// [`VisionLoader`] for a Phi 3 Vision model. 
+///
+/// [`VisionLoader`]: https://ericlbuehler.github.io/mistral.rs/mistralrs/struct.VisionLoader.html
+pub struct Phi3VLoader;
 
-impl VisionModelLoader for Phi3Loader {
+impl VisionModelLoader for Phi3VLoader {
     fn load(
         &self,
         config: &str,
         use_flash_attn: bool,
         vb: VarBuilder,
-        mapper: DeviceMapMetadata,
-        loading_isq: bool,
-        device: Device,
+        normal_loading_metadata: NormalLoadingMetadata,
     ) -> Result<Box<dyn VisionModel + Send + Sync>> {
         let mut config: Phi3Config = serde_json::from_str(config)?;
         config.use_flash_attn = use_flash_attn;
@@ -74,9 +71,7 @@ impl VisionModelLoader for Phi3Loader {
             &config,
             vb,
             self.is_gptx(),
-            mapper,
-            loading_isq,
-            device,
+            normal_loading_metadata,
         )?))
     }
     fn is_gptx(&self) -> bool {
diff --git a/mistralrs-core/src/request.rs b/mistralrs-core/src/request.rs
index 9892fa1808..15cea90571 100644
--- a/mistralrs-core/src/request.rs
+++ b/mistralrs-core/src/request.rs
@@ -33,6 +33,7 @@ pub enum RequestMessage {
 }
 
 #[derive(Clone)]
+/// A normal request to be handled by `MistralRs`.
 pub struct NormalRequest {
     pub messages: RequestMessage,
     pub sampling_params: SamplingParams,
diff --git a/mistralrs-core/src/response.rs b/mistralrs-core/src/response.rs
index fc4f4c93de..e0cad0a792 100644
--- a/mistralrs-core/src/response.rs
+++ b/mistralrs-core/src/response.rs
@@ -23,6 +23,7 @@ macro_rules! generate_repr {
 #[cfg_attr(feature = "pyo3_macros", pyclass)]
 #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))]
 #[derive(Debug, Clone, Serialize)]
+/// Chat completion response message.
 pub struct ResponseMessage {
     pub content: String,
     pub role: String,
@@ -33,6 +34,7 @@ generate_repr!(ResponseMessage);
 #[cfg_attr(feature = "pyo3_macros", pyclass)]
 #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))]
 #[derive(Debug, Clone, Serialize)]
+/// Delta in content for a streaming response.
 pub struct Delta {
     pub content: String,
     pub role: String,
@@ -43,6 +45,7 @@ generate_repr!(Delta);
 #[cfg_attr(feature = "pyo3_macros", pyclass)]
 #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))]
 #[derive(Debug, Clone, Serialize)]
+/// A logprob with the top logprobs for this token.
 pub struct ResponseLogprob {
     pub token: String,
     pub logprob: f32,
@@ -55,6 +58,7 @@ generate_repr!(ResponseLogprob);
 #[cfg_attr(feature = "pyo3_macros", pyclass)]
 #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))]
 #[derive(Debug, Clone, Serialize)]
+/// Logprobs per token.
 pub struct Logprobs {
     pub content: Option<Vec<ResponseLogprob>>,
 }
@@ -64,6 +68,7 @@ generate_repr!(Logprobs);
 #[cfg_attr(feature = "pyo3_macros", pyclass)]
 #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))]
 #[derive(Debug, Clone, Serialize)]
+/// Chat completion choice.
 pub struct Choice {
     pub finish_reason: String,
     pub index: usize,
@@ -76,6 +81,7 @@ generate_repr!(Choice);
 #[cfg_attr(feature = "pyo3_macros", pyclass)]
 #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))]
 #[derive(Debug, Clone, Serialize)]
+/// Completion streaming chunk choice.
 pub struct ChunkChoice {
     pub finish_reason: Option<String>,
     pub index: usize,
@@ -122,6 +128,7 @@ generate_repr!(ChatCompletionResponse);
 #[cfg_attr(feature = "pyo3_macros", pyclass)]
 #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))]
 #[derive(Debug, Clone, Serialize)]
+/// Chat completion streaming response chunk.
 pub struct ChatCompletionChunkResponse {
     pub id: String,
     pub choices: Vec<ChunkChoice>,
@@ -136,6 +143,7 @@ generate_repr!(ChatCompletionChunkResponse);
 #[cfg_attr(feature = "pyo3_macros", pyclass)]
 #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))]
 #[derive(Debug, Clone, Serialize)]
+/// Completion response choice.
pub struct CompletionChoice { pub finish_reason: String, pub index: usize, diff --git a/mistralrs-core/src/sampler.rs b/mistralrs-core/src/sampler.rs index a8da56c100..34ad8bae95 100644 --- a/mistralrs-core/src/sampler.rs +++ b/mistralrs-core/src/sampler.rs @@ -70,7 +70,7 @@ pub struct Sampler { #[cfg_attr(feature = "pyo3_macros", pyclass)] #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -// Top-n logprobs element +/// Top-n logprobs element pub struct TopLogprob { pub token: u32, pub logprob: f32, diff --git a/mistralrs-core/src/utils/mod.rs b/mistralrs-core/src/utils/mod.rs index 314f2492ef..314007140a 100644 --- a/mistralrs-core/src/utils/mod.rs +++ b/mistralrs-core/src/utils/mod.rs @@ -6,6 +6,7 @@ pub(crate) mod tokenizer; pub(crate) mod tokens; pub(crate) mod varbuilder_utils; +#[doc(hidden)] #[macro_export] macro_rules! get_mut_arcmutex { ($thing:expr) => { @@ -17,6 +18,7 @@ macro_rules! get_mut_arcmutex { }; } +#[doc(hidden)] #[macro_export] macro_rules! handle_seq_error { ($fallible:expr, $response:expr) => { @@ -34,6 +36,7 @@ macro_rules! handle_seq_error { }; } +#[doc(hidden)] #[macro_export] macro_rules! handle_seq_error_ok { ($fallible:expr, $response:expr) => { @@ -51,6 +54,7 @@ macro_rules! handle_seq_error_ok { }; } +#[doc(hidden)] #[macro_export] macro_rules! handle_seq_error_stateaware_ok { ($fallible:expr, $seq:expr) => { @@ -70,6 +74,7 @@ macro_rules! handle_seq_error_stateaware_ok { }; } +#[doc(hidden)] #[macro_export] macro_rules! handle_pipeline_forward_error { ($stage: tt, $fallible:expr, $seq_slice:expr, $pipeline:expr, $label:tt, $prefix_cacher:expr) => { @@ -177,6 +182,7 @@ macro_rules! handle_pipeline_forward_error { }; } +#[doc(hidden)] #[macro_export] macro_rules! get_mut_group { ($this:expr) => { @@ -188,6 +194,7 @@ macro_rules! get_mut_group { }; } +#[doc(hidden)] #[macro_export] macro_rules! get_bias_if_not_allowed { ($tok_trie:expr, $rx:expr, $next_token_id:expr) => { @@ -201,6 +208,7 @@ macro_rules! get_bias_if_not_allowed { }; } +#[doc(hidden)] #[macro_export] macro_rules! sample_async { ( @@ -235,6 +243,7 @@ macro_rules! sample_async { }; } +#[doc(hidden)] #[macro_export] macro_rules! serde_default_fn { ($t:ty, $name:ident, $v:expr) => { diff --git a/mistralrs-core/src/vision_models/phi3.rs b/mistralrs-core/src/vision_models/phi3.rs index 970804e33b..3ce4127514 100644 --- a/mistralrs-core/src/vision_models/phi3.rs +++ b/mistralrs-core/src/vision_models/phi3.rs @@ -16,10 +16,11 @@ use crate::{ repeat_kv, CausalMasker, FusedBiasLinear, MatMul, Nonzero, PhiRopeConfig, PhiRotaryEmbedding, RmsNorm, ScaledDotProductAttention, }, - pipeline::{extract_logits, Cache, IsqModel, Phi3RopeScaling, VisionModel}, + pipeline::{ + extract_logits, Cache, IsqModel, NormalLoadingMetadata, Phi3RopeScaling, VisionModel, + }, serde_default_fn, vision_models::clip::{Activation, ClipConfig, ClipVisionTransformer}, - DeviceMapMetadata, }; #[derive(Debug, Clone, serde::Deserialize)] @@ -699,9 +700,8 @@ impl ImageEmbedding { // hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = ... let p_0 = positions.i((idx, 0))?.to_scalar::()? as usize; let p_1 = positions.i((idx, 1))?.to_scalar::()? 
as usize; - // TODO(EricLBuehler): https://github.com/huggingface/candle/pull/2223 will make this nicer hidden_states = hidden_states.slice_assign( - &[p_0..p_0 + 1, p_1..p_1 + cnt, 0..img_set_tensor.dims()[2]], + &[&p_0, &(p_1..p_1 + cnt), &(..img_set_tensor.dims()[2])], &img_set_tensor, )?; idx += cnt; @@ -720,9 +720,8 @@ impl ImageEmbedding { let p_0 = positions.i((idx, 0))?.to_scalar::()? as usize; let p_1 = positions.i((idx, 1))?.to_scalar::()? as usize; // hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = ... - // TODO(EricLBuehler): https://github.com/huggingface/candle/pull/2223 will make this nicer hidden_states = hidden_states.slice_assign( - &[p_0..p_0 + 1, p_1..p_1 + cnt, 0..img_set_tensor.dims()[2]], + &[&p_0, &(p_1..p_1 + cnt), &(..img_set_tensor.dims()[2])], &img_set_tensor, )?; idx += cnt; @@ -757,12 +756,12 @@ impl Model { cfg: &Config, vb: VarBuilder, _is_gptx: bool, - mapper: DeviceMapMetadata, - loading_isq: bool, - real_device: Device, + normal_loading_metadata: NormalLoadingMetadata, ) -> Result { let vb_m = vb.pp("model"); - let mapper = mapper.into_mapper(cfg.num_hidden_layers, &real_device)?; + let mapper = normal_loading_metadata + .mapper + .into_mapper(cfg.num_hidden_layers, &normal_loading_metadata.real_device)?; let embed_tokens = candle_nn::embedding( cfg.vocab_size, cfg.hidden_size, @@ -772,7 +771,7 @@ impl Model { cfg, embed_tokens.clone(), &cfg.embd_layer, - vb_m.pp("vision_embed_tokens"), + mapper.set_nm_device(vb_m.pp("vision_embed_tokens"), false), )?; let mut layers = Vec::with_capacity(cfg.num_hidden_layers); let vb_l = vb_m.pp("layers"); @@ -780,7 +779,9 @@ impl Model { let rotary_emb = Arc::new(PhiRotaryEmbedding::new( vb.dtype(), cfg.clone(), - mapper.device_for(layer_idx, false).unwrap_or(&real_device), + mapper + .device_for(layer_idx, false) + .unwrap_or(&normal_loading_metadata.real_device), )?); let layer = DecoderLayer::new( rotary_emb.clone(), @@ -788,7 +789,7 @@ impl Model { vb_l.pp(layer_idx), &*mapper, layer_idx, - loading_isq, + normal_loading_metadata.loading_isq, )?; layers.push(layer) } @@ -800,14 +801,14 @@ impl Model { let lm_head = linear_no_bias( cfg.hidden_size, cfg.vocab_size, - mapper.set_nm_device(vb.pp("lm_head"), loading_isq), + mapper.set_nm_device(vb.pp("lm_head"), normal_loading_metadata.loading_isq), )?; Ok(Self { vision_embed_tokens, layers, norm, lm_head: QMatMul::Tensor(lm_head.weight().clone()), - device: real_device, + device: normal_loading_metadata.real_device, cache: Cache::new(cfg.num_hidden_layers, false), max_seq_len: cfg.max_position_embeddings, mapper, diff --git a/mistralrs-core/src/xlora_models/quantized_phi3.rs b/mistralrs-core/src/xlora_models/quantized_phi3.rs index 767040d243..b9ddb59a33 100644 --- a/mistralrs-core/src/xlora_models/quantized_phi3.rs +++ b/mistralrs-core/src/xlora_models/quantized_phi3.rs @@ -141,7 +141,7 @@ impl LayerWeights { .reshape((b_sz, seq_len, self.n_head, self.head_dim))? .transpose(1, 2)?; let k = k - .reshape((b_sz, seq_len, self.n_head, self.head_dim))? + .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))? .transpose(1, 2)?; let v = v .reshape((b_sz, seq_len, self.n_kv_head, self.head_dim))? 
@@ -323,8 +323,8 @@ impl ModelConfig::FromAdapterGGUF for ModelWeights { n_head: head_count, n_kv_head: head_count_kv, head_dim: embedding_length / head_count, - cos: cos.clone(), - sin: sin.clone(), + cos: cos.to_device(device)?, + sin: sin.to_device(device)?, sliding_window: context_window, }) } diff --git a/mistralrs-pyo3/Cargo.toml b/mistralrs-pyo3/Cargo.toml index dffc347310..e1e0171386 100644 --- a/mistralrs-pyo3/Cargo.toml +++ b/mistralrs-pyo3/Cargo.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.1.15", path = "../mistralrs-core", features = ["pyo3_macros"] } +mistralrs-core = { version = "0.1.16", path = "../mistralrs-core", features = ["pyo3_macros"] } serde.workspace = true serde_json.workspace = true candle-core.workspace = true diff --git a/mistralrs-pyo3/Cargo_template.toml b/mistralrs-pyo3/Cargo_template.toml index 683239db7d..3d599c0bc7 100644 --- a/mistralrs-pyo3/Cargo_template.toml +++ b/mistralrs-pyo3/Cargo_template.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.1.15", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } +mistralrs-core = { version = "0.1.16", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } serde.workspace = true serde_json.workspace = true candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.5.0", features=["$feature_name"] } diff --git a/mistralrs-pyo3/pyproject.toml b/mistralrs-pyo3/pyproject.toml index 8b8339cfcf..361cb0783e 100644 --- a/mistralrs-pyo3/pyproject.toml +++ b/mistralrs-pyo3/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "mistralrs" -version = "0.1.15" +version = "0.1.16" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-pyo3/pyproject_template.toml b/mistralrs-pyo3/pyproject_template.toml index 2ec480ef4f..229fdaa9b8 100644 --- a/mistralrs-pyo3/pyproject_template.toml +++ b/mistralrs-pyo3/pyproject_template.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "$name" -version = "0.1.15" +version = "0.1.16" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-server/Cargo.toml b/mistralrs-server/Cargo.toml index 7b8f06f275..60c720edb0 100644 --- a/mistralrs-server/Cargo.toml +++ b/mistralrs-server/Cargo.toml @@ -22,8 +22,7 @@ axum = { version = "0.7.4", features = ["tokio"] } tower-http = { version = "0.5.1", features = ["cors"]} utoipa = { version = "4.2", features = ["axum_extras"] } utoipa-swagger-ui = { version = "7.1.0", features = ["axum"]} -mistralrs-core = { version = "0.1.15", path = "../mistralrs-core" } -dyn-fmt = "0.4.0" +mistralrs-core = { version = "0.1.16", path = "../mistralrs-core" } indexmap.workspace = true accelerate-src = { workspace = true, optional = true } intel-mkl-src = { workspace = true, optional = true } diff --git a/mistralrs-server/README.md b/mistralrs-server/README.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mistralrs-vision/README.md b/mistralrs-vision/README.md new file mode 100644 index 0000000000..ca482bb1a2 --- /dev/null +++ b/mistralrs-vision/README.md @@ -0,0 +1,5 @@ +# `mistralrs-vision` + +This crate provides vision utilities for mistral.rs inspired by torchvision. 
+ +Documentation: https://ericlbuehler.github.io/mistral.rs/mistralrs_vision/index.html \ No newline at end of file diff --git a/mistralrs-vision/src/lib.rs b/mistralrs-vision/src/lib.rs index 76dbf8b690..09073512dd 100644 --- a/mistralrs-vision/src/lib.rs +++ b/mistralrs-vision/src/lib.rs @@ -1,9 +1,32 @@ +//! This crate provides vision utilities for mistral.rs inspired by torchvision. +//! In particular, it represents transformations on some `Self` type which are applied +//! sequentially. +//! +//! ## Example +//! ```rust +//! use candle_core::Device; +//! use image::{ColorType, DynamicImage}; +//! use mistralrs_vision::{ApplyTransforms, Normalize, ToTensor, Transforms}; +//! +//! let image = DynamicImage::new(3, 4, ColorType::Rgb8); +//! let transforms = Transforms { +//! input: &ToTensor, +//! inner_transforms: &[&Normalize { +//! mean: vec![0.5, 0.5, 0.5], +//! std: vec![0.5, 0.5, 0.5], +//! }], +//! }; +//! let transformed = image.apply(transforms, &Device::Cpu).unwrap(); +//! assert_eq!(transformed.dims(), &[3, 4, 3]); +//! ``` + use candle_core::{Device, Result, Tensor}; use image::DynamicImage; mod transforms; pub(crate) mod utils; pub use transforms::{InterpolateResize, Normalize, ToTensor}; +/// A transform over an image. The input may vary but the output is always a Tensor. pub trait ImageTransform { type Input; type Output; @@ -11,11 +34,14 @@ pub trait ImageTransform { fn map(&self, x: &Self::Input, device: &Device) -> Result; } +/// Transforms to apply, starting with the `input` and then with each transform in +/// `inner_transforms` applied sequentially pub struct Transforms<'a> { pub input: &'a dyn ImageTransform, pub inner_transforms: &'a [&'a dyn ImageTransform], } +/// Application of transforms to the Self type. pub trait ApplyTransforms<'a> { fn apply(&self, transforms: Transforms<'a>, device: &Device) -> Result; } diff --git a/mistralrs-vision/src/transforms.rs b/mistralrs-vision/src/transforms.rs index f6c462a5a1..e75838be3a 100644 --- a/mistralrs-vision/src/transforms.rs +++ b/mistralrs-vision/src/transforms.rs @@ -72,8 +72,7 @@ impl ImageTransform for Normalize { } } -/// Do what `ToTensor` does, but also resize the image without preserving -/// aspect ratio. +/// Resize the image via nearest interpolation. pub struct InterpolateResize { pub target_w: usize, pub target_h: usize, diff --git a/mistralrs/Cargo.toml b/mistralrs/Cargo.toml index 1e0206fd74..82cd14d68b 100644 --- a/mistralrs/Cargo.toml +++ b/mistralrs/Cargo.toml @@ -12,7 +12,7 @@ license.workspace = true homepage.workspace = true [dependencies] -mistralrs-core = { version = "0.1.15", path = "../mistralrs-core" } +mistralrs-core = { version = "0.1.16", path = "../mistralrs-core" } anyhow.workspace = true tokio.workspace = true candle-core.workspace = true diff --git a/mistralrs/src/lib.rs b/mistralrs/src/lib.rs index 65e6f60055..26e1b708fa 100644 --- a/mistralrs/src/lib.rs +++ b/mistralrs/src/lib.rs @@ -1,2 +1,50 @@ +//! This crate provides an asynchronous, multithreaded API to `mistral.rs`. +//! +//! ## Example +//! ```no_run +//! use std::sync::Arc; +//! use tokio::sync::mpsc::channel; +//! +//! use mistralrs::{ +//! Constraint, DeviceMapMetadata, MistralRs, MistralRsBuilder, +//! NormalLoaderType, NormalRequest, Request, RequestMessage, Response, +//! SamplingParams, SchedulerMethod, TokenSource, +//! }; +//! +//! fn setup() -> anyhow::Result> { +//! // See the examples for how to load your model. +//! todo!() +//! } +//! +//! fn main() -> anyhow::Result<()> { +//! let mistralrs = setup()?; +//! +//! 
let (tx, mut rx) = channel(10_000); +//! let request = Request::Normal(NormalRequest { +//! messages: RequestMessage::Completion { +//! text: "Hello! My name is ".to_string(), +//! echo_prompt: false, +//! best_of: 1, +//! }, +//! sampling_params: SamplingParams::default(), +//! response: tx, +//! return_logprobs: false, +//! is_streaming: false, +//! id: 0, +//! constraint: Constraint::None, +//! suffix: None, +//! adapters: None, +//! }); +//! mistralrs.get_sender().blocking_send(request)?; +//! +//! let response = rx.blocking_recv().unwrap(); +//! match response { +//! Response::CompletionDone(c) => println!("Text: {}", c.choices[0].text), +//! _ => unreachable!(), +//! } +//! Ok(()) +//! } +//! ``` + pub use candle_core::{quantized::GgmlDType, DType, Device, Result}; pub use mistralrs_core::*;
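For reference, the `add_bos_token` branch added in `gguf_tokenizer.rs` above installs a `TemplateProcessing` post-processor so that encoding prepends the BOS token. Below is a standalone sketch of just that post-processor; the BOS string `"<s>"` and id `1` are hypothetical stand-ins, while the builder calls mirror the ones in `bpe_tokenizer`.

```rust
use std::collections::HashMap;

use tokenizers::processors::template::{self, TemplateProcessing};

/// Build a post-processor that prepends `bos` to single sequences and to pairs.
fn bos_post_processor(bos: &str, bos_id: u32) -> TemplateProcessing {
    let mut special_toks = HashMap::new();
    special_toks.insert(
        bos.to_string(),
        // SpecialToken maps the template placeholder to its id(s) and string(s).
        template::SpecialToken::new(bos.to_string(), vec![bos_id], vec![bos.to_string()])
            .expect("ids and tokens have matching lengths"),
    );
    TemplateProcessing::builder()
        // Single sequence: BOS, then the sequence, all with type id 0.
        .try_single(format!("{bos}:0 $A:0"))
        .unwrap()
        // Pair: the second sequence gets type id 1.
        .try_pair(format!("{bos}:0 $A:0 $B:1"))
        .unwrap()
        .special_tokens(special_toks)
        .build()
        .unwrap()
}

fn main() {
    let _post = bos_post_processor("<s>", 1);
}
```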