Implement gpt2 (BPE) GGUF tokenizer conversion #397

Merged: 24 commits, Jun 10, 2024
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -10,7 +10,7 @@ members = [
resolver = "2"

[workspace.package]
- version = "0.1.15"
+ version = "0.1.16"
edition = "2021"
description = "Fast and easy LLM serving."
homepage = "https://github.com/EricLBuehler/mistral.rs"
5 changes: 3 additions & 2 deletions README.md
@@ -45,7 +45,7 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis

- φ³ 📷 Run the Phi 3 vision model: [documentation and guide here](docs/PHI3V.md)

- <img src="https://upload.wikimedia.org/wikipedia/commons/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg" alt="Mount Everest" width = "400" height = "267">
+ <img src="https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" alt="Mount Washington" width = "400" height = "267">

*After following installation instructions*

@@ -322,7 +322,8 @@ The following tokenizer model types are currently supported. If you would like o
please consider using the method demonstrated in examples below, where the tokenizer is sourced from Hugging Face.

**Supported GGUF tokenizer types**
- - `llama`
+ - `llama` (sentencepiece)
+ - `gpt2` (BPE)
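The two types differ in how a string becomes tokens: `llama` models ship a sentencepiece model, while `gpt2` models ship a byte-level vocabulary plus an ordered BPE merge list that is applied greedily. A minimal sketch of the BPE side (illustration only; the helper name and toy merge table are assumptions, not the mistral.rs conversion code):

```python
# Minimal sketch of GPT-2-style BPE encoding of a single word. A trained
# tokenizer ships an ordered merge list; encoding repeatedly applies the
# highest-priority (lowest-rank) merge until none applies.

def bpe_encode(word, merges):
    """Encode one word given an ordered list of (left, right) merges."""
    ranks = {pair: i for i, pair in enumerate(merges)}
    tokens = list(word)  # start from individual symbols (bytes for GPT-2)
    while len(tokens) > 1:
        # Find the adjacent pair with the best (lowest) merge rank.
        pairs = [(ranks.get((a, b), float("inf")), i)
                 for i, (a, b) in enumerate(zip(tokens, tokens[1:]))]
        best_rank, i = min(pairs)
        if best_rank == float("inf"):
            break  # no applicable merges remain
        tokens[i:i + 2] = [tokens[i] + tokens[i + 1]]
    return tokens

merges = [("l", "o"), ("lo", "w"), ("e", "r")]
print(bpe_encode("lower", merges))  # ['low', 'er']
```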

## Run with the CLI

8 changes: 4 additions & 4 deletions docs/PHI3V.md
@@ -1,6 +1,6 @@
# Phi 3 Vision Support: `microsoft/Phi-3-vision-128k-instruct`

- The Phi 3 Vision Model has support in the Rust, Python, and HTTP APIs.
+ The Phi 3 Vision Model has support in the Rust, Python, and HTTP APIs. The Phi 3 Vision Model supports ISQ for increased performance.

> Note: The Phi 3 Vision model works best with one image although it is supported to send multiple images.

@@ -17,7 +17,7 @@ We support an OpenAI compatible HTTP API for vision models. This example demonst
---

**Image:**
- <img src="https://upload.wikimedia.org/wikipedia/commons/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg" alt="Mount Everest" width = "1000" height = "666">
+ <img src="https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" alt="Mount Washington" width = "1000" height = "666">

**Prompt:**
```
@@ -26,7 +26,7 @@ We support an OpenAI compatible HTTP API for vision models. This example demonst

**Output:**
```
- The image shows a large, snow-covered mountain with a clear blue sky. There are no visible clouds or precipitation, and the mountain appears to be quite steep with visible crevices and ridges. The surrounding landscape includes rocky terrain at the base of the mountain.
+ The image shows a snow-covered mountain with a clear sky above and trees at the base. There appears to be a trail or path leading up the mountain, and some structures can be seen on the peak.
```

---
@@ -53,7 +53,7 @@ completion = openai.chat.completions.create(
{
"type": "image_url",
"image_url": {
- "url": "https://upload.wikimedia.org/wikipedia/commons/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg"
+ "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg"
},
},
{
155 changes: 0 additions & 155 deletions examples/http.md
@@ -125,158 +125,3 @@ Example with `curl`:
```bash
curl http://localhost:<port>/re_isq -H "Content-Type: application/json" -H "Authorization: Bearer EMPTY" -d '{"ggml_type":"Q4K"}'
```


Streaming requests are not supported.

## Request
### `ChatCompletionRequest`
OpenAI compatible request.
```rust
pub struct ChatCompletionRequest {
    pub messages: Either<Vec<Message>, String>,
    pub model: String,
    pub logit_bias: Option<HashMap<u32, f32>>,
    // Default false
    pub logprobs: bool,
    pub top_logprobs: Option<usize>,
    pub max_tokens: Option<usize>,
    // Default 1
    pub n: usize,
    pub presence_penalty: Option<f32>,
    pub frequency_penalty: Option<f32>,
    pub stop: Option<StopTokens>,
    // Default 1
    pub temperature: Option<f64>,
    // Default 1
    pub top_p: Option<f64>,
    // Default -1 to consider all
    pub top_k: Option<i64>,
    pub stream: bool,
    pub adapters: Option<Vec<String>>,
}
```
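The struct above maps onto an OpenAI-style JSON body. A hedged Python sketch of such a payload (the model name and endpoint path are assumptions for illustration; only a few optional fields are set):

```python
import json

# Hypothetical request body matching the ChatCompletionRequest fields above.
payload = {
    "model": "mistral",                                   # pub model: String
    "messages": [{"role": "user", "content": "Hello!"}],  # Vec<Message> arm of Either
    "max_tokens": 64,       # Option<usize>
    "temperature": 0.7,     # Option<f64>, default 1
    "top_k": -1,            # default -1 to consider all
    "stream": False,        # bool
}
body = json.dumps(payload)
# POST this body with header "Authorization: Bearer EMPTY", as in the curl
# example above; the chat completions path is assumed, not shown in this doc.
print(body)
```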

### `CompletionRequest`
```rust
pub struct CompletionRequest {
    pub model: String,
    pub prompt: String,
    pub best_of: usize,
    pub echo_prompt: bool,
    pub presence_penalty: Option<f32>,
    pub frequency_penalty: Option<f32>,
    pub logit_bias: Option<HashMap<u32, f32>>,
    // Default false
    pub logprobs: Option<usize>,
    pub max_tokens: Option<usize>,
    // Default 1
    pub n: usize,
    pub stop_seqs: Option<StopTokens>,
    pub temperature: Option<f64>,
    pub top_p: Option<f64>,
    pub suffix: Option<String>,

    // mistral.rs additional
    pub top_k: Option<usize>,
    pub grammar: Option<Grammar>,
    pub adapters: Option<Vec<String>>,
}
```

### `Message`
Message with role of either `user`, `system` or `assistant`.
```rust
pub struct Message {
    pub content: String,
    pub role: String,
    pub name: Option<String>,
}
```

### `StopTokens`
Stop tokens. Each item in a `Multi` variant should represent one token.
```rust
pub enum StopTokens {
    Multi(Vec<String>),
    Single(String),
    MultiId(Vec<u32>),
    SingleId(u32),
}
```

## Response

### `ChatCompletionResponse`
The OpenAI compatible chat completion response.
```rust
pub struct ChatCompletionResponse {
    pub id: String,
    pub choices: Vec<Choice>,
    pub created: u64,
    pub model: &'static str,
    pub system_fingerprint: String,
    pub object: String,
    pub usage: Usage,
}
```


### `Choice`
An individual choice, containing a `ResponseMessage` and maybe `Logprobs`.
```rust
pub struct Choice {
    pub finish_reason: String,
    pub index: usize,
    pub message: ResponseMessage,
    pub logprobs: Option<Logprobs>,
}
```

### `ResponseMessage`
```rust
pub struct ResponseMessage {
    pub content: String,
    pub role: String,
}
```

### `Logprobs`
Logprobs and top logprobs for each token.
```rust
pub struct Logprobs {
    pub content: Option<Vec<ResponseLogprob>>,
}
```

### `ResponseLogprob`
Logprobs and top logprobs for each token, with corresponding bytes. Top logprobs are ordered in descending probability.
```rust
pub struct ResponseLogprob {
    pub token: String,
    pub logprob: f32,
    pub bytes: Vec<u8>,
    pub top_logprobs: Vec<TopLogprob>,
}
```

### `TopLogprob`
```rust
pub struct TopLogprob {
    pub token: u32,
    pub logprob: f32,
    pub bytes: String,
}
```

### `Usage`
```rust
pub struct Usage {
    pub completion_tokens: usize,
    pub prompt_tokens: usize,
    pub total_tokens: usize,
    pub avg_tok_per_sec: f32,
    pub avg_prompt_tok_per_sec: f32,
    pub avg_compl_tok_per_sec: f32,
}
```
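The three throughput fields in `Usage` are per-phase and overall averages. A hedged sketch of how they could be derived from token counts and wall-clock timings (field names from the struct above; the actual mistral.rs computation may differ):

```python
# Hypothetical derivation of the Usage throughput fields: tokens divided by
# elapsed seconds for the prompt phase, the completion phase, and both combined.

def usage(prompt_tokens, completion_tokens, prompt_secs, compl_secs):
    total = prompt_tokens + completion_tokens
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": total,
        "avg_prompt_tok_per_sec": prompt_tokens / prompt_secs,
        "avg_compl_tok_per_sec": completion_tokens / compl_secs,
        "avg_tok_per_sec": total / (prompt_secs + compl_secs),
    }

print(usage(100, 50, 0.5, 2.5))
```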
2 changes: 1 addition & 1 deletion examples/python/phi3v.py
@@ -19,7 +19,7 @@
{
"type": "image_url",
"image_url": {
- "url": "https://upload.wikimedia.org/wikipedia/commons/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg"
+ "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg"
},
},
{
2 changes: 1 addition & 1 deletion examples/server/phi3v.py
@@ -44,7 +44,7 @@ def log_response(response: httpx.Response):
{
"type": "image_url",
"image_url": {
- "url": "https://upload.wikimedia.org/wikipedia/commons/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg"
+ "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg"
},
},
{
2 changes: 1 addition & 1 deletion mistralrs-bench/Cargo.toml
@@ -17,7 +17,7 @@ candle-core.workspace = true
serde.workspace = true
serde_json.workspace = true
clap.workspace = true
- mistralrs-core = { version = "0.1.15", path = "../mistralrs-core" }
+ mistralrs-core = { version = "0.1.16", path = "../mistralrs-core" }
tracing.workspace = true
either.workspace = true
tokio.workspace = true
12 changes: 5 additions & 7 deletions mistralrs-core/Cargo.toml
@@ -17,7 +17,6 @@ candle-core.workspace = true
candle-nn.workspace = true
serde.workspace = true
serde_json.workspace = true
- candle-transformers = { git = "https://github.com/EricLBuehler/candle.git", version = "0.5.0" }
candle-flash-attn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.5.0", optional = true }
dirs = "5.0.1"
hf-hub = "0.3.2"
@@ -60,18 +59,17 @@ akin = "0.4.0"
variantly = "0.4.0"
buildstructor = "0.5.4"
tracing-subscriber.workspace = true
reqwest.workspace = true
derive-new = "0.6.0"
itertools = "0.13.0"
mistralrs-vision = { version = "0.1.13", path = "../mistralrs-vision" }

[features]
pyo3_macros = ["pyo3"]
- cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
+ cuda = ["candle-core/cuda", "candle-nn/cuda"]
cudnn = ["candle-core/cudnn"]
- metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]
- flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"]
- accelerate = ["candle-core/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
- mkl = ["candle-core/mkl", "candle-nn/mkl", "candle-transformers/mkl"]
+ metal = ["candle-core/metal", "candle-nn/metal"]
+ flash-attn = ["cuda", "dep:candle-flash-attn"]
+ accelerate = ["candle-core/accelerate", "candle-nn/accelerate"]
+ mkl = ["candle-core/mkl", "candle-nn/mkl"]
profile = []
profile = []

5 changes: 5 additions & 0 deletions mistralrs-core/README.md
@@ -0,0 +1,5 @@
# `mistralrs-core`

Core crate of `mistral.rs` including the models and associated executors.

Documentation: https://ericlbuehler.github.io/mistral.rs/mistralrs/
2 changes: 2 additions & 0 deletions mistralrs-core/src/device_map.rs
@@ -6,6 +6,7 @@ use serde::Deserialize;
use tracing::info;

#[derive(Debug, Default, Deserialize, Clone)]
/// Metadata to initialize the device mapper.
pub struct DeviceMapMetadata {
device_layers: Option<usize>,
host_layers: Option<usize>,
@@ -80,6 +81,7 @@ pub trait DeviceMapper: Debug {
}

#[derive(Debug)]
/// A device mapper which does device mapping per hidden layer.
pub struct LayerDeviceMapper {
mappings: Vec<Device>,
nm_device: Device,