
Commit

feat: Updated Deep Infra models
github-actions[bot] committed Feb 3, 2025
1 parent ec84762 commit 1ebef8e
Showing 3 changed files with 84 additions and 50 deletions.
94 changes: 58 additions & 36 deletions src/DeepInfra/src/DeepInfraModelIds.cs
@@ -7,15 +7,37 @@ public enum DeepInfraModelIds
 {
 
     /// <summary>
-    /// Name: Llama-3.3-70B-Instruct <br/>
-    /// Organization: meta-llama <br/>
+    /// Name: DeepSeek-R1 <br/>
+    /// Organization: deepseek-ai <br/>
+    /// Context Length: 16000 <br/>
+    /// Prompt Cost: $0.75/MTok <br/>
+    /// Completion Cost: $0.75/MTok <br/>
+    /// Description: We introduce DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks. <br/>
+    /// HuggingFace Url: <a href="https://huggingface.co/deepseek-ai/DeepSeek-R1">https://huggingface.co/deepseek-ai/DeepSeek-R1</a>
+    /// </summary>
+    DeepseekR1,
+
+    /// <summary>
+    /// Name: DeepSeek-R1-Distill-Llama-70B <br/>
+    /// Organization: deepseek-ai <br/>
     /// Context Length: 131072 <br/>
     /// Prompt Cost: $0.23/MTok <br/>
     /// Completion Cost: $0.23/MTok <br/>
-    /// Description: Llama 3.3-70B is a multilingual LLM trained on a massive dataset of 15 trillion tokens, fine-tuned for instruction-following and conversational dialogue. The model is designed to be helpful, safe, and flexible, with a focus on responsible deployment and mitigating potential risks such as bias, toxicity, and misinformation. It achieves state-of-the-art performance on various benchmarks, including conversational tasks, language translation, and text generation. <br/>
-    /// HuggingFace Url: <a href="https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct">https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct</a>
+    /// Description: DeepSeek-R1-Distill-Llama-70B is a highly efficient language model that leverages knowledge distillation to achieve state-of-the-art performance. This model distills the reasoning patterns of larger models into a smaller, more agile architecture, resulting in exceptional results on benchmarks like AIME 2024, MATH-500, and LiveCodeBench. With 70 billion parameters, DeepSeek-R1-Distill-Llama-70B offers a unique balance of accuracy and efficiency, making it an ideal choice for a wide range of natural language processing tasks. <br/>
+    /// HuggingFace Url: <a href="https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B">https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B</a>
     /// </summary>
-    Llama3370BInstruct,
+    DeepseekR1DistillLlama70B,
 
+    /// <summary>
+    /// Name: DeepSeek-V3 <br/>
+    /// Organization: deepseek-ai <br/>
+    /// Context Length: 16000 <br/>
+    /// Prompt Cost: $0.49/MTok <br/>
+    /// Completion Cost: $0.49/MTok <br/>
+    /// Description: <br/>
+    /// HuggingFace Url: <a href="https://huggingface.co/deepseek-ai/DeepSeek-V3">https://huggingface.co/deepseek-ai/DeepSeek-V3</a>
+    /// </summary>
+    DeepseekV3,
+
     /// <summary>
     /// Name: Llama-3.3-70B-Instruct-Turbo <br/>
@@ -29,48 +51,48 @@ public enum DeepInfraModelIds
     Llama3370BInstructTurbo,
 
     /// <summary>
-    /// Name: phi-4 <br/>
-    /// Organization: microsoft <br/>
-    /// Context Length: 16384 <br/>
-    /// Prompt Cost: $0.07/MTok <br/>
-    /// Completion Cost: $0.07/MTok <br/>
-    /// Description: Phi-4 is a model built upon a blend of synthetic datasets, data from filtered public domain websites, and acquired academic books and Q&amp;A datasets. The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning. <br/>
-    /// HuggingFace Url: <a href="https://huggingface.co/microsoft/phi-4">https://huggingface.co/microsoft/phi-4</a>
+    /// Name: Llama-3.3-70B-Instruct <br/>
+    /// Organization: meta-llama <br/>
+    /// Context Length: 131072 <br/>
+    /// Prompt Cost: $0.23/MTok <br/>
+    /// Completion Cost: $0.23/MTok <br/>
+    /// Description: Llama 3.3-70B is a multilingual LLM trained on a massive dataset of 15 trillion tokens, fine-tuned for instruction-following and conversational dialogue. The model is designed to be helpful, safe, and flexible, with a focus on responsible deployment and mitigating potential risks such as bias, toxicity, and misinformation. It achieves state-of-the-art performance on various benchmarks, including conversational tasks, language translation, and text generation. <br/>
+    /// HuggingFace Url: <a href="https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct">https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct</a>
     /// </summary>
-    Phi4,
+    Llama3370BInstruct,
 
     /// <summary>
-    /// Name: DeepSeek-V3 <br/>
-    /// Organization: deepseek-ai <br/>
-    /// Context Length: 16000 <br/>
-    /// Prompt Cost: $0.85/MTok <br/>
-    /// Completion Cost: $0.85/MTok <br/>
-    /// Description: <br/>
-    /// HuggingFace Url: <a href="https://huggingface.co/deepseek-ai/DeepSeek-V3">https://huggingface.co/deepseek-ai/DeepSeek-V3</a>
+    /// Name: Mistral-Small-24B-Instruct-2501 <br/>
+    /// Organization: mistralai <br/>
+    /// Context Length: 32768 <br/>
+    /// Prompt Cost: $0.07/MTok <br/>
+    /// Completion Cost: $0.07/MTok <br/>
+    /// Description: Mistral Small 3 is a 24B-parameter language model optimized for low-latency performance across common AI tasks. Released under the Apache 2.0 license, it features both pre-trained and instruction-tuned versions designed for efficient local deployment. The model achieves 81% accuracy on the MMLU benchmark and performs competitively with larger models like Llama 3.3 70B and Qwen 32B, while operating at three times the speed on equivalent hardware. <br/>
+    /// HuggingFace Url: <a href="https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501">https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501</a>
     /// </summary>
-    DeepseekV3,
+    MistralSmall24BInstruct2501,
 
     /// <summary>
-    /// Name: DeepSeek-R1 <br/>
+    /// Name: DeepSeek-R1-Distill-Qwen-32B <br/>
     /// Organization: deepseek-ai <br/>
-    /// Context Length: 16000 <br/>
-    /// Prompt Cost: $0.85/MTok <br/>
-    /// Completion Cost: $0.85/MTok <br/>
-    /// Description: DeepSeek-R1-Zero is a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrated remarkable performance on reasoning. <br/>
-    /// HuggingFace Url: <a href="https://huggingface.co/deepseek-ai/DeepSeek-R1">https://huggingface.co/deepseek-ai/DeepSeek-R1</a>
+    /// Context Length: 131072 <br/>
+    /// Prompt Cost: $0.12/MTok <br/>
+    /// Completion Cost: $0.12/MTok <br/>
+    /// Description: DeepSeek R1 Distill Qwen 32B is a distilled large language model based on Qwen 2.5 32B, using outputs from DeepSeek R1. It outperforms OpenAI's o1-mini across various benchmarks, achieving new state-of-the-art results for dense models. Other benchmark results include: AIME 2024: 72.6 | MATH-500: 94.3 | CodeForces Rating: 1691. <br/>
+    /// HuggingFace Url: <a href="https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B">https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B</a>
     /// </summary>
-    DeepseekR1,
+    DeepseekR1DistillQwen32B,
 
     /// <summary>
-    /// Name: DeepSeek-R1-Distill-Llama-70B <br/>
-    /// Organization: deepseek-ai <br/>
-    /// Context Length: 131072 <br/>
-    /// Prompt Cost: $0.23/MTok <br/>
-    /// Completion Cost: $0.23/MTok <br/>
-    /// Description: DeepSeek-R1-Distill-Llama-70B is a highly efficient language model that leverages knowledge distillation to achieve state-of-the-art performance. This model distills the reasoning patterns of larger models into a smaller, more agile architecture, resulting in exceptional results on benchmarks like AIME 2024, MATH-500, and LiveCodeBench. With 70 billion parameters, DeepSeek-R1-Distill-Llama-70B offers a unique balance of accuracy and efficiency, making it an ideal choice for a wide range of natural language processing tasks. <br/>
-    /// HuggingFace Url: <a href="https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B">https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B</a>
+    /// Name: phi-4 <br/>
+    /// Organization: microsoft <br/>
+    /// Context Length: 16384 <br/>
+    /// Prompt Cost: $0.07/MTok <br/>
+    /// Completion Cost: $0.07/MTok <br/>
+    /// Description: Phi-4 is a model built upon a blend of synthetic datasets, data from filtered public domain websites, and acquired academic books and Q&amp;A datasets. The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning. <br/>
+    /// HuggingFace Url: <a href="https://huggingface.co/microsoft/phi-4">https://huggingface.co/microsoft/phi-4</a>
     /// </summary>
-    DeepseekR1DistillLlama70B,
+    Phi4,
 
     /// <summary>
     /// Name: Meta-Llama-3.1-70B-Instruct <br/>
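
These ids are what callers pass to select a model. A minimal sketch of choosing between the newly added DeepSeek ids, using the context lengths documented in the diff above (the ModelPicker helper is illustrative, not part of the library):

using LangChain.Providers.DeepInfra;

public static class ModelPicker
{
    // Per the XML docs above, DeepSeek-R1 carries a 16000-token context,
    // while the Llama-70B distill keeps the full 131072-token window at a
    // lower per-token price, so long prompts are routed to the distill.
    public static DeepInfraModelIds Pick(int promptTokens) =>
        promptTokens > 16_000
            ? DeepInfraModelIds.DeepseekR1DistillLlama70B
            : DeepInfraModelIds.DeepseekR1;
}
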
10 changes: 6 additions & 4 deletions src/DeepInfra/src/DeepInfraModelProvider.cs
@@ -9,12 +9,14 @@ public static class DeepInfraModelProvider
 {
     private static Dictionary<DeepInfraModelIds, ChatModelMetadata> Models { get; set; } = new()
     {
-        { DeepInfraModelIds.Llama3370BInstruct, ToMetadata("meta-llama/Llama-3.3-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)},
+        { DeepInfraModelIds.DeepseekR1, ToMetadata("deepseek-ai/DeepSeek-R1",16000,7.5E-07,2.4E-06)},
+        { DeepInfraModelIds.DeepseekR1DistillLlama70B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Llama-70B",131072,2.3000000000000002E-07,6.9E-07)},
+        { DeepInfraModelIds.DeepseekV3, ToMetadata("deepseek-ai/DeepSeek-V3",16000,4.9E-07,8.900000000000001E-07)},
         { DeepInfraModelIds.Llama3370BInstructTurbo, ToMetadata("meta-llama/Llama-3.3-70B-Instruct-Turbo",131072,1.2E-07,3E-07)},
+        { DeepInfraModelIds.Llama3370BInstruct, ToMetadata("meta-llama/Llama-3.3-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)},
+        { DeepInfraModelIds.MistralSmall24BInstruct2501, ToMetadata("mistralai/Mistral-Small-24B-Instruct-2501",32768,7E-08,1.4E-07)},
+        { DeepInfraModelIds.DeepseekR1DistillQwen32B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",131072,1.2E-07,1.8E-07)},
         { DeepInfraModelIds.Phi4, ToMetadata("microsoft/phi-4",16384,7E-08,1.4E-07)},
-        { DeepInfraModelIds.DeepseekV3, ToMetadata("deepseek-ai/DeepSeek-V3",16000,8.5E-07,9.000000000000001E-07)},
-        { DeepInfraModelIds.DeepseekR1, ToMetadata("deepseek-ai/DeepSeek-R1",16000,8.5E-07,2.5E-06)},
-        { DeepInfraModelIds.DeepseekR1DistillLlama70B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Llama-70B",131072,2.3000000000000002E-07,6.9E-07)},
         { DeepInfraModelIds.MetaLlama3170BInstruct, ToMetadata("meta-llama/Meta-Llama-3.1-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)},
         { DeepInfraModelIds.MetaLlama318BInstruct, ToMetadata("meta-llama/Meta-Llama-3.1-8B-Instruct",131072,3E-08,5.0000000000000004E-08)},
         { DeepInfraModelIds.MetaLlama31405BInstruct, ToMetadata("meta-llama/Meta-Llama-3.1-405B-Instruct",32768,8.000000000000001E-07,8.000000000000001E-07)},
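
Read against the XML docs in DeepInfraModelIds.cs, the ToMetadata arguments appear to be the provider id, the context length, and USD-per-token prices for prompt and completion; long doubles such as 2.3000000000000002E-07 are simply the nearest-double form of $0.23 per million tokens. By that reading, DeepSeek-R1's completion price (2.4E-06) works out to $2.40/MTok, not the $0.75/MTok the doc comment repeats from the prompt price. A small sketch of the conversion, using the values from the table above (the CostCheck program is illustrative, not part of the repository):

using System;

class CostCheck
{
    static void Main()
    {
        // Values copied from the ToMetadata entry for deepseek-ai/DeepSeek-R1 above.
        const double promptUsdPerToken = 7.5E-07;
        const double completionUsdPerToken = 2.4E-06;

        // USD per token -> USD per million tokens ($/MTok).
        Console.WriteLine($"prompt:     ${promptUsdPerToken * 1_000_000:0.00}/MTok");     // $0.75/MTok
        Console.WriteLine($"completion: ${completionUsdPerToken * 1_000_000:0.00}/MTok"); // $2.40/MTok
    }
}
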
30 changes: 20 additions & 10 deletions src/DeepInfra/src/Predefined/AllModels.cs
@@ -1,33 +1,43 @@
 namespace LangChain.Providers.DeepInfra.Predefined;
 
-/// <inheritdoc cref="DeepInfraModelIds.Llama3370BInstruct"/>
+/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1"/>
 /// <param name="provider">Deep Infra Provider Instance</param>
-public class Llama3370BInstructModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Llama3370BInstruct);
+public class DeepseekR1Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1);
 
 
+/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1DistillLlama70B"/>
+/// <param name="provider">Deep Infra Provider Instance</param>
+public class DeepseekR1DistillLlama70BModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1DistillLlama70B);
+
+
+/// <inheritdoc cref="DeepInfraModelIds.DeepseekV3"/>
+/// <param name="provider">Deep Infra Provider Instance</param>
+public class DeepseekV3Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekV3);
+
+
 /// <inheritdoc cref="DeepInfraModelIds.Llama3370BInstructTurbo"/>
 /// <param name="provider">Deep Infra Provider Instance</param>
 public class Llama3370BInstructTurboModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Llama3370BInstructTurbo);
 
 
-/// <inheritdoc cref="DeepInfraModelIds.Phi4"/>
+/// <inheritdoc cref="DeepInfraModelIds.Llama3370BInstruct"/>
 /// <param name="provider">Deep Infra Provider Instance</param>
-public class Phi4Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Phi4);
+public class Llama3370BInstructModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Llama3370BInstruct);
 
 
-/// <inheritdoc cref="DeepInfraModelIds.DeepseekV3"/>
+/// <inheritdoc cref="DeepInfraModelIds.MistralSmall24BInstruct2501"/>
 /// <param name="provider">Deep Infra Provider Instance</param>
-public class DeepseekV3Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekV3);
+public class MistralSmall24BInstruct2501Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.MistralSmall24BInstruct2501);
 
 
-/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1"/>
+/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1DistillQwen32B"/>
 /// <param name="provider">Deep Infra Provider Instance</param>
-public class DeepseekR1Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1);
+public class DeepseekR1DistillQwen32BModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1DistillQwen32B);
 
 
-/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1DistillLlama70B"/>
+/// <inheritdoc cref="DeepInfraModelIds.Phi4"/>
 /// <param name="provider">Deep Infra Provider Instance</param>
-public class DeepseekR1DistillLlama70BModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1DistillLlama70B);
+public class Phi4Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Phi4);
 
 
 /// <inheritdoc cref="DeepInfraModelIds.MetaLlama3170BInstruct"/>
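
Each predefined class binds a provider instance to one enum id through a C# 12 primary constructor. A minimal usage sketch (the DeepInfraProvider constructor argument and the GenerateAsync call are assumed API shapes, not taken from this commit):

using System;
using LangChain.Providers.DeepInfra;
using LangChain.Providers.DeepInfra.Predefined;

// Assumption: the provider is constructed from a Deep Infra API key.
var provider = new DeepInfraProvider(apiKey: "DEEPINFRA_API_KEY");

// Grounded in the diff: the wrapper forwards the enum id to DeepInfraModel.
var model = new DeepseekR1DistillQwen32BModel(provider);

// Assumed chat-model surface; substitute the library's actual request type.
var reply = await model.GenerateAsync("Summarize knowledge distillation in one sentence.");
Console.WriteLine(reply);
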
