diff --git a/src/DeepInfra/src/DeepInfraModelIds.cs b/src/DeepInfra/src/DeepInfraModelIds.cs
index 7233a2d..66f7ff3 100644
--- a/src/DeepInfra/src/DeepInfraModelIds.cs
+++ b/src/DeepInfra/src/DeepInfraModelIds.cs
@@ -7,15 +7,37 @@ public enum DeepInfraModelIds
{
/// <summary>
- /// Name: Llama-3.3-70B-Instruct
- /// Organization: meta-llama
+ /// Name: DeepSeek-R1
+ /// Organization: deepseek-ai
+ /// Context Length: 16000
+ /// Prompt Cost: $0.75/MTok
+ /// Completion Cost: $0.75/MTok
+ /// Description: We introduce DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks.
+ /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1
+ /// </summary>
+ DeepseekR1,
+
+ /// <summary>
+ /// Name: DeepSeek-R1-Distill-Llama-70B
+ /// Organization: deepseek-ai
/// Context Length: 131072
/// Prompt Cost: $0.23/MTok
/// Completion Cost: $0.23/MTok
- /// Description: Llama 3.3-70B is a multilingual LLM trained on a massive dataset of 15 trillion tokens, fine-tuned for instruction-following and conversational dialogue. The model is designed to be helpful, safe, and flexible, with a focus on responsible deployment and mitigating potential risks such as bias, toxicity, and misinformation. It achieves state-of-the-art performance on various benchmarks, including conversational tasks, language translation, and text generation.
- /// HuggingFace Url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+ /// Description: DeepSeek-R1-Distill-Llama-70B is a highly efficient language model that leverages knowledge distillation to achieve state-of-the-art performance. This model distills the reasoning patterns of larger models into a smaller, more agile architecture, resulting in exceptional results on benchmarks like AIME 2024, MATH-500, and LiveCodeBench. With 70 billion parameters, DeepSeek-R1-Distill-Llama-70B offers a unique balance of accuracy and efficiency, making it an ideal choice for a wide range of natural language processing tasks.
+ /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B
/// </summary>
- Llama3370BInstruct,
+ DeepseekR1DistillLlama70B,
+
+ /// <summary>
+ /// Name: DeepSeek-V3
+ /// Organization: deepseek-ai
+ /// Context Length: 16000
+ /// Prompt Cost: $0.49/MTok
+ /// Completion Cost: $0.49/MTok
+ /// Description:
+ /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-V3
+ /// </summary>
+ DeepseekV3,
/// <summary>
/// Name: Llama-3.3-70B-Instruct-Turbo
@@ -29,48 +51,48 @@ public enum DeepInfraModelIds
Llama3370BInstructTurbo,
/// <summary>
- /// Name: phi-4
- /// Organization: microsoft
- /// Context Length: 16384
- /// Prompt Cost: $0.07/MTok
- /// Completion Cost: $0.07/MTok
- /// Description: Phi-4 is a model built upon a blend of synthetic datasets, data from filtered public domain websites, and acquired academic books and Q&A datasets. The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning.
- /// HuggingFace Url: https://huggingface.co/microsoft/phi-4
+ /// Name: Llama-3.3-70B-Instruct
+ /// Organization: meta-llama
+ /// Context Length: 131072
+ /// Prompt Cost: $0.23/MTok
+ /// Completion Cost: $0.23/MTok
+ /// Description: Llama 3.3-70B is a multilingual LLM trained on a massive dataset of 15 trillion tokens, fine-tuned for instruction-following and conversational dialogue. The model is designed to be helpful, safe, and flexible, with a focus on responsible deployment and mitigating potential risks such as bias, toxicity, and misinformation. It achieves state-of-the-art performance on various benchmarks, including conversational tasks, language translation, and text generation.
+ /// HuggingFace Url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
/// </summary>
- Phi4,
+ Llama3370BInstruct,
/// <summary>
- /// Name: DeepSeek-V3
- /// Organization: deepseek-ai
- /// Context Length: 16000
- /// Prompt Cost: $0.85/MTok
- /// Completion Cost: $0.85/MTok
- /// Description:
- /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-V3
+ /// Name: Mistral-Small-24B-Instruct-2501
+ /// Organization: mistralai
+ /// Context Length: 32768
+ /// Prompt Cost: $0.07/MTok
+ /// Completion Cost: $0.07/MTok
+ /// Description: Mistral Small 3 is a 24B-parameter language model optimized for low-latency performance across common AI tasks. Released under the Apache 2.0 license, it features both pre-trained and instruction-tuned versions designed for efficient local deployment. The model achieves 81% accuracy on the MMLU benchmark and performs competitively with larger models like Llama 3.3 70B and Qwen 32B, while operating at three times the speed on equivalent hardware.
+ /// HuggingFace Url: https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501
/// </summary>
- DeepseekV3,
+ MistralSmall24BInstruct2501,
/// <summary>
- /// Name: DeepSeek-R1
+ /// Name: DeepSeek-R1-Distill-Qwen-32B
/// Organization: deepseek-ai
- /// Context Length: 16000
- /// Prompt Cost: $0.85/MTok
- /// Completion Cost: $0.85/MTok
- /// Description: DeepSeek-R1-Zero is a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrated remarkable performance on reasoning.
- /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1
+ /// Context Length: 131072
+ /// Prompt Cost: $0.12/MTok
+ /// Completion Cost: $0.12/MTok
+ /// Description: DeepSeek R1 Distill Qwen 32B is a distilled large language model based on Qwen 2.5 32B, using outputs from DeepSeek R1. It outperforms OpenAI's o1-mini across various benchmarks, achieving new state-of-the-art results for dense models. Other benchmark results include: AIME 2024: 72.6 | MATH-500: 94.3 | CodeForces Rating: 1691.
+ /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
/// </summary>
- DeepseekR1,
+ DeepseekR1DistillQwen32B,
/// <summary>
- /// Name: DeepSeek-R1-Distill-Llama-70B
- /// Organization: deepseek-ai
- /// Context Length: 131072
- /// Prompt Cost: $0.23/MTok
- /// Completion Cost: $0.23/MTok
- /// Description: DeepSeek-R1-Distill-Llama-70B is a highly efficient language model that leverages knowledge distillation to achieve state-of-the-art performance. This model distills the reasoning patterns of larger models into a smaller, more agile architecture, resulting in exceptional results on benchmarks like AIME 2024, MATH-500, and LiveCodeBench. With 70 billion parameters, DeepSeek-R1-Distill-Llama-70B offers a unique balance of accuracy and efficiency, making it an ideal choice for a wide range of natural language processing tasks.
- /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ /// Name: phi-4
+ /// Organization: microsoft
+ /// Context Length: 16384
+ /// Prompt Cost: $0.07/MTok
+ /// Completion Cost: $0.07/MTok
+ /// Description: Phi-4 is a model built upon a blend of synthetic datasets, data from filtered public domain websites, and acquired academic books and Q&A datasets. The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning.
+ /// HuggingFace Url: https://huggingface.co/microsoft/phi-4
/// </summary>
- DeepseekR1DistillLlama70B,
+ Phi4,
/// <summary>
/// Name: Meta-Llama-3.1-70B-Instruct
diff --git a/src/DeepInfra/src/DeepInfraModelProvider.cs b/src/DeepInfra/src/DeepInfraModelProvider.cs
index 9924a47..cc58a2c 100644
--- a/src/DeepInfra/src/DeepInfraModelProvider.cs
+++ b/src/DeepInfra/src/DeepInfraModelProvider.cs
@@ -9,12 +9,14 @@ public static class DeepInfraModelProvider
{
private static Dictionary<DeepInfraModelIds, ChatModels> Models { get; set; } = new()
{
- { DeepInfraModelIds.Llama3370BInstruct, ToMetadata("meta-llama/Llama-3.3-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)},
+ { DeepInfraModelIds.DeepseekR1, ToMetadata("deepseek-ai/DeepSeek-R1",16000,7.5E-07,2.4E-06)},
+ { DeepInfraModelIds.DeepseekR1DistillLlama70B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Llama-70B",131072,2.3000000000000002E-07,6.9E-07)},
+ { DeepInfraModelIds.DeepseekV3, ToMetadata("deepseek-ai/DeepSeek-V3",16000,4.9E-07,8.900000000000001E-07)},
{ DeepInfraModelIds.Llama3370BInstructTurbo, ToMetadata("meta-llama/Llama-3.3-70B-Instruct-Turbo",131072,1.2E-07,3E-07)},
+ { DeepInfraModelIds.Llama3370BInstruct, ToMetadata("meta-llama/Llama-3.3-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)},
+ { DeepInfraModelIds.MistralSmall24BInstruct2501, ToMetadata("mistralai/Mistral-Small-24B-Instruct-2501",32768,7E-08,1.4E-07)},
+ { DeepInfraModelIds.DeepseekR1DistillQwen32B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",131072,1.2E-07,1.8E-07)},
{ DeepInfraModelIds.Phi4, ToMetadata("microsoft/phi-4",16384,7E-08,1.4E-07)},
- { DeepInfraModelIds.DeepseekV3, ToMetadata("deepseek-ai/DeepSeek-V3",16000,8.5E-07,9.000000000000001E-07)},
- { DeepInfraModelIds.DeepseekR1, ToMetadata("deepseek-ai/DeepSeek-R1",16000,8.5E-07,2.5E-06)},
- { DeepInfraModelIds.DeepseekR1DistillLlama70B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Llama-70B",131072,2.3000000000000002E-07,6.9E-07)},
{ DeepInfraModelIds.MetaLlama3170BInstruct, ToMetadata("meta-llama/Meta-Llama-3.1-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)},
{ DeepInfraModelIds.MetaLlama318BInstruct, ToMetadata("meta-llama/Meta-Llama-3.1-8B-Instruct",131072,3E-08,5.0000000000000004E-08)},
{ DeepInfraModelIds.MetaLlama31405BInstruct, ToMetadata("meta-llama/Meta-Llama-3.1-405B-Instruct",32768,8.000000000000001E-07,8.000000000000001E-07)},
diff --git a/src/DeepInfra/src/Predefined/AllModels.cs b/src/DeepInfra/src/Predefined/AllModels.cs
index 3422055..c905365 100644
--- a/src/DeepInfra/src/Predefined/AllModels.cs
+++ b/src/DeepInfra/src/Predefined/AllModels.cs
@@ -1,8 +1,18 @@
namespace LangChain.Providers.DeepInfra.Predefined;
-/// <inheritdoc cref="DeepInfraModelIds.Llama3370BInstruct"/>
+/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1"/>
/// Deep Infra Provider Instance
-public class Llama3370BInstructModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Llama3370BInstruct);
+public class DeepseekR1Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1);
+
+
+/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1DistillLlama70B"/>
+/// Deep Infra Provider Instance
+public class DeepseekR1DistillLlama70BModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1DistillLlama70B);
+
+
+/// <inheritdoc cref="DeepInfraModelIds.DeepseekV3"/>
+/// Deep Infra Provider Instance
+public class DeepseekV3Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekV3);
/// <inheritdoc cref="DeepInfraModelIds.Llama3370BInstructTurbo"/>
@@ -10,24 +20,24 @@ public class Llama3370BInstructModel(DeepInfraProvider provider) : DeepInfraMode
public class Llama3370BInstructTurboModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Llama3370BInstructTurbo);
-/// <inheritdoc cref="DeepInfraModelIds.Phi4"/>
+/// <inheritdoc cref="DeepInfraModelIds.Llama3370BInstruct"/>
/// Deep Infra Provider Instance
-public class Phi4Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Phi4);
+public class Llama3370BInstructModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Llama3370BInstruct);
-/// <inheritdoc cref="DeepInfraModelIds.DeepseekV3"/>
+/// <inheritdoc cref="DeepInfraModelIds.MistralSmall24BInstruct2501"/>
/// Deep Infra Provider Instance
-public class DeepseekV3Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekV3);
+public class MistralSmall24BInstruct2501Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.MistralSmall24BInstruct2501);
-/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1"/>
+/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1DistillQwen32B"/>
/// Deep Infra Provider Instance
-public class DeepseekR1Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1);
+public class DeepseekR1DistillQwen32BModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1DistillQwen32B);
-/// <inheritdoc cref="DeepInfraModelIds.DeepseekR1DistillLlama70B"/>
+/// <inheritdoc cref="DeepInfraModelIds.Phi4"/>
/// Deep Infra Provider Instance
-public class DeepseekR1DistillLlama70BModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1DistillLlama70B);
+public class Phi4Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Phi4);
///