diff --git a/src/DeepInfra/src/DeepInfraModelIds.cs b/src/DeepInfra/src/DeepInfraModelIds.cs index 7233a2d..66f7ff3 100644 --- a/src/DeepInfra/src/DeepInfraModelIds.cs +++ b/src/DeepInfra/src/DeepInfraModelIds.cs @@ -7,15 +7,37 @@ public enum DeepInfraModelIds { /// - /// Name: Llama-3.3-70B-Instruct
- /// Organization: meta-llama
+ /// Name: DeepSeek-R1
+ /// Organization: deepseek-ai
+ /// Context Length: 16000
+ /// Prompt Cost: $0.75/MTok
+ /// Completion Cost: $2.40/MTok
+ /// Description: We introduce DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks.
+ /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1 + ///
+ DeepseekR1, + + /// + /// Name: DeepSeek-R1-Distill-Llama-70B
+ /// Organization: deepseek-ai
/// Context Length: 131072
/// Prompt Cost: $0.23/MTok
+ /// Completion Cost: $0.69/MTok
- /// Description: Llama 3.3-70B is a multilingual LLM trained on a massive dataset of 15 trillion tokens, fine-tuned for instruction-following and conversational dialogue. The model is designed to be helpful, safe, and flexible, with a focus on responsible deployment and mitigating potential risks such as bias, toxicity, and misinformation. It achieves state-of-the-art performance on various benchmarks, including conversational tasks, language translation, and text generation.
- /// HuggingFace Url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct + /// Description: DeepSeek-R1-Distill-Llama-70B is a highly efficient language model that leverages knowledge distillation to achieve state-of-the-art performance. This model distills the reasoning patterns of larger models into a smaller, more agile architecture, resulting in exceptional results on benchmarks like AIME 2024, MATH-500, and LiveCodeBench. With 70 billion parameters, DeepSeek-R1-Distill-Llama-70B offers a unique balance of accuracy and efficiency, making it an ideal choice for a wide range of natural language processing tasks.
+ /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B ///
- Llama3370BInstruct, + DeepseekR1DistillLlama70B, + + /// + /// Name: DeepSeek-V3
+ /// Organization: deepseek-ai
+ /// Context Length: 16000
+ /// Prompt Cost: $0.49/MTok
+ /// Completion Cost: $0.89/MTok
+ /// Description:
+ /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-V3 + ///
+ DeepseekV3, /// /// Name: Llama-3.3-70B-Instruct-Turbo
@@ -29,48 +51,48 @@ public enum DeepInfraModelIds Llama3370BInstructTurbo, /// - /// Name: phi-4
- /// Organization: microsoft
- /// Context Length: 16384
- /// Prompt Cost: $0.07/MTok
- /// Completion Cost: $0.07/MTok
- /// Description: Phi-4 is a model built upon a blend of synthetic datasets, data from filtered public domain websites, and acquired academic books and Q&A datasets. The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning.
- /// HuggingFace Url: https://huggingface.co/microsoft/phi-4 + /// Name: Llama-3.3-70B-Instruct
+ /// Organization: meta-llama
+ /// Context Length: 131072
+ /// Prompt Cost: $0.23/MTok
+ /// Completion Cost: $0.40/MTok
+ /// Description: Llama 3.3-70B is a multilingual LLM trained on a massive dataset of 15 trillion tokens, fine-tuned for instruction-following and conversational dialogue. The model is designed to be helpful, safe, and flexible, with a focus on responsible deployment and mitigating potential risks such as bias, toxicity, and misinformation. It achieves state-of-the-art performance on various benchmarks, including conversational tasks, language translation, and text generation.
+ /// HuggingFace Url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct ///
- Phi4, + Llama3370BInstruct, /// - /// Name: DeepSeek-V3
- /// Organization: deepseek-ai
- /// Context Length: 16000
- /// Prompt Cost: $0.85/MTok
- /// Completion Cost: $0.85/MTok
- /// Description:
- /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-V3 + /// Name: Mistral-Small-24B-Instruct-2501
+ /// Organization: mistralai
+ /// Context Length: 32768
+ /// Prompt Cost: $0.07/MTok
+ /// Completion Cost: $0.14/MTok
+ /// Description: Mistral Small 3 is a 24B-parameter language model optimized for low-latency performance across common AI tasks. Released under the Apache 2.0 license, it features both pre-trained and instruction-tuned versions designed for efficient local deployment. The model achieves 81% accuracy on the MMLU benchmark and performs competitively with larger models like Llama 3.3 70B and Qwen 32B, while operating at three times the speed on equivalent hardware.
+ /// HuggingFace Url: https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501 ///
- DeepseekV3, + MistralSmall24BInstruct2501, /// - /// Name: DeepSeek-R1
+ /// Name: DeepSeek-R1-Distill-Qwen-32B
/// Organization: deepseek-ai
- /// Context Length: 16000
- /// Prompt Cost: $0.85/MTok
- /// Completion Cost: $0.85/MTok
- /// Description: DeepSeek-R1-Zero is a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrated remarkable performance on reasoning.
- /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1 + /// Context Length: 131072
+ /// Prompt Cost: $0.12/MTok
+ /// Completion Cost: $0.18/MTok
+ /// Description: DeepSeek R1 Distill Qwen 32B is a distilled large language model based on Qwen 2.5 32B, using outputs from DeepSeek R1. It outperforms OpenAI's o1-mini across various benchmarks, achieving new state-of-the-art results for dense models. Other benchmark results include: AIME 2024: 72.6 | MATH-500: 94.3 | CodeForces Rating: 1691.
+ /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ///
- DeepseekR1, + DeepseekR1DistillQwen32B, /// - /// Name: DeepSeek-R1-Distill-Llama-70B
- /// Organization: deepseek-ai
- /// Context Length: 131072
- /// Prompt Cost: $0.23/MTok
- /// Completion Cost: $0.23/MTok
- /// Description: DeepSeek-R1-Distill-Llama-70B is a highly efficient language model that leverages knowledge distillation to achieve state-of-the-art performance. This model distills the reasoning patterns of larger models into a smaller, more agile architecture, resulting in exceptional results on benchmarks like AIME 2024, MATH-500, and LiveCodeBench. With 70 billion parameters, DeepSeek-R1-Distill-Llama-70B offers a unique balance of accuracy and efficiency, making it an ideal choice for a wide range of natural language processing tasks.
- /// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B + /// Name: phi-4
+ /// Organization: microsoft
+ /// Context Length: 16384
+ /// Prompt Cost: $0.07/MTok
+ /// Completion Cost: $0.14/MTok
+ /// Description: Phi-4 is a model built upon a blend of synthetic datasets, data from filtered public domain websites, and acquired academic books and Q&A datasets. The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning.
+ /// HuggingFace Url: https://huggingface.co/microsoft/phi-4 ///
- DeepseekR1DistillLlama70B, + Phi4, /// /// Name: Meta-Llama-3.1-70B-Instruct
diff --git a/src/DeepInfra/src/DeepInfraModelProvider.cs b/src/DeepInfra/src/DeepInfraModelProvider.cs index 9924a47..cc58a2c 100644 --- a/src/DeepInfra/src/DeepInfraModelProvider.cs +++ b/src/DeepInfra/src/DeepInfraModelProvider.cs @@ -9,12 +9,14 @@ public static class DeepInfraModelProvider { private static Dictionary Models { get; set; } = new() { - { DeepInfraModelIds.Llama3370BInstruct, ToMetadata("meta-llama/Llama-3.3-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)}, + { DeepInfraModelIds.DeepseekR1, ToMetadata("deepseek-ai/DeepSeek-R1",16000,7.5E-07,2.4E-06)}, + { DeepInfraModelIds.DeepseekR1DistillLlama70B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Llama-70B",131072,2.3000000000000002E-07,6.9E-07)}, + { DeepInfraModelIds.DeepseekV3, ToMetadata("deepseek-ai/DeepSeek-V3",16000,4.9E-07,8.900000000000001E-07)}, { DeepInfraModelIds.Llama3370BInstructTurbo, ToMetadata("meta-llama/Llama-3.3-70B-Instruct-Turbo",131072,1.2E-07,3E-07)}, + { DeepInfraModelIds.Llama3370BInstruct, ToMetadata("meta-llama/Llama-3.3-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)}, + { DeepInfraModelIds.MistralSmall24BInstruct2501, ToMetadata("mistralai/Mistral-Small-24B-Instruct-2501",32768,7E-08,1.4E-07)}, + { DeepInfraModelIds.DeepseekR1DistillQwen32B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",131072,1.2E-07,1.8E-07)}, { DeepInfraModelIds.Phi4, ToMetadata("microsoft/phi-4",16384,7E-08,1.4E-07)}, - { DeepInfraModelIds.DeepseekV3, ToMetadata("deepseek-ai/DeepSeek-V3",16000,8.5E-07,9.000000000000001E-07)}, - { DeepInfraModelIds.DeepseekR1, ToMetadata("deepseek-ai/DeepSeek-R1",16000,8.5E-07,2.5E-06)}, - { DeepInfraModelIds.DeepseekR1DistillLlama70B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Llama-70B",131072,2.3000000000000002E-07,6.9E-07)}, { DeepInfraModelIds.MetaLlama3170BInstruct, ToMetadata("meta-llama/Meta-Llama-3.1-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)}, { DeepInfraModelIds.MetaLlama318BInstruct, 
ToMetadata("meta-llama/Meta-Llama-3.1-8B-Instruct",131072,3E-08,5.0000000000000004E-08)}, { DeepInfraModelIds.MetaLlama31405BInstruct, ToMetadata("meta-llama/Meta-Llama-3.1-405B-Instruct",32768,8.000000000000001E-07,8.000000000000001E-07)}, diff --git a/src/DeepInfra/src/Predefined/AllModels.cs b/src/DeepInfra/src/Predefined/AllModels.cs index 3422055..c905365 100644 --- a/src/DeepInfra/src/Predefined/AllModels.cs +++ b/src/DeepInfra/src/Predefined/AllModels.cs @@ -1,8 +1,18 @@ namespace LangChain.Providers.DeepInfra.Predefined; -/// +/// /// Deep Infra Provider Instance -public class Llama3370BInstructModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Llama3370BInstruct); +public class DeepseekR1Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1); + + +/// +/// Deep Infra Provider Instance +public class DeepseekR1DistillLlama70BModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1DistillLlama70B); + + +/// +/// Deep Infra Provider Instance +public class DeepseekV3Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekV3); /// @@ -10,24 +20,24 @@ public class Llama3370BInstructModel(DeepInfraProvider provider) : DeepInfraMode public class Llama3370BInstructTurboModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Llama3370BInstructTurbo); -/// +/// /// Deep Infra Provider Instance -public class Phi4Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Phi4); +public class Llama3370BInstructModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Llama3370BInstruct); -/// +/// /// Deep Infra Provider Instance -public class DeepseekV3Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekV3); +public class MistralSmall24BInstruct2501Model(DeepInfraProvider provider) : DeepInfraModel(provider, 
DeepInfraModelIds.MistralSmall24BInstruct2501); -/// +/// /// Deep Infra Provider Instance -public class DeepseekR1Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1); +public class DeepseekR1DistillQwen32BModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1DistillQwen32B); -/// +/// /// Deep Infra Provider Instance -public class DeepseekR1DistillLlama70BModel(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.DeepseekR1DistillLlama70B); +public class Phi4Model(DeepInfraProvider provider) : DeepInfraModel(provider, DeepInfraModelIds.Phi4); ///