From fcaa73c09abce6b0ff0ae230d5a970f2e3f57635 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 10 Feb 2025 06:46:14 +0000 Subject: [PATCH] feat: Updated Deep Infra models --- src/DeepInfra/src/DeepInfraModelIds.cs | 8 ++++---- src/DeepInfra/src/DeepInfraModelProvider.cs | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/DeepInfra/src/DeepInfraModelIds.cs b/src/DeepInfra/src/DeepInfraModelIds.cs index 66f7ff3..bbaee46 100644 --- a/src/DeepInfra/src/DeepInfraModelIds.cs +++ b/src/DeepInfra/src/DeepInfraModelIds.cs @@ -9,7 +9,7 @@ public enum DeepInfraModelIds /// /// Name: DeepSeek-R1
/// Organization: deepseek-ai
- /// Context Length: 16000
+ /// Context Length: 32768
/// Prompt Cost: $0.75/MTok
/// Completion Cost: $0.75/MTok
/// Description: We introduce DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks.
@@ -31,10 +31,10 @@ public enum DeepInfraModelIds /// /// Name: DeepSeek-V3
/// Organization: deepseek-ai
- /// Context Length: 16000
+ /// Context Length: 32768
/// Prompt Cost: $0.49/MTok
/// Completion Cost: $0.49/MTok
- /// Description:
+ /// Description: DeepSeek-V3 is a strong Mixture-of-Experts (MoE) language model with 671B total parameters, of which 37B are activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts the Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2.
/// HuggingFace Url: https://huggingface.co/deepseek-ai/DeepSeek-V3 ///
DeepseekV3, @@ -328,7 +328,7 @@ public enum DeepInfraModelIds /// /// Name: QVQ-72B-Preview
/// Organization: Qwen
- /// Context Length: 128000
+ /// Context Length: 32000
/// Prompt Cost: $0.25/MTok
/// Completion Cost: $0.25/MTok
/// Description: QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities. QVQ-72B-Preview has achieved remarkable performance on various benchmarks. It scored a remarkable 70.3% on the Multimodal Massive Multi-task Understanding (MMMU) benchmark
diff --git a/src/DeepInfra/src/DeepInfraModelProvider.cs b/src/DeepInfra/src/DeepInfraModelProvider.cs index cc58a2c..8070d67 100644 --- a/src/DeepInfra/src/DeepInfraModelProvider.cs +++ b/src/DeepInfra/src/DeepInfraModelProvider.cs @@ -9,9 +9,9 @@ public static class DeepInfraModelProvider { private static Dictionary Models { get; set; } = new() { - { DeepInfraModelIds.DeepseekR1, ToMetadata("deepseek-ai/DeepSeek-R1",16000,7.5E-07,2.4E-06)}, + { DeepInfraModelIds.DeepseekR1, ToMetadata("deepseek-ai/DeepSeek-R1",32768,7.5E-07,2.4E-06)}, { DeepInfraModelIds.DeepseekR1DistillLlama70B, ToMetadata("deepseek-ai/DeepSeek-R1-Distill-Llama-70B",131072,2.3000000000000002E-07,6.9E-07)}, - { DeepInfraModelIds.DeepseekV3, ToMetadata("deepseek-ai/DeepSeek-V3",16000,4.9E-07,8.900000000000001E-07)}, + { DeepInfraModelIds.DeepseekV3, ToMetadata("deepseek-ai/DeepSeek-V3",32768,4.9E-07,8.900000000000001E-07)}, { DeepInfraModelIds.Llama3370BInstructTurbo, ToMetadata("meta-llama/Llama-3.3-70B-Instruct-Turbo",131072,1.2E-07,3E-07)}, { DeepInfraModelIds.Llama3370BInstruct, ToMetadata("meta-llama/Llama-3.3-70B-Instruct",131072,2.3000000000000002E-07,4.0000000000000003E-07)}, { DeepInfraModelIds.MistralSmall24BInstruct2501, ToMetadata("mistralai/Mistral-Small-24B-Instruct-2501",32768,7E-08,1.4E-07)}, @@ -38,7 +38,7 @@ public static class DeepInfraModelProvider { DeepInfraModelIds.Hermes3Llama31405B, ToMetadata("NousResearch/Hermes-3-Llama-3.1-405B",131072,8.000000000000001E-07,8.000000000000001E-07)}, { DeepInfraModelIds.SkyT132BPreview, ToMetadata("NovaSky-AI/Sky-T1-32B-Preview",32768,1.2E-07,1.8E-07)}, { DeepInfraModelIds.PhindCodellama34BV2, ToMetadata("Phind/Phind-CodeLlama-34B-v2",4096,6E-07,6E-07)}, - { DeepInfraModelIds.Qvq72BPreview, ToMetadata("Qwen/QVQ-72B-Preview",128000,2.5E-07,5E-07)}, + { DeepInfraModelIds.Qvq72BPreview, ToMetadata("Qwen/QVQ-72B-Preview",32000,2.5E-07,5E-07)}, { DeepInfraModelIds.Qwen272BInstruct, 
ToMetadata("Qwen/Qwen2-72B-Instruct",32768,3.5E-07,4.0000000000000003E-07)}, { DeepInfraModelIds.Qwen27BInstruct, ToMetadata("Qwen/Qwen2-7B-Instruct",32768,6E-08,6E-08)}, { DeepInfraModelIds.Qwen257BInstruct, ToMetadata("Qwen/Qwen2.5-7B-Instruct",32768,2E-08,5.0000000000000004E-08)},