diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 642ef3c9655b8..85d844f3d3f55 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -216,6 +216,11 @@ See [this page](#generative-models) for more information on how to use generativ
   - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.
   - ✅︎
   - ✅︎
+* - `InternLM3ForCausalLM`
+  - InternLM3
+  - `internlm/internlm3-8b-instruct`, etc.
+  - ✅︎
+  - ✅︎
 * - `JAISLMHeadModel`
   - Jais
   - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.
diff --git a/tests/models/registry.py b/tests/models/registry.py
index d079725b2f78d..b0f0f9767a90f 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -85,6 +85,8 @@ class _HfExamplesInfo:
                                             trust_remote_code=True),
     "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B",
                                               trust_remote_code=True),
+    "InternLM3ForCausalLM": _HfExamplesInfo("internlm/internlm3-8b-instruct",
+                                            trust_remote_code=True),
     "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
     "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"),
     "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 17b0fbb777e8e..ace3fdc25942f 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -97,20 +97,19 @@ def forward(self, x):
 
 class LlamaAttention(nn.Module):
 
-    def __init__(
-        self,
-        config: LlamaConfig,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[Dict[str, Any]] = None,
-        max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
-        bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self,
+                 config: LlamaConfig,
+                 hidden_size: int,
+                 num_heads: int,
+                 num_kv_heads: int,
+                 rope_theta: float = 10000,
+                 rope_scaling: Optional[Dict[str, Any]] = None,
+                 max_position_embeddings: int = 8192,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 bias: bool = False,
+                 cache_config: Optional[CacheConfig] = None,
+                 prefix: str = "",
+                 bias_o_proj: bool = False) -> None:
         super().__init__()
         layer_idx = extract_layer_index(prefix)
         self.hidden_size = hidden_size
@@ -150,7 +149,7 @@ def __init__(
         self.o_proj = RowParallelLinear(
             input_size=self.total_num_heads * self.head_dim,
             output_size=hidden_size,
-            bias=bias,
+            bias=bias_o_proj,
             quant_config=quant_config,
             prefix=f"{prefix}.o_proj",
         )
@@ -231,6 +230,11 @@ def __init__(
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
             config, "bias", False)
+        bias_o_proj = attention_bias
+        # support internlm/internlm3-8b with qkv_bias
+        if hasattr(config, 'qkv_bias'):
+            attention_bias = config.qkv_bias
+
         self.self_attn = LlamaAttention(
             config=config,
             hidden_size=self.hidden_size,
@@ -242,6 +246,7 @@ def __init__(
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
             bias=attention_bias,
+            bias_o_proj=bias_o_proj,
             cache_config=cache_config,
             prefix=f"{prefix}.self_attn",
         )
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index a7286a9203f67..a71f7f7029c7d 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -60,6 +60,7 @@
     "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
     "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
     "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
+    "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"),
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),