[Frontend] Set server's maximum number of generated tokens using generation_config.json #12242

Merged: 34 commits merged on Jan 26, 2025

Changes from 29 commits

Commits (34):
5c85448
Adding max_new_tokens support to generation_config.json
mhendrey Jan 20, 2025
4ad6b45
Changed default_max_tokens to server_max_tokens
mhendrey Jan 20, 2025
95f9c97
Renamed default_max_tokens to server_max_tokens
mhendrey Jan 20, 2025
4786e56
Removed the float("inf") bug
mhendrey Jan 20, 2025
4980a73
Renamed default_max_tokens to server_max_tokens
mhendrey Jan 20, 2025
39d7d76
Rearranged lines to make the changes with existing as small as possible
mhendrey Jan 20, 2025
b6a24c4
Limit generated tokens by server's max_tokens setting when available
mhendrey Jan 20, 2025
aa7cff1
Changed syntax to pass format.sh tests
mhendrey Jan 20, 2025
2f6e43b
[Bugfix] Fix num_heads value for simple connector when tp enabled (#1…
ShangmingCai Jan 20, 2025
6baa0ea
[torch.compile] fix sym_tensor_indices (#12191)
youkaichao Jan 20, 2025
35b5948
Move linting to `pre-commit` (#11975)
hmellor Jan 20, 2025
0c2f332
[DOC] Fix typo in docstring and assert message (#12194)
terrytangyuan Jan 20, 2025
46249e5
[DOC] Add missing docstring in LLMEngine.add_request() (#12195)
terrytangyuan Jan 20, 2025
0b2e3de
[Bugfix] Fix incorrect types in LayerwiseProfileResults (#12196)
terrytangyuan Jan 20, 2025
090eca3
[Model] Add Qwen2 PRM model support (#12202)
Isotr0py Jan 20, 2025
5d36c1f
[Core] Interface for accessing model from `VllmRunner` (#10353)
DarkLight1337 Jan 20, 2025
df331a7
[misc] add placeholder format.sh (#12206)
youkaichao Jan 20, 2025
881964d
[CI/Build] Remove dummy CI steps (#12208)
DarkLight1337 Jan 20, 2025
5cc6a09
[CI/Build] Make pre-commit faster (#12212)
DarkLight1337 Jan 20, 2025
9f3d5a6
[Model] Upgrade Aria to transformers 4.48 (#12203)
DarkLight1337 Jan 20, 2025
957ca23
[misc] print a message to suggest how to bypass commit hooks (#12217)
youkaichao Jan 20, 2025
399d224
[core][bugfix] configure env var during import vllm (#12209)
youkaichao Jan 20, 2025
df06503
[V1] Remove `_get_cache_block_size` (#12214)
heheda12345 Jan 20, 2025
b89529b
[Misc] Pass `attention` to impl backend (#12218)
wangxiyuan Jan 20, 2025
a5d57f1
[Bugfix] Fix `HfExampleModels.find_hf_info` (#12223)
DarkLight1337 Jan 20, 2025
b1af379
[CI] Pass local python version explicitly to pre-commit mypy.sh (#12224)
heheda12345 Jan 20, 2025
0e3a719
Added tests to check max_tokens is properly set
mhendrey Jan 23, 2025
6867b37
Merge branch 'server_max_tokens'
mhendrey Jan 23, 2025
99243cf
Mucked up the rebasing. Fixing that now.
mhendrey Jan 23, 2025
1a15431
Reverting the serving_chat & serving_completion back and putting all …
mhendrey Jan 23, 2025
c10eb1f
Didn't quite revert back. Deleting empty line from both
mhendrey Jan 23, 2025
a3fc62b
Changed to using one-liner and edited engine arg for generation-config
mhendrey Jan 24, 2025
98949f6
Merge branch 'vllm-project:main' into main
mhendrey Jan 24, 2025
c71f429
Converted to a one-liner for taking minimum value & added to generati…
mhendrey Jan 24, 2025
110 changes: 110 additions & 0 deletions tests/entrypoints/openai/test_serving_chat.py
@@ -103,6 +103,116 @@ def test_serving_chat_should_set_correct_max_tokens():

assert mock_engine.generate.call_args.args[1].max_tokens == 10

# Setting server's max_tokens in the generation_config.json
# lower than context_window - prompt_tokens
mock_model_config = MockModelConfig()
mock_model_config.diff_sampling_param = {
"max_tokens": 10 # Setting server-side max_tokens limit
}

# Reinitialize the engine with new settings
mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False

# Initialize the serving chat
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)

# Test Case 1: No max_tokens specified in request
req = ChatCompletionRequest(
model=MODEL_NAME,
messages=[{
"role": "user",
"content": "what is 1+1?"
}],
guided_decoding_backend="outlines",
)

with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))

assert mock_engine.generate.call_args.args[1].max_tokens == 10

# Test Case 2: Request's max_tokens set higher than server accepts
req.max_tokens = 15

with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))

assert mock_engine.generate.call_args.args[1].max_tokens == 10

# Test Case 3: Request's max_tokens set lower than server accepts
req.max_tokens = 5

with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))

assert mock_engine.generate.call_args.args[1].max_tokens == 5

# Setting server's max_tokens in the generation_config.json
# higher than context_window - prompt_tokens
mock_model_config = MockModelConfig()
mock_model_config.diff_sampling_param = {
"max_tokens": 200 # Setting server-side max_tokens limit
}

# Reinitialize the engine with new settings
mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False

# Initialize the serving chat
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)

# Test case 1: No max_tokens specified, defaults to context_window
req = ChatCompletionRequest(
model=MODEL_NAME,
messages=[{
"role": "user",
"content": "what is 1+1?"
}],
guided_decoding_backend="outlines",
)

with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))

assert mock_engine.generate.call_args.args[1].max_tokens == 93

# Test Case 2: Request's max_tokens set higher than server accepts
req.max_tokens = 100

with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))

assert mock_engine.generate.call_args.args[1].max_tokens == 93

# Test Case 3: Request's max_tokens set lower than server accepts
req.max_tokens = 5

with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))

assert mock_engine.generate.call_args.args[1].max_tokens == 5


def test_serving_chat_could_load_correct_generation_config():

6 changes: 6 additions & 0 deletions vllm/config.py
@@ -918,12 +918,18 @@ def get_diff_sampling_param(self) -> Dict[str, Any]:
"top_k",
"top_p",
"min_p",
"max_new_tokens",
]
if any(p in config for p in available_params):
diff_sampling_param = {
p: config.get(p)
for p in available_params if config.get(p) is not None
}
+ # Huggingface definition of max_new_tokens is equivalent
+ # to vLLM's max_tokens
+ if "max_new_tokens" in diff_sampling_param:
+ diff_sampling_param["max_tokens"] = diff_sampling_param.pop(
+ "max_new_tokens")
else:
diff_sampling_param = {}
return diff_sampling_param
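For context, here is a minimal standalone sketch of the mapping this hunk adds. The generation_config.json contents and the helper name are illustrative assumptions, not the actual ModelConfig code, and the parameter list is abridged to the names visible in this hunk:

```python
from typing import Any, Dict

def extract_sampling_overrides(generation_config: Dict[str, Any]) -> Dict[str, Any]:
    """Illustrative stand-in for ModelConfig.get_diff_sampling_param (abridged)."""
    available_params = ["top_k", "top_p", "min_p", "max_new_tokens"]
    overrides = {
        p: generation_config[p]
        for p in available_params if generation_config.get(p) is not None
    }
    # Hugging Face's max_new_tokens maps onto vLLM's max_tokens.
    if "max_new_tokens" in overrides:
        overrides["max_tokens"] = overrides.pop("max_new_tokens")
    return overrides

# e.g. a generation_config.json that contains {"top_p": 0.9, "max_new_tokens": 10}
print(extract_sampling_overrides({"top_p": 0.9, "max_new_tokens": 10}))
# -> {'top_p': 0.9, 'max_tokens': 10}
```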
28 changes: 20 additions & 8 deletions vllm/entrypoints/openai/protocol.py
@@ -375,13 +375,16 @@ class ChatCompletionRequest(OpenAIBaseModel):

def to_beam_search_params(
self,
- default_max_tokens: int,
+ server_max_tokens: int,
default_sampling_params: Optional[dict] = None
) -> BeamSearchParams:
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
max_tokens = self.max_completion_tokens or self.max_tokens
if max_tokens is None:
- max_tokens = default_max_tokens
+ max_tokens = server_max_tokens
+ # Don't allow user to exceed server limit. Should this notify user?
+ else:
+ max_tokens = min(max_tokens, server_max_tokens)

if default_sampling_params is None:
default_sampling_params = {}
@@ -401,13 +404,16 @@ def to_beam_search_params(

def to_sampling_params(
self,
- default_max_tokens: int,
+ server_max_tokens: int,
logits_processor_pattern: Optional[str],
default_sampling_params: Optional[dict] = None) -> SamplingParams:
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
max_tokens = self.max_completion_tokens or self.max_tokens
if max_tokens is None:
- max_tokens = default_max_tokens
+ max_tokens = server_max_tokens
+ # Don't allow user to exceed server limit. Should this notify user?
+ else:
+ max_tokens = min(max_tokens, server_max_tokens)

if default_sampling_params is None:
default_sampling_params = {}
@@ -736,12 +742,15 @@ class CompletionRequest(OpenAIBaseModel):

def to_beam_search_params(
self,
- default_max_tokens: int,
+ server_max_tokens: int,
default_sampling_params: Optional[dict] = None
) -> BeamSearchParams:
max_tokens = self.max_tokens
if max_tokens is None:
- max_tokens = default_max_tokens
+ max_tokens = server_max_tokens
+ # Don't allow user to exceed server limit. Should this notify user?
+ else:
+ max_tokens = min(max_tokens, server_max_tokens)

if default_sampling_params is None:
default_sampling_params = {}
@@ -760,12 +769,15 @@ def to_beam_search_params(

def to_sampling_params(
self,
- default_max_tokens: int,
+ server_max_tokens: int,
logits_processor_pattern: Optional[str],
default_sampling_params: Optional[dict] = None) -> SamplingParams:
max_tokens = self.max_tokens
if max_tokens is None:
- max_tokens = default_max_tokens
+ max_tokens = server_max_tokens
+ # Don't allow user to exceed server limit. Should this notify user?
+ else:
+ max_tokens = min(max_tokens, server_max_tokens)

if default_sampling_params is None:
default_sampling_params = {}
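The clamping logic repeated in these four methods reduces to one rule. The helper below is a standalone sketch of that rule (not code from the diff), exercised with the same values the tests above use:

```python
from typing import Optional

def resolve_max_tokens(requested: Optional[int], server_max_tokens: int) -> int:
    """Clamp the request's max_tokens to the server-side limit."""
    if requested is None:
        # Nothing in the request: fall back to the server limit.
        return server_max_tokens
    # Never let the request exceed the server limit.
    return min(requested, server_max_tokens)

# With a server limit of 10, mirroring the test cases:
assert resolve_max_tokens(None, 10) == 10  # no max_tokens in the request
assert resolve_max_tokens(15, 10) == 10    # request asks for more than allowed
assert resolve_max_tokens(5, 10) == 5      # request stays under the limit
```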
13 changes: 10 additions & 3 deletions vllm/entrypoints/openai/serving_chat.py
@@ -187,17 +187,24 @@ async def create_chat_completion(
try:
for i, engine_prompt in enumerate(engine_prompts):
sampling_params: Union[SamplingParams, BeamSearchParams]
- default_max_tokens = self.max_model_len - len(
+ server_max_tokens = self.max_model_len - len(
engine_prompt["prompt_token_ids"])
# Build default sampling params
default_sampling_params = (
self.model_config.get_diff_sampling_param())

+ # Limit set by architecture or value in generation_config.json
+ if "max_tokens" in default_sampling_params:
+ server_max_tokens = min(
+ server_max_tokens,
+ default_sampling_params["max_tokens"])

if request.use_beam_search:
sampling_params = request.to_beam_search_params(
- default_max_tokens, default_sampling_params)
+ server_max_tokens, default_sampling_params)
else:
sampling_params = request.to_sampling_params(
- default_max_tokens,
+ server_max_tokens,
self.model_config.logits_processor_pattern,
default_sampling_params)
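Combined with the protocol change, the limit handed to to_sampling_params is the smaller of the remaining context window and any max_tokens coming from generation_config.json. The sketch below restates that computation outside the class; the numeric values are hypothetical and only chosen to line up with the test expectations:

```python
from typing import Any, Dict

def effective_server_max_tokens(max_model_len: int, prompt_len: int,
                                default_sampling_params: Dict[str, Any]) -> int:
    # What the context window still leaves room for.
    server_max_tokens = max_model_len - prompt_len
    # Tighten further if generation_config.json supplied a max_tokens value.
    if "max_tokens" in default_sampling_params:
        server_max_tokens = min(server_max_tokens,
                                default_sampling_params["max_tokens"])
    return server_max_tokens

# Hypothetical numbers in the spirit of the tests: a 100-token context window
# and a 7-token prompt leave 93 tokens, so a generation_config.json limit of
# 200 has no effect, while a limit of 10 becomes the binding constraint.
assert effective_server_max_tokens(100, 7, {"max_tokens": 200}) == 93
assert effective_server_max_tokens(100, 7, {"max_tokens": 10}) == 10
assert effective_server_max_tokens(100, 7, {}) == 93
```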

13 changes: 10 additions & 3 deletions vllm/entrypoints/openai/serving_completion.py
@@ -115,17 +115,24 @@ async def create_completion(
try:
for i, engine_prompt in enumerate(engine_prompts):
sampling_params: Union[SamplingParams, BeamSearchParams]
- default_max_tokens = self.max_model_len - len(
+ server_max_tokens = self.max_model_len - len(
engine_prompt["prompt_token_ids"])
# Build default sampling params
default_sampling_params = (
self.model_config.get_diff_sampling_param())

+ # Limit set by architecture or value in generation_config.json
+ if "max_tokens" in default_sampling_params:
+ server_max_tokens = min(
+ server_max_tokens,
+ default_sampling_params["max_tokens"])
Reviewer comment: Since default_sampling_params is also passed to request.to_beam_search_params and request.to_sampling_params, let's handle this inside those methods instead.


if request.use_beam_search:
sampling_params = request.to_beam_search_params(
- default_max_tokens, default_sampling_params)
+ server_max_tokens, default_sampling_params)
else:
sampling_params = request.to_sampling_params(
- default_max_tokens,
+ server_max_tokens,
self.model_config.logits_processor_pattern,
default_sampling_params)
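As a rough end-to-end illustration of the behavior this PR enables, assuming a vLLM OpenAI-compatible server on localhost:8000 serving a model whose generation_config.json sets max_new_tokens to 10 (the model name, port, and that config value are all hypothetical):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="my-model",  # placeholder model name
    messages=[{"role": "user", "content": "what is 1+1?"}],
    max_tokens=100,    # deliberately above the assumed server-side limit of 10
)

# With this PR the server clamps generation to the generation_config.json
# limit, so no more than 10 completion tokens should come back.
print(resp.usage.completion_tokens)
```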
