[DO NOT MERGE] Hao integration #31

Closed · wants to merge 9 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -4,3 +4,4 @@
 *.eggs/
 *.so
 build/
+.idea
8 changes: 8 additions & 0 deletions cacheflow/master/scheduler.py
@@ -34,12 +34,14 @@ def __init__(
         num_gpu_blocks: int,
         num_cpu_blocks: int,
         max_num_batched_tokens: int,
+        tokenizer,
     ) -> None:
         self.controllers = controllers
         self.block_size = block_size
         self.num_gpu_blocks = num_gpu_blocks
         self.num_cpu_blocks = num_cpu_blocks
         self.max_num_batched_tokens = max_num_batched_tokens
+        self.tokenizer = tokenizer
 
         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name='fcfs')
@@ -233,6 +235,7 @@ def post_step(
             group_id = seq_group.group_id
             self.num_steps[group_id] += 1
             stop_token_ids = self.sampling_params[group_id].stop_token_ids
+            stop_str = self.sampling_params[group_id].stop_str
 
             # Process beam search results before processing the next tokens.
             for seq in seq_group.seqs:
@@ -263,6 +266,11 @@ def post_step(
                     self._free_seq(seq)
                     continue
 
+                if stop_str is not None:
+                    if self.tokenizer.decode(seq.get_token_ids(), skip_special_tokens=True).endswith(stop_str):
+                        self._free_seq(seq)
+                        continue
+
                 # Check if the sequence has reached the maximum number of steps.
                 max_num_steps = self.sampling_params[group_id].max_num_steps
                 if self.num_steps[group_id] == max_num_steps:
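The new check decodes the entire token history every step and tests whether the text ends with the stop string. A minimal standalone sketch of the same idea (the helper name and the `gpt2` tokenizer are illustrative, not part of this PR):

```python
# Standalone sketch of the stop-string check above; the helper name and the
# tokenizer choice are illustrative only.
from transformers import AutoTokenizer

def hits_stop_str(tokenizer, token_ids, stop_str):
    if stop_str is None:
        return False
    text = tokenizer.decode(token_ids, skip_special_tokens=True)
    return text.endswith(stop_str)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
ids = tokenizer.encode("Assistant: the answer is 42.###")
print(hits_stop_str(tokenizer, ids, "###"))  # True -> the scheduler frees the sequence
```

Because the full sequence is decoded on every step, the per-step cost grows with sequence length; caching the decoded text or decoding only a short tail would avoid the repeated work.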
5 changes: 4 additions & 1 deletion cacheflow/master/server.py
@@ -3,6 +3,7 @@
 import random
 
 import ray
+from transformers import AutoTokenizer
 
 from cacheflow.master.scheduler import Scheduler
 from cacheflow.models import get_memory_analyzer
@@ -33,9 +34,10 @@ def __init__(
         self.num_nodes = num_nodes
         self.num_devices_per_node = num_devices_per_node
         self.world_size = pipeline_parallel_size * tensor_parallel_size
 
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
         self.memory_analyzer = get_memory_analyzer(
-            model_name=model,
+            model_path=model_path,
             block_size=block_size,
             dtype=dtype,
             gpu_memory=gpu_memory,
@@ -77,6 +79,7 @@ def __init__(
             num_gpu_blocks=self.num_gpu_blocks,
             num_cpu_blocks=self.num_cpu_blocks,
             max_num_batched_tokens=max_num_batched_tokens,
+            tokenizer=self.tokenizer
         )
         # Connect the controllers.
         for i in range(len(self.controllers) - 1):
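The server now owns the tokenizer (loaded as a slow tokenizer from the local checkpoint directory) and hands it to the scheduler, and the memory analyzer is keyed by the same `model_path`. A small sketch of what gets resolved from that directory (the path is a placeholder, not one used by this PR):

```python
# Sketch only: what the server now resolves from a local checkpoint directory.
from transformers import AutoConfig, AutoTokenizer

model_path = "/path/to/vicuna-13b"  # contains config.json, tokenizer files, and weights
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)  # slow tokenizer, as in the diff
config = AutoConfig.from_pretrained(model_path)
print(config.model_type, type(tokenizer).__name__)
```

`use_fast=False` is presumably chosen because fast-tokenizer conversion for LLaMA-family checkpoints was slow or unreliable at the time; the slow SentencePiece tokenizer is the safer default for these models.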
4 changes: 3 additions & 1 deletion cacheflow/models/memory_analyzer.py
@@ -145,20 +145,22 @@ class LlamaMemoryAnalyzer(CacheFlowMemoryAnalyzer):
     def __init__(
         self,
         model_name: str,
+        model_path: str,
         block_size: int,
         dtype: torch.dtype,
         gpu_memory: int,
         cpu_memory: int,
         tensor_parallel_size: int,
     ) -> None:
         self.model_name = model_name
+        self.model_path = model_path
         self.block_size = block_size
         self.dtype = dtype
         self.gpu_memory = gpu_memory
         self.cpu_memory = cpu_memory
         self.tensor_parallel_size = tensor_parallel_size
 
-        config = AutoConfig.from_pretrained(model_name)
+        config = AutoConfig.from_pretrained(model_path)
         self.num_layers = config.num_hidden_layers
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
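For reference, the config fields read here (`num_hidden_layers`, `hidden_size`, `num_attention_heads`) are what the KV-cache sizing is based on. A rough, illustrative calculation using the usual 2 × block_size × num_layers × hidden_size × dtype_size formula (this is not the actual `LlamaMemoryAnalyzer` arithmetic, and the hub id is only an example — any local path with a LLaMA config works):

```python
# Illustrative KV-cache sizing from the same config fields; not the actual
# LlamaMemoryAnalyzer code.
import torch
from transformers import AutoConfig

config = AutoConfig.from_pretrained("huggyllama/llama-7b")
dtype_size = torch.finfo(torch.float16).bits // 8  # 2 bytes for fp16
block_size = 16
# 2 = key + value; one block caches `block_size` tokens across all layers.
block_bytes = 2 * block_size * config.num_hidden_layers * config.hidden_size * dtype_size
print(f"{block_bytes / 1024**2:.1f} MiB per block")  # ~8 MiB for LLaMA-7B at fp16
```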
19 changes: 15 additions & 4 deletions cacheflow/models/model_utils.py
@@ -3,7 +3,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
-from transformers import AutoConfig
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
 from cacheflow.models.memory_analyzer import CacheFlowMemoryAnalyzer
 from cacheflow.models.memory_analyzer import LlamaMemoryAnalyzer
@@ -16,26 +16,36 @@
 _MODELS = {
     'llama': LlamaForCausalLM,
     'opt': OPTForCausalLM,
+    'vicuna': LlamaForCausalLM,
+    'koala': LlamaForCausalLM,
+    'alpaca': LlamaForCausalLM,
 }
 
 _MEMORY_ANALYZERS = {
     'llama': LlamaMemoryAnalyzer,
     'opt': OPTMemoryAnalyzer,
+    'vicuna': LlamaMemoryAnalyzer,
+    'koala': LlamaMemoryAnalyzer,
+    'alpaca': LlamaMemoryAnalyzer,
 }
 
 
 def get_model(
     model_name: str,
+    model_path: str,
     dtype: Union[torch.dtype, str],
     path: str,
 ) -> nn.Module:
     torch_dtype = get_torch_dtype(dtype)
     torch.set_default_dtype(torch_dtype)
-    config = AutoConfig.from_pretrained(model_name)
+
+    # config = AutoConfig.from_pretrained(model_name)
+    config = AutoConfig.from_pretrained(model_path)
     for model_class_name, model_class in _MODELS.items():
         if model_class_name in model_name:
             # Download model weights if it's not cached.
-            weights_dir = model_class.get_weights(model_name, path=path)
+            # weights_dir = model_class.get_weights(model_name, path=path)
+            weights_dir = model_class.get_weights(model_path, path=path)
             # Create a model instance.
             model = model_class(config)
             # Load the weights from the cached or downloaded files.
@@ -46,6 +56,7 @@ def get_model(
 
 def get_memory_analyzer(
     model_name: str,
+    model_path: str,
     block_size: int,
     dtype: Union[torch.dtype, str],
     gpu_memory: int,
@@ -56,6 +67,6 @@
     for model_class, memory_analyzer in _MEMORY_ANALYZERS.items():
         if model_class in model_name:
             return memory_analyzer(
-                model_name, block_size, torch_dtype, gpu_memory, cpu_memory,
+                model_name, model_path, block_size, torch_dtype, gpu_memory, cpu_memory,
                 tensor_parallel_size)
     raise ValueError(f'Unsupported model name: {model_name}')
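The new entries work because `get_model` and `get_memory_analyzer` dispatch by substring match against `model_name`, so e.g. `'vicuna-13b'` hits the `'vicuna'` key and reuses the LLaMA classes. A tiny sketch of that dispatch (stub classes stand in for the real implementations):

```python
# Sketch of the substring dispatch in get_model(); stub classes stand in for
# the real model implementations.
class LlamaForCausalLM: ...
class OPTForCausalLM: ...

_MODELS = {
    'llama': LlamaForCausalLM,
    'opt': OPTForCausalLM,
    'vicuna': LlamaForCausalLM,
    'koala': LlamaForCausalLM,
    'alpaca': LlamaForCausalLM,
}

def resolve_model_class(model_name: str):
    for key, cls in _MODELS.items():
        if key in model_name:  # substring match, as in get_model()
            return cls
    raise ValueError(f'Unsupported model name: {model_name}')

print(resolve_model_class('vicuna-13b').__name__)  # LlamaForCausalLM
```

Since Vicuna, Koala, and Alpaca are LLaMA fine-tunes, they reuse `LlamaForCausalLM` and `LlamaMemoryAnalyzer`; only the checkpoint path and tokenizer differ.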
2 changes: 2 additions & 0 deletions cacheflow/sampling_params.py
@@ -13,6 +13,7 @@ def __init__(
         max_num_steps: int,
         num_logprobs: int,
         context_window_size: Optional[int],
+        stop_str=None
     ) -> None:
         if n < 1:
             raise ValueError(f'n must be at least 1, got {n}.')
@@ -59,6 +60,7 @@ def __init__(
         self.max_num_steps = max_num_steps
         self.num_logprobs = num_logprobs
         self.context_window_size = context_window_size
+        self.stop_str = stop_str
 
     def __repr__(self) -> str:
         return (f'SamplingParams(n={self.n}, '
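`stop_str` complements the existing `stop_token_ids`: the textual turn separators used by chat fine-tunes typically span several token ids, so checking the last generated token id alone cannot catch them, while a suffix check on the decoded text can. A toy illustration (the `###` separator is an example, not a default set anywhere in this PR):

```python
# Toy illustration of why a stop *string* is needed in addition to stop token
# ids; the separator value is an example, not one set by this PR.
generated = "Sure, the capital of France is Paris.###"
stop_str = "###"
if stop_str is not None and generated.endswith(stop_str):
    print("stop: the scheduler would free this sequence")
```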
3 changes: 2 additions & 1 deletion cacheflow/worker/worker.py
@@ -43,8 +43,9 @@ def __init__(
         set_random_seed(seed)
 
         # Initialize the model.
-        self.model, self.dtype = get_model(model_name, dtype=dtype, path=model_path)
+        self.model, self.dtype = get_model(model_name, model_path, dtype=dtype, path=model_path)
         self.model = self.model.cuda()
+        print("loading model done...")
         tensor_model_parallel_world_size = (
             get_tensor_model_parallel_world_size())
         initialize_all_reduce_launcher(
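For completeness, a hypothetical call site mirroring the updated `get_model` signature, as used by the worker above; the paths are placeholders. `model_path` points at the local checkpoint, while `path` remains the cache directory for converted weights:

```python
# Hypothetical usage of the updated signature; paths are placeholders.
import torch
from cacheflow.models.model_utils import get_model

model, dtype = get_model(
    'vicuna-13b',             # matched against the _MODELS keys by substring
    '/path/to/vicuna-13b',    # local directory with config.json and weights
    dtype=torch.float16,
    path='/path/to/weight-cache',
)
```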