Commit

test changes
vwxyzjn committed Aug 18, 2024
1 parent 7341071 commit cdc74b7
Showing 3 changed files with 12 additions and 10 deletions.
vllm/distributed/parallel_state.py: 2 additions & 1 deletion
@@ -219,7 +219,7 @@ def prev_rank(self):
     def graph_capture(
             self, graph_capture_context: Optional[GraphCaptureContext] = None):
         if graph_capture_context is None:
-            stream = torch.cuda.Stream()
+            stream = torch.cuda.Stream(self.device)
             graph_capture_context = GraphCaptureContext(stream)
         else:
             stream = graph_capture_context.stream
@@ -905,6 +905,7 @@ def initialize_model_parallel(
     # Get world size and rank. Ensure some consistencies.
     assert torch.distributed.is_initialized()
     world_size: int = torch.distributed.get_world_size()
+    # world_size: int = 1
     backend = backend or torch.distributed.get_backend(
         get_world_group().device_group)

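Note on the graph_capture change above: torch.cuda.Stream accepts an optional device argument, so passing the group's device pins the capture stream to that GPU instead of whatever the process-wide current device happens to be. A minimal sketch, not vLLM code (the cuda:0 device below is an assumption standing in for self.device):

import torch

device = torch.device("cuda:0")       # assumed device; stands in for self.device
stream = torch.cuda.Stream(device)    # side stream bound to that GPU
with torch.cuda.stream(stream):
    x = torch.ones(4, device=device)  # work enqueued on the side stream
stream.synchronize()                  # wait for the enqueued work to finish
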
vllm/worker/model_runner.py: 7 additions & 6 deletions
@@ -888,7 +888,7 @@ def __init__(
 
     def load_model(self) -> None:
         logger.info("Starting to load model %s...", self.model_config.model)
-        with CudaMemoryProfiler() as m:
+        with CudaMemoryProfiler(self.device) as m:
             self.model = get_model(model_config=self.model_config,
                                    device_config=self.device_config,
                                    load_config=self.load_config,
@@ -1206,12 +1206,12 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
 
         # Prepare dummy inputs. These will be reused for all batch sizes.
         max_batch_size = max(_BATCH_SIZES_TO_CAPTURE)
-        input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda()
-        input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
-        slot_mapping = torch.empty(max_batch_size, dtype=torch.long).cuda()
+        input_tokens = torch.zeros(max_batch_size, dtype=torch.long, device=self.device)
+        input_positions = torch.zeros(max_batch_size, dtype=torch.long, device=self.device)
+        slot_mapping = torch.empty(max_batch_size, dtype=torch.long, device=self.device)
         slot_mapping.fill_(_PAD_SLOT_ID)
-        seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
-        block_tables = torch.from_numpy(self.graph_block_tables).cuda()
+        seq_lens = torch.ones(max_batch_size, dtype=torch.int32, device=self.device)
+        block_tables = torch.from_numpy(self.graph_block_tables).to(self.device)
         intermediate_inputs = None
         if not get_pp_group().is_first_rank:
             intermediate_inputs = self.model.make_empty_intermediate_tensors(
@@ -1669,6 +1669,7 @@ def capture(
         torch.cuda.synchronize()
 
         # Capture the graph.
+        # breakpoint()
         self._graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream):
             output_hidden_or_intermediate_states = self.model(
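The capture_model edits above swap tensor.cuda() for an explicit device= argument (and .to(self.device) for the NumPy-backed block table). The distinction matters on multi-GPU workers: .cuda() always targets the current CUDA device, while device=/.to() target the intended GPU regardless of whether torch.cuda.set_device was called. A small sketch under the assumption that a second GPU, cuda:1, exists:

import numpy as np
import torch

device = torch.device("cuda:1")  # assumed second GPU, standing in for self.device

a = torch.zeros(8, dtype=torch.long).cuda()            # lands on the current device (cuda:0 unless set_device changed it)
b = torch.zeros(8, dtype=torch.long, device=device)    # lands on cuda:1 explicitly
c = torch.from_numpy(np.zeros((8, 4), dtype=np.int32)).to(device)  # CPU tensor copied to cuda:1

print(a.device, b.device, c.device)  # e.g. cuda:0 cuda:1 cuda:1
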
vllm/worker/worker.py: 3 additions & 3 deletions
@@ -132,11 +132,11 @@ def init_device(self) -> None:
             # This env var set by Ray causes exceptions with graph building.
             os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
             self.device = torch.device(f"cuda:{self.local_rank}")
-            torch.cuda.set_device(self.device)
+            # torch.cuda.set_device(self.device)
 
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             torch.cuda.empty_cache()
-            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+            self.init_gpu_memory = torch.cuda.mem_get_info(self.device)[0]
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
@@ -193,7 +193,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.
         torch.cuda.synchronize()
-        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info(self.device)
         # NOTE(woosuk): Here we assume that the other processes using the same
         # GPU did not change their memory usage during the profiling.
         peak_memory = self.init_gpu_memory - free_gpu_memory
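On the worker.py side, torch.cuda.mem_get_info also takes an optional device argument and returns (free, total) bytes for that specific GPU, which keeps the peak-memory bookkeeping tied to the worker's device now that torch.cuda.set_device is commented out. A rough sketch of the same free-memory delta the profiler computes (the cuda:0 device and the dummy workload are assumptions, not vLLM code):

import torch

device = torch.device("cuda:0")                        # assumed; stands in for self.device
free_before, total = torch.cuda.mem_get_info(device)   # bytes free / total on that GPU

workload = torch.randn(1024, 1024, device=device)      # stand-in for the profiling forward pass
torch.cuda.synchronize(device)

free_after, _ = torch.cuda.mem_get_info(device)
peak_memory = free_before - free_after                 # approximation of memory claimed so far
print(f"claimed ~{peak_memory / 1e6:.1f} MB of {total / 1e9:.1f} GB")
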
