
Commit

yapf formatting in vllm/core
jberkhahn committed Aug 6, 2024
1 parent 2e78fec commit c89c730
Showing 4 changed files with 33 additions and 32 deletions.
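
Every hunk below is a mechanical reflow: long comparison and boolean expressions are broken before the binary operator instead of after it, so the operator now opens the continuation line, and the dict comprehension in common.py is spread over its own lines. A minimal sketch of the two wrapping styles, using hypothetical stand-in values rather than code from the diff:

# Hypothetical stand-in values, only so the sketch runs on its own.
num_total_gpu_blocks = 1024
num_required_blocks = 16
watermark_blocks = 8

# Wrapping style before this commit: the operator ends the first line.
below_watermark_old_style = (num_total_gpu_blocks - num_required_blocks <
                             watermark_blocks)

# Wrapping style after this commit: the operator opens the continuation line.
below_watermark_new_style = (num_total_gpu_blocks - num_required_blocks
                             < watermark_blocks)

# The reflow changes layout only, never the result.
assert below_watermark_old_style == below_watermark_new_style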
7 changes: 4 additions & 3 deletions vllm/core/block/common.py
@@ -33,9 +33,10 @@ class RefCounter(RefCounterProtocol):
 
     def __init__(self, all_block_indices: Iterable[BlockId]):
         deduped = set(all_block_indices)
-        self._refcounts: Dict[BlockId,
-                              RefCount] = {index: 0
-                                           for index in deduped}
+        self._refcounts: Dict[BlockId, RefCount] = {
+            index: 0
+            for index in deduped
+        }
 
     def incr(self, block_id: BlockId) -> RefCount:
         assert block_id in self._refcounts
35 changes: 18 additions & 17 deletions vllm/core/block_manager_v1.py
@@ -284,8 +284,8 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
 
         # Use watermark to avoid frequent cache eviction.
-        if (self.num_total_gpu_blocks - num_required_blocks <
-                self.watermark_blocks):
+        if (self.num_total_gpu_blocks - num_required_blocks
+                < self.watermark_blocks):
             return AllocStatus.NEVER
         if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
             return AllocStatus.OK
@@ -301,21 +301,22 @@ def _allocate_sequence(self, \
 
         block_table: BlockTable = []
         if seq is not None:
-            for logical_idx in range(num_prompt_blocks):
-                if (self.block_sliding_window is not None
-                        and logical_idx >= self.block_sliding_window):
-                    block = block_table[logical_idx % self.block_sliding_window]
-                    # Set the reference counts of the token blocks.
-                    block.ref_count = ref_count
-                elif not is_encoder_decoder and self.enable_caching:
-                    block = self.gpu_allocator.allocate(
-                        seq.hash_of_block(logical_idx),
-                        seq.num_hashed_tokens_of_block(logical_idx))
-                else:
-                    block = self.gpu_allocator.allocate()
-                    # Set the reference counts of the token blocks.
-                    block.ref_count = ref_count
-                block_table.append(block)
+            for logical_idx in range(num_prompt_blocks):
+                if (self.block_sliding_window is not None
+                        and logical_idx >= self.block_sliding_window):
+                    block = block_table[logical_idx %
+                                        self.block_sliding_window]
+                    # Set the reference counts of the token blocks.
+                    block.ref_count = ref_count
+                elif not is_encoder_decoder and self.enable_caching:
+                    block = self.gpu_allocator.allocate(
+                        seq.hash_of_block(logical_idx),
+                        seq.num_hashed_tokens_of_block(logical_idx))
+                else:
+                    block = self.gpu_allocator.allocate()
+                    # Set the reference counts of the token blocks.
+                    block.ref_count = ref_count
+                block_table.append(block)
 
         return block_table
 
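For readers skimming the reflowed loop above: when a sliding window is configured, logical block indices past the window wrap around and reuse earlier physical blocks via the modulo. A toy sketch of just that index arithmetic, with hypothetical names and sizes rather than the vLLM API:

# Toy illustration of the sliding-window reuse in the loop above.
# All names and sizes here are hypothetical, not taken from vLLM.
block_sliding_window = 4
num_prompt_blocks = 10

block_table = []
for logical_idx in range(num_prompt_blocks):
    if logical_idx >= block_sliding_window:
        # Past the window: reuse an already-allocated physical block.
        block = block_table[logical_idx % block_sliding_window]
    else:
        # Inside the window: pretend-allocate a fresh block id.
        block = f"block-{logical_idx}"
    block_table.append(block)

# The table cycles through the first `block_sliding_window` blocks.
print(block_table)  # ['block-0', 'block-1', 'block-2', 'block-3', 'block-0', ...]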
4 changes: 2 additions & 2 deletions vllm/core/block_manager_v2.py
@@ -135,8 +135,8 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
             device=Device.GPU)
 
         # Use watermark to avoid frequent cache eviction.
-        if (self.num_total_gpu_blocks - num_required_blocks <
-                self.watermark_blocks):
+        if (self.num_total_gpu_blocks - num_required_blocks
+                < self.watermark_blocks):
             return AllocStatus.NEVER
         if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
             return AllocStatus.OK
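Both block-manager hunks reflow the same watermark test. The decision it encodes, as far as the surrounding context lines show, is roughly the following; this is a standalone sketch with a hypothetical enum and numbers, not the vLLM implementation (the final LATER branch is assumed):

from enum import Enum


class AllocStatus(Enum):
    OK = 1     # enough free blocks above the watermark, allocate now
    LATER = 2  # assumed: not enough free blocks yet, retry later
    NEVER = 3  # the request cannot fit even with an empty cache


def can_allocate(num_total_gpu_blocks: int, num_free_gpu_blocks: int,
                 num_required_blocks: int, watermark_blocks: int) -> AllocStatus:
    # Use watermark to avoid frequent cache eviction.
    if (num_total_gpu_blocks - num_required_blocks
            < watermark_blocks):
        return AllocStatus.NEVER
    if num_free_gpu_blocks - num_required_blocks >= watermark_blocks:
        return AllocStatus.OK
    return AllocStatus.LATER


# Hypothetical numbers: 1024 blocks total, 100 free, request needs 16.
print(can_allocate(1024, 100, 16, watermark_blocks=10))  # AllocStatus.OK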
19 changes: 9 additions & 10 deletions vllm/core/scheduler.py
@@ -800,8 +800,8 @@ def _schedule_default(self) -> SchedulerOutputs:
                 running_scheduled.swapped_out) == 0:
             swapped_in = self._schedule_swapped(budget, curr_loras)
 
-        assert (budget.num_batched_tokens <=
-                self.scheduler_config.max_num_batched_tokens)
+        assert (budget.num_batched_tokens
+                <= self.scheduler_config.max_num_batched_tokens)
         assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
 
         # Update waiting requests.
@@ -877,8 +877,8 @@ def _schedule_chunked_prefill(self):
                                           curr_loras,
                                           enable_chunking=True)
 
-        assert (budget.num_batched_tokens <=
-                self.scheduler_config.max_num_batched_tokens)
+        assert (budget.num_batched_tokens
+                <= self.scheduler_config.max_num_batched_tokens)
         assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
 
         # Update waiting requests.
@@ -983,8 +983,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
                 # NOTE: We use get_len instead of get_prompt_len because when
                 # a sequence is preempted, prefill includes previous generated
                 # output tokens.
-                if (token_chunk_size + seqs[0].data.get_num_computed_tokens() <
-                        seqs[0].data.get_len()):
+                if (token_chunk_size + seqs[0].data.get_num_computed_tokens()
+                        < seqs[0].data.get_len()):
                     do_sample = False
 
             # It assumes the scheduled_seq_groups is ordered by
@@ -1166,10 +1166,9 @@ def _passed_delay(self, now: float) -> bool:
         if self.scheduler_config.delay_factor > 0 and self.waiting:
             earliest_arrival_time = min(
                 [e.metrics.arrival_time for e in self.waiting])
-            passed_delay = (
-                (now - earliest_arrival_time) >
-                (self.scheduler_config.delay_factor * self.last_prompt_latency)
-                or not self.running)
+            passed_delay = ((now - earliest_arrival_time)
+                            > (self.scheduler_config.delay_factor *
+                               self.last_prompt_latency) or not self.running)
         else:
             passed_delay = True
         return passed_delay
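The last hunk packs the delay test into one dense expression, so a spelled-out version may help when reading it: a new prompt is scheduled only once the oldest waiting request has waited longer than delay_factor times the last prompt latency, or when nothing is currently running. A standalone sketch with hypothetical values, not the Scheduler method itself:

import time

# Hypothetical values, only to make the sketch executable.
delay_factor = 0.5
last_prompt_latency = 2.0                    # seconds spent on the previous prompt
earliest_arrival_time = time.time() - 3.0    # oldest waiting request arrived 3 s ago
running: list = []                           # no sequence groups currently running

now = time.time()
passed_delay = ((now - earliest_arrival_time)
                > (delay_factor * last_prompt_latency) or not running)

# 3 s waited > 0.5 * 2.0 s required, so scheduling may proceed.
print(passed_delay)  # True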
