Commit ec55021

refactor

Signed-off-by: Chen Zhang <[email protected]>

1 parent: 4f81b65

6 files changed: +350 -164 lines changed


tests/v1/e2e/test_correctness_sliding_window.py
Lines changed: 2 additions & 2 deletions

@@ -17,15 +17,15 @@ class TestConfig:
 
 model_config = {
     "bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)),
-    "google/gemma-2-2b-it": TestConfig(4096, (400, 800)),
+    "google/gemma-3-1b-it": TestConfig(4096, (400, 800)),
 }
 
 
 @pytest.mark.parametrize(
     "model",
     [
         "bigcode/starcoder2-3b",  # sliding window only
-        "google/gemma-2-2b-it",  # sliding window + full attention
+        "google/gemma-3-1b-it",  # sliding window + full attention
     ])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])

vllm/v1/core/block_pool.py
Lines changed: 53 additions & 42 deletions

@@ -7,7 +7,7 @@
                                         BlockStored, KVCacheEvent)
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         GroupedKVCacheBlock, KVCacheBlock,
+                                         KVCacheBlock, KVCacheBlockBundle,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens)
 from vllm.v1.request import Request
@@ -49,19 +49,20 @@ def __init__(
         # enabled).
         self.free_block_queue = FreeKVCacheBlockQueue(self.blocks)
 
-        # TODO: update comment
-        # {manager_id: {block_hash: {block ID: GroupedKVCacheBlock}}}. A cached
-        # block is a full block with a block hash that can be used for prefix
-        # caching.
+        # {manager_id: {block_hash: {block ID: KVCacheBlockBundle}}}.
+        # A cached block is a full block with a block hash that can be used for
+        # prefix caching.
         # The cached block may be used by running requests or in the
         # free_block_queue that could potentially be evicted.
+        # Use KVCacheBlockBundle to make sure different kv cache groups managed
+        # by the same single_type_manager are cached & evicted together.
         # NOTE: We currently don't de-duplicate the blocks in the cache,
         # meaning that if a block becomes full and is cached, we don't check
         # if there is already an identical block in the cache. This is because
         # we want to make sure the allocated block IDs won't change so that
         # block tables are append-only.
         self.cached_block_hash_to_block: list[dict[BlockHashType, dict[
-            int, GroupedKVCacheBlock]]] = [
+            int, KVCacheBlockBundle]]] = [
                 defaultdict(dict) for _ in range(num_single_type_managers)
             ]
         # To represent a placeholder block with block_id=0.
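
The KVCacheBlockBundle type used throughout this file is defined in vllm/v1/core/kv_cache_utils.py, which is not part of the hunks shown here. Below is a minimal, hypothetical sketch of the interface block_pool.py relies on (from_kv_cache_blocks, master_block_id, block_hash_is_none, init_block_hash, reset_hash, incr_ref/decr_ref); the field layout and method bodies are assumptions for illustration only, not the actual vllm implementation.

# Hypothetical sketch of the KVCacheBlockBundle interface used by block_pool.py.
# Field names and method bodies are assumptions; the real class lives in
# vllm/v1/core/kv_cache_utils.py and may differ.
from dataclasses import dataclass
from typing import Optional


@dataclass
class KVCacheBlock:
    """Simplified stand-in for one physical KV cache block."""
    block_id: int
    block_hash: Optional[int] = None
    manager_id: Optional[int] = None


@dataclass
class KVCacheBlockBundle:
    """Blocks from different kv cache groups that share one single_type_manager.

    They are allocated, cached, and evicted together, so the block hash and the
    reference count are tracked once per bundle instead of once per block.
    """
    blocks: tuple[KVCacheBlock, ...]
    block_hash: Optional[int] = None
    manager_id: Optional[int] = None
    ref_cnt: int = 0

    @classmethod
    def from_kv_cache_blocks(
            cls, blocks: tuple[KVCacheBlock, ...]) -> "KVCacheBlockBundle":
        return cls(blocks=blocks)

    @property
    def master_block_id(self) -> int:
        # The first block's ID serves as the key in cached_block_hash_to_block.
        return self.blocks[0].block_id

    def block_hash_is_none(self) -> bool:
        return (self.block_hash is None
                and all(b.block_hash is None for b in self.blocks))

    def init_block_hash(self, block_hash: int, manager_id: int) -> None:
        # Mirrors the per-block loop removed from cache_full_blocks below.
        self.block_hash = block_hash
        self.manager_id = manager_id
        for b in self.blocks:
            b.block_hash = block_hash
            b.manager_id = manager_id

    def reset_hash(self) -> None:
        self.block_hash = None
        for b in self.blocks:
            b.block_hash = None

    def incr_ref(self) -> None:
        self.ref_cnt += 1

    def decr_ref(self) -> None:
        self.ref_cnt -= 1

With the hash and the reference count carried by the bundle, the per-block loops in cache_full_blocks, touch, and free_blocks collapse into single calls, which is what the remaining hunks do.
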
@@ -74,7 +75,7 @@ def __init__(
         self.kv_event_queue: list[KVCacheEvent] = []
 
     def get_cached_block(self, block_hash: BlockHashType,
-                         manager_id: int) -> Optional[GroupedKVCacheBlock]:
+                         manager_id: int) -> Optional[KVCacheBlockBundle]:
         """Get a cached block by the block hash, or None if cache miss.
         If there are duplicated blocks, we return the first block in the cache.
 
@@ -95,7 +96,7 @@ def get_cached_block(self, block_hash: BlockHashType,
     def cache_full_blocks(
         self,
         request: Request,
-        blocks: list[GroupedKVCacheBlock],
+        blocks: list[KVCacheBlockBundle],
         block_hashes: list[BlockHashType],
         num_cached_blocks: int,
         num_full_blocks: int,
@@ -141,15 +142,14 @@ def cache_full_blocks(
         new_hashes: Optional[list[int]] = ([] if self.enable_kv_cache_events
                                            else None)
         for i, blk in enumerate(new_full_blocks):
-            assert all(b.block_hash is None for b in blk.blocks)
-            assert blk.block_hash is None
+            assert blk.block_hash_is_none()
 
             if i < len(new_block_hashes):
                 # The block hash may already be computed in
                 # "get_computed_blocks" if the tokens are not generated by
                 # this request (either the prompt tokens or the previously
-                # generated tokens with preemption).
-                # TODO: or other groups with the same block_size
+                # generated tokens with preemption), or by other
+                # single_type_managers with the same block_size.
                 # In this case we simply reuse the block hash.
                 block_hash = new_block_hashes[i]
             else:
@@ -177,10 +177,7 @@ def cache_full_blocks(
             block_hashes.append(block_hash)
 
             # Update and added the full block to the cache.
-            for b in blk.blocks:
-                b.block_hash = block_hash
-                b.manager_id = manager_id
-            blk.block_hash = block_hash
+            blk.init_block_hash(block_hash, manager_id)
             self.cached_block_hash_to_block[manager_id][block_hash][
                 blk.master_block_id] = blk
             if new_hashes is not None:
@@ -200,37 +197,46 @@ def cache_full_blocks(
                     if request.lora_request else None,
                 ))
 
-    def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]:
+    def get_new_blocks(self, num_block_bundle: int,
+                       bundle_size: int) -> list[KVCacheBlockBundle]:
         """Get new blocks from the free block pool.
 
         Note that we do not check block cache in this function.
 
         Args:
-            num_blocks: The number of blocks to allocate.
+            num_block_bundle: The number of KVCacheBlockBundle to allocate.
+            bundle_size: The number of blocks in each KVCacheBlockBundle.
 
         Returns:
             A list of new block.
         """
-        if num_blocks > self.get_num_free_blocks():
+        num_total_blocks = num_block_bundle * bundle_size
+        if num_total_blocks > self.get_num_free_blocks():
             raise ValueError(
-                f"Cannot get {num_blocks} free blocks from the pool")
+                f"Cannot get {num_total_blocks} free blocks from the pool")
 
-        ret: list[KVCacheBlock] = []
+        flat_new_blocks: list[KVCacheBlock] = []
         idx = 0
-        while idx < num_blocks:
+        while idx < num_total_blocks:
             # First allocate blocks.
             curr_block = self.free_block_queue.popleft()
-            assert curr_block.ref_cnt == 0
 
             # If the block is cached, evict it.
            if self.enable_caching:
                 self._maybe_evict_cached_block(curr_block)
 
-            curr_block.incr_ref()
-            ret.append(curr_block)
+            assert curr_block.block_hash is None
+            flat_new_blocks.append(curr_block)
             idx += 1
 
-        return ret
+        new_blocks = []
+        for i in range(num_block_bundle):
+            blocks = flat_new_blocks[i * bundle_size:(i + 1) * bundle_size]
+            block_bundle = KVCacheBlockBundle.from_kv_cache_blocks(
+                tuple(blocks))
+            block_bundle.incr_ref()
+            new_blocks.append(block_bundle)
+        return new_blocks
 
     def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
         """
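
The allocation change in get_new_blocks boils down to popping num_block_bundle * bundle_size blocks from the free queue and then slicing the flat list with a fixed stride. Here is a self-contained sketch of just that chunking step, using plain integers as stand-in block IDs and a dict as a stand-in bundle; the names are illustrative, not vllm's actual types.

# Self-contained sketch of the chunking arithmetic in the new get_new_blocks.
from collections import deque


def get_new_block_bundles(free_queue: deque, num_block_bundle: int,
                          bundle_size: int) -> list[dict]:
    num_total_blocks = num_block_bundle * bundle_size
    if num_total_blocks > len(free_queue):
        raise ValueError(
            f"Cannot get {num_total_blocks} free blocks from the pool")

    # Pop the flat list of blocks first: num_block_bundle * bundle_size of them.
    flat_new_blocks = [free_queue.popleft() for _ in range(num_total_blocks)]

    # Then group them with a fixed stride: bundle i holds the slice
    # [i * bundle_size, (i + 1) * bundle_size) and carries one reference
    # count for the whole bundle.
    new_blocks = []
    for i in range(num_block_bundle):
        blocks = tuple(flat_new_blocks[i * bundle_size:(i + 1) * bundle_size])
        new_blocks.append({"blocks": blocks, "ref_cnt": 1})
    return new_blocks


free_queue = deque(range(10))          # block IDs 0..9 are free
bundles = get_new_block_bundles(free_queue, num_block_bundle=2, bundle_size=3)
assert [b["blocks"] for b in bundles] == [(0, 1, 2), (3, 4, 5)]
assert len(free_queue) == 4            # 6 of the 10 blocks were consumed
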
@@ -249,8 +255,11 @@ def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
                 manager_id]:
             cached_blocks = (
                 self.cached_block_hash_to_block[manager_id][block_hash])
-            assert block.block_id in cached_blocks
-            cached_blocks[block.block_id].reset_hash()
+            cached_block = cached_blocks[block.block_id]
+            # TODO: add notes
+            assert cached_block.master_block_id == block.block_id
+            assert cached_block.ref_cnt == 0
+            cached_block.reset_hash()
             del cached_blocks[block.block_id]
             if len(cached_blocks) == 0:
                 del self.cached_block_hash_to_block[manager_id][block_hash]
@@ -260,26 +269,26 @@ def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
             return True
         return False
 
-    def touch(self, blocks: list[list[GroupedKVCacheBlock]]) -> None:
+    def touch(self, blocks: list[list[KVCacheBlockBundle]]) -> None:
         """Touch a block increases its reference count by 1, and may remove
         the block from the free queue. This is used when a block is hit by
         another request with the same prefix.
 
         Args:
             blocks: A list of blocks to touch.
         """
-        # TODO: check whether we should manage ref_cnt at grouped_block level
         for blocks_one_manager in blocks:
-            for grouped_block in blocks_one_manager:
-                for block in grouped_block.blocks:
-                    # ref_cnt=0 means this block is in the free list (i.e.
-                    # eviction candidate), so remove it.
-                    if block.ref_cnt == 0 and block != self.null_block:
-                        self.free_block_queue.remove(block)
-                    block.incr_ref()
+            for block_bundle in blocks_one_manager:
+                if block_bundle.ref_cnt == 0:
+                    # ref_cnt=0 means the blocks are in the free list (i.e.
+                    # eviction candidate), so remove them.
+                    for block in block_bundle.blocks:
+                        if block != self.null_block:
+                            self.free_block_queue.remove(block)
+                block_bundle.incr_ref()
 
     def free_blocks(self,
-                    ordered_blocks: Iterable[GroupedKVCacheBlock]) -> None:
+                    ordered_blocks: Iterable[KVCacheBlockBundle]) -> None:
         """Free a list of blocks. The blocks should be ordered by their
         eviction priority, where the first block will be evicted first.
 
@@ -288,11 +297,13 @@ def free_blocks(self,
                priority.
         """
         # TODO: make sure blocks in the first group are evicted first
-        for blk in ordered_blocks:
-            for block in blk.blocks:
-                block.decr_ref()
+        for block_bundle in ordered_blocks:
+            block_bundle.decr_ref()
+            if block_bundle.ref_cnt > 0:
+                continue
+            for block in block_bundle.blocks:
                 # null_block should not be added to the free list.
-                if block.ref_cnt == 0 and block != self.null_block:
+                if block != self.null_block:
                     self.free_block_queue.append(block)
 
     def reset_prefix_cache(self) -> bool:
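
touch and free_blocks now manage the reference count at the bundle level: blocks re-enter the free queue only once the whole bundle's ref_cnt reaches zero. Below is a self-contained sketch of that behavior with simplified stand-in types; the class name, the dict-free Bundle, and the deque-based free queue are assumptions for illustration, not vllm's actual implementation.

# Self-contained sketch of bundle-level reference counting as done by
# touch() and free_blocks() in this commit.
from collections import deque
from dataclasses import dataclass


@dataclass
class Bundle:
    blocks: tuple[int, ...]   # block IDs, stand-ins for KVCacheBlock
    ref_cnt: int = 0


def touch(free_queue: deque, bundles: list) -> None:
    for bundle in bundles:
        if bundle.ref_cnt == 0:
            # ref_cnt=0 means the blocks are eviction candidates; pull them
            # out of the free queue before handing them to another request.
            for block in bundle.blocks:
                free_queue.remove(block)
        bundle.ref_cnt += 1


def free_blocks(free_queue: deque, ordered_bundles: list) -> None:
    for bundle in ordered_bundles:
        bundle.ref_cnt -= 1
        if bundle.ref_cnt > 0:
            continue   # still referenced by another request; keep it out
        for block in bundle.blocks:
            free_queue.append(block)


free_queue = deque()
bundle = Bundle(blocks=(7, 8), ref_cnt=1)   # allocated by request A
touch(free_queue, [bundle])                 # request B hits the same prefix
free_blocks(free_queue, [bundle])           # A finishes: bundle still in use
assert list(free_queue) == []
free_blocks(free_queue, [bundle])           # B finishes: bundle is evictable
assert list(free_queue) == [7, 8]
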
