77                                        BlockStored , KVCacheEvent )
88from  vllm .logger  import  init_logger 
99from  vllm .v1 .core .kv_cache_utils  import  (BlockHashType , FreeKVCacheBlockQueue ,
10-                                          GroupedKVCacheBlock ,  KVCacheBlock ,
10+                                          KVCacheBlock ,  KVCacheBlockBundle ,
1111                                         generate_block_hash_extra_keys ,
1212                                         hash_block_tokens )
1313from  vllm .v1 .request  import  Request 
@@ -49,19 +49,20 @@ def __init__(
4949        # enabled). 
5050        self .free_block_queue  =  FreeKVCacheBlockQueue (self .blocks )
5151
52-         # TODO: update comment 
53-         # {manager_id: {block_hash: {block ID: GroupedKVCacheBlock}}}. A cached 
54-         # block is a full block with a block hash that can be used for prefix 
55-         # caching. 
52+         # {manager_id: {block_hash: {block ID: KVCacheBlockBundle}}}. 
53+         # A cached block is a full block with a block hash that can be used for 
54+         # prefix caching. 
5655        # The cached block may be used by running requests or in the 
5756        # free_block_queue that could potentially be evicted. 
57+         # Use KVCacheBlockBundle to make sure different kv cache groups managed 
58+         # by the same single_type_manager are cached & evicted together. 
5859        # NOTE: We currently don't de-duplicate the blocks in the cache, 
5960        # meaning that if a block becomes full and is cached, we don't check 
6061        # if there is already an identical block in the cache. This is because 
6162        # we want to make sure the allocated block IDs won't change so that 
6263        # block tables are append-only. 
6364        self .cached_block_hash_to_block : list [dict [BlockHashType , dict [
64-             int , GroupedKVCacheBlock ]]] =  [
65+             int , KVCacheBlockBundle ]]] =  [
6566                defaultdict (dict ) for  _  in  range (num_single_type_managers )
6667            ]
6768        # To represent a placeholder block with block_id=0. 
@@ -74,7 +75,7 @@ def __init__(
7475        self .kv_event_queue : list [KVCacheEvent ] =  []
7576
7677    def  get_cached_block (self , block_hash : BlockHashType ,
77-                          manager_id : int ) ->  Optional [GroupedKVCacheBlock ]:
78+                          manager_id : int ) ->  Optional [KVCacheBlockBundle ]:
7879        """Get a cached block by the block hash, or None if cache miss. 
7980        If there are duplicated blocks, we return the first block in the cache. 
8081
@@ -95,7 +96,7 @@ def get_cached_block(self, block_hash: BlockHashType,
9596    def  cache_full_blocks (
9697        self ,
9798        request : Request ,
98-         blocks : list [GroupedKVCacheBlock ],
99+         blocks : list [KVCacheBlockBundle ],
99100        block_hashes : list [BlockHashType ],
100101        num_cached_blocks : int ,
101102        num_full_blocks : int ,
@@ -141,15 +142,14 @@ def cache_full_blocks(
141142        new_hashes : Optional [list [int ]] =  ([] if  self .enable_kv_cache_events 
142143                                           else  None )
143144        for  i , blk  in  enumerate (new_full_blocks ):
144-             assert  all (b .block_hash  is  None  for  b  in  blk .blocks )
145-             assert  blk .block_hash  is  None 
145+             assert  blk .block_hash_is_none ()
146146
147147            if  i  <  len (new_block_hashes ):
148148                # The block hash may already be computed in 
149149                # "get_computed_blocks" if the tokens are not generated by 
150150                # this request (either the prompt tokens or the previously 
151-                 # generated tokens with preemption).  
152-                 # TODO: or other groups  with the same block_size 
151+                 # generated tokens with preemption), or by other  
152+                 # single_type_managers  with the same block_size.  
153153                # In this case we simply reuse the block hash. 
154154                block_hash  =  new_block_hashes [i ]
155155            else :
@@ -177,10 +177,7 @@ def cache_full_blocks(
177177                block_hashes .append (block_hash )
178178
179179            # Update and add the full block to the cache. 
180-             for  b  in  blk .blocks :
181-                 b .block_hash  =  block_hash 
182-                 b .manager_id  =  manager_id 
183-             blk .block_hash  =  block_hash 
180+             blk .init_block_hash (block_hash , manager_id )
184181            self .cached_block_hash_to_block [manager_id ][block_hash ][
185182                blk .master_block_id ] =  blk 
186183            if  new_hashes  is  not None :
@@ -200,37 +197,46 @@ def cache_full_blocks(
200197                    if  request .lora_request  else  None ,
201198                ))
202199
203-     def  get_new_blocks (self , num_blocks : int ) ->  list [KVCacheBlock ]:
200+     def  get_new_blocks (self , num_block_bundle : int ,
201+                        bundle_size : int ) ->  list [KVCacheBlockBundle ]:
204202        """Get new blocks from the free block pool. 
205203
206204        Note that we do not check block cache in this function. 
207205
208206        Args: 
209-             num_blocks: The number of blocks to allocate. 
207+             num_block_bundle: The number of KVCacheBlockBundles to allocate. 
208+             bundle_size: The number of blocks in each KVCacheBlockBundle. 
210209
211210        Returns: 
212211            A list of new blocks. 
213212        """ 
214-         if  num_blocks  >  self .get_num_free_blocks ():
213+         num_total_blocks  =  num_block_bundle  *  bundle_size 
214+         if  num_total_blocks  >  self .get_num_free_blocks ():
215215            raise  ValueError (
216-                 f"Cannot get { num_blocks }  )
216+                 f"Cannot get { num_total_blocks }  )
217217
218-         ret : list [KVCacheBlock ] =  []
218+         flat_new_blocks : list [KVCacheBlock ] =  []
219219        idx  =  0 
220-         while  idx  <  num_blocks :
220+         while  idx  <  num_total_blocks :
221221            # First allocate blocks. 
222222            curr_block  =  self .free_block_queue .popleft ()
223-             assert  curr_block .ref_cnt  ==  0 
224223
225224            # If the block is cached, evict it. 
226225            if  self .enable_caching :
227226                self ._maybe_evict_cached_block (curr_block )
228227
229-             curr_block .incr_ref () 
230-             ret .append (curr_block )
228+             assert   curr_block .block_hash   is   None 
229+             flat_new_blocks .append (curr_block )
231230            idx  +=  1 
232231
233-         return  ret 
232+         new_blocks  =  []
233+         for  i  in  range (num_block_bundle ):
234+             blocks  =  flat_new_blocks [i  *  bundle_size :(i  +  1 ) *  bundle_size ]
235+             block_bundle  =  KVCacheBlockBundle .from_kv_cache_blocks (
236+                 tuple (blocks ))
237+             block_bundle .incr_ref ()
238+             new_blocks .append (block_bundle )
239+         return  new_blocks 
234240
235241    def  _maybe_evict_cached_block (self , block : KVCacheBlock ) ->  bool :
236242        """ 
@@ -249,8 +255,11 @@ def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
249255                manager_id ]:
250256            cached_blocks  =  (
251257                self .cached_block_hash_to_block [manager_id ][block_hash ])
252-             assert  block .block_id  in  cached_blocks 
253-             cached_blocks [block .block_id ].reset_hash ()
258+             cached_block  =  cached_blocks [block .block_id ]
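259+             # NOTE: entries are keyed by the bundle's master_block_id (see cache_full_blocks). TODO: document why ref_cnt must be 0 here. 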
260+             assert  cached_block .master_block_id  ==  block .block_id 
261+             assert  cached_block .ref_cnt  ==  0 
262+             cached_block .reset_hash ()
254263            del  cached_blocks [block .block_id ]
255264            if  len (cached_blocks ) ==  0 :
256265                del  self .cached_block_hash_to_block [manager_id ][block_hash ]
@@ -260,26 +269,26 @@ def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
260269            return  True 
261270        return  False 
262271
263-     def  touch (self , blocks : list [list [GroupedKVCacheBlock ]]) ->  None :
272+     def  touch (self , blocks : list [list [KVCacheBlockBundle ]]) ->  None :
264273        """Touching a block increases its reference count by 1, and may remove 
265274        the block from the free queue. This is used when a block is hit by 
266275        another request with the same prefix. 
267276
268277        Args: 
269278            blocks: Per-manager lists of blocks to touch. 
270279        """ 
271-         # TODO: check whether we should manage ref_cnt at grouped_block level 
272280        for  blocks_one_manager  in  blocks :
273-             for  grouped_block  in  blocks_one_manager :
274-                 for  block  in  grouped_block .blocks :
275-                     # ref_cnt=0 means this block is in the free list (i.e. 
276-                     # eviction candidate), so remove it. 
277-                     if  block .ref_cnt  ==  0  and  block  !=  self .null_block :
278-                         self .free_block_queue .remove (block )
279-                     block .incr_ref ()
281+             for  block_bundle  in  blocks_one_manager :
282+                 if  block_bundle .ref_cnt  ==  0 :
283+                     # ref_cnt=0 means the blocks are in the free list (i.e. 
284+                     # eviction candidate), so remove them. 
285+                     for  block  in  block_bundle .blocks :
286+                         if  block  !=  self .null_block :
287+                             self .free_block_queue .remove (block )
288+                 block_bundle .incr_ref ()
280289
281290    def  free_blocks (self ,
282-                     ordered_blocks : Iterable [GroupedKVCacheBlock ]) ->  None :
291+                     ordered_blocks : Iterable [KVCacheBlockBundle ]) ->  None :
283292        """Free a list of blocks. The blocks should be ordered by their 
284293        eviction priority, where the first block will be evicted first. 
285294
@@ -288,11 +297,13 @@ def free_blocks(self,
288297                priority. 
289298        """ 
290299        # TODO: make sure blocks in the first group are evicted first 
291-         for  blk  in  ordered_blocks :
292-             for  block  in  blk .blocks :
293-                 block .decr_ref ()
300+         for  block_bundle  in  ordered_blocks :
301+             block_bundle .decr_ref ()
302+             if  block_bundle .ref_cnt  >  0 :
303+                 continue 
304+             for  block  in  block_bundle .blocks :
294305                # null_block should not be added to the free list. 
295-                 if  block . ref_cnt   ==   0   and   block  !=  self .null_block :
306+                 if  block  !=  self .null_block :
296307                    self .free_block_queue .append (block )
297308
298309    def  reset_prefix_cache (self ) ->  bool :
0 commit comments