Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions tests/v1/core/test_block_pool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for BlockPool lazy null block allocation."""

import pytest

from vllm.v1.core.block_pool import BlockPool


class TestBlockPoolLazyNullBlock:
    """Exercises lazy null-block allocation in BlockPool."""

    def test_null_block_not_allocated_initially(self):
        """A freshly built pool must not have consumed a block for null."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        # The backing slot starts empty and no block has been taken.
        assert bp._null_block is None
        assert bp.get_num_free_blocks() == 4

        # `null_block` must live on the class as a property, not as an
        # eagerly-created instance attribute.
        assert "null_block" not in bp.__dict__
        assert isinstance(type(bp).null_block, property)

    def test_null_block_lazy_allocation(self):
        """Accessing `null_block` for the first time triggers allocation."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        # Nothing allocated before the first access.
        assert bp._null_block is None
        assert bp.get_num_free_blocks() == 4

        block = bp.null_block  # first access performs the allocation

        # Exactly one block was consumed and it is flagged as null.
        assert bp._null_block is not None
        assert bp.get_num_free_blocks() == 3
        assert block.is_null is True
        assert block.block_id == 0

    def test_null_block_reuse(self):
        """Repeated accesses must hand back one and the same instance."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        first = bp.null_block
        free_after_first = bp.get_num_free_blocks()
        second = bp.null_block
        free_after_second = bp.get_num_free_blocks()

        # Same object, and no extra block consumed by the second access.
        assert first is second
        assert free_after_first == free_after_second == 3

    def test_null_block_allocation_when_no_blocks_available(self):
        """Exhausting the pool makes null-block allocation raise."""
        bp = BlockPool(num_gpu_blocks=2, enable_caching=True)

        # Drain every free block first.
        bp.get_new_blocks(2)
        assert bp.get_num_free_blocks() == 0

        with pytest.raises(RuntimeError, match="Cannot allocate null block"):
            _ = bp.null_block

    def test_get_usage_with_lazy_null_block(self):
        """get_usage() must treat the null block as overhead, not usage."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        # No blocks used and no null-block overhead yet.
        assert bp.get_usage() == 0.0

        # Materializing the null block alone keeps usage at zero: it is
        # bookkeeping overhead rather than workload consumption.
        _ = bp.null_block
        assert bp.get_usage() == 0.0

        # Two workload blocks out of the 3 usable ones: 1 - 1/3 = 2/3.
        _ = bp.get_new_blocks(2)
        assert bp.get_usage() == pytest.approx(2.0 / 3.0)

    def test_reset_prefix_cache_with_lazy_null_block(self):
        """reset_prefix_cache() succeeds with and without the null block."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        # Works before the null block exists...
        assert bp.reset_prefix_cache() is True

        # ...and still works once it has been lazily allocated.
        _ = bp.null_block
        assert bp.reset_prefix_cache() is True
125 changes: 124 additions & 1 deletion tests/v1/core/test_single_type_kv_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@
)
from vllm.v1.core.single_type_kv_cache_manager import (
ChunkedLocalAttentionManager,
FullAttentionManager,
SlidingWindowManager,
)
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowSpec
from vllm.v1.kv_cache_interface import (
ChunkedLocalAttentionSpec,
FullAttentionSpec,
SlidingWindowSpec,
)

pytestmark = pytest.mark.cpu_test

Expand Down Expand Up @@ -354,3 +359,121 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
assert (
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
)


def test_lazy_null_block_allocation_in_managers():
    """Managers differ in whether they ever touch the pool's null block."""
    # --- FullAttentionManager: never needs a placeholder block. ---
    pool_full = BlockPool(num_gpu_blocks=4, enable_caching=True)
    spec_full = FullAttentionSpec(
        block_size=32,
        num_kv_heads=8,
        head_size=64,
        dtype=torch.bfloat16,
        sliding_window=None,
    )
    manager_full = FullAttentionManager(spec_full, pool_full, kv_cache_group_id=0)

    # Construction alone must not allocate the null block.
    assert pool_full._null_block is None
    assert pool_full.get_num_free_blocks() == 4

    # Full attention keeps every block, so nothing changes afterwards.
    manager_full.remove_skipped_blocks("test", 100)
    assert pool_full._null_block is None
    assert pool_full.get_num_free_blocks() == 4

    # --- SlidingWindowManager: allocates the null block on demand. ---
    pool_sliding = BlockPool(num_gpu_blocks=4, enable_caching=True)
    spec_sliding = SlidingWindowSpec(
        block_size=32,
        num_kv_heads=8,
        head_size=64,
        dtype=torch.bfloat16,
        sliding_window=64,
    )
    manager_sliding = SlidingWindowManager(
        spec_sliding, pool_sliding, kv_cache_group_id=0
    )

    # Still untouched right after construction.
    assert pool_sliding._null_block is None
    assert pool_sliding.get_num_free_blocks() == 4

    # Hand the manager a request with two real blocks.
    req_blocks = pool_sliding.get_new_blocks(2)
    manager_sliding.req_to_blocks["test"] = req_blocks
    assert pool_sliding.get_num_free_blocks() == 2
    assert pool_sliding._null_block is None

    # Skipping blocks needs the placeholder (and frees one real block).
    manager_sliding.remove_skipped_blocks("test", 100)
    assert pool_sliding._null_block is not None
    assert pool_sliding.get_num_free_blocks() == 2

    # --- ChunkedLocalAttentionManager: same on-demand behavior. ---
    pool_chunked = BlockPool(num_gpu_blocks=4, enable_caching=True)
    spec_chunked = ChunkedLocalAttentionSpec(
        block_size=32,
        num_kv_heads=8,
        head_size=64,
        dtype=torch.bfloat16,
        attention_chunk_size=16,
    )
    manager_chunked = ChunkedLocalAttentionManager(
        spec_chunked, pool_chunked, kv_cache_group_id=0
    )

    # No null block right after construction.
    assert pool_chunked._null_block is None
    assert pool_chunked.get_num_free_blocks() == 4

    req_blocks = pool_chunked.get_new_blocks(2)
    manager_chunked.req_to_blocks["test_request"] = req_blocks
    assert pool_chunked.get_num_free_blocks() == 2
    assert pool_chunked._null_block is None

    # Skipping triggers the lazy allocation (and frees one real block).
    manager_chunked.remove_skipped_blocks("test_request", 48)
    assert pool_chunked._null_block is not None
    assert pool_chunked.get_num_free_blocks() == 2


def test_manager_null_block_property_behavior():
    """The manager's null_block property must be lazy at both levels."""
    pool = BlockPool(num_gpu_blocks=4, enable_caching=True)
    spec = SlidingWindowSpec(
        block_size=32,
        num_kv_heads=8,
        head_size=64,
        dtype=torch.bfloat16,
        sliding_window=64,
    )
    mgr = SlidingWindowManager(spec, pool, kv_cache_group_id=0)

    # Neither the manager nor the pool has allocated anything yet.
    assert mgr._null_block is None
    assert pool._null_block is None
    assert pool.get_num_free_blocks() == 4

    # First access through the manager materializes the block everywhere.
    mgr_block = mgr.null_block
    assert mgr._null_block is not None
    assert pool._null_block is not None
    assert mgr_block is pool._null_block
    assert pool.get_num_free_blocks() == 3

    # A second access is a pure cache hit: same object, no new block.
    assert mgr.null_block is mgr_block
    assert pool.get_num_free_blocks() == 3
32 changes: 26 additions & 6 deletions vllm/v1/core/block_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,29 @@ def __init__(
# To represent a placeholder block with block_id=0.
# The ref_cnt of null_block is not maintained, needs special care to
# avoid freeing it.
self.null_block = self.free_block_queue.popleft()
self.null_block.is_null = True
# Use lazy allocation to avoid consuming a block when not needed.
self._null_block: KVCacheBlock | None = None

self.enable_kv_cache_events = enable_kv_cache_events
self.kv_event_queue: list[KVCacheEvent] = []

@property
def null_block(self) -> KVCacheBlock:
    """Placeholder block with lazy, on-first-access allocation.

    Deferring the allocation means pools whose managers never need a
    placeholder (e.g. FullAttentionManager) keep every block available
    for real KV-cache data.

    Returns:
        The singleton placeholder block, flagged via ``is_null``.

    Raises:
        RuntimeError: if all blocks are already in use when the null
            block is first requested.
    """
    if self._null_block is not None:
        return self._null_block
    if self.free_block_queue.num_free_blocks == 0:
        raise RuntimeError(
            "Cannot allocate null block: no free blocks available. "
            "Consider increasing the number of GPU blocks."
        )
    # NOTE(review): popleft() hands back the LRU free block; if prefix
    # caching already registered a hash for it, that cache entry may now
    # alias the null block — confirm eviction is handled upstream.
    # NOTE(review): unlike the former eager scheme, block_id here is only
    # 0 when no other block was handed out first — verify no caller
    # assumes the null block has id 0.
    self._null_block = self.free_block_queue.popleft()
    self._null_block.is_null = True
    return self._null_block

def get_cached_block(
self, block_hash: BlockHash, kv_cache_group_ids: list[int]
) -> list[KVCacheBlock] | None:
Expand Down Expand Up @@ -370,11 +387,13 @@ def reset_prefix_cache(self) -> bool:
False otherwise.
"""
num_used_blocks = self.num_gpu_blocks - self.get_num_free_blocks()
if num_used_blocks != 1: # The null block is always marked as used
# Account for null block only if it's allocated
expected_used_blocks = 1 if self._null_block is not None else 0
if num_used_blocks != expected_used_blocks:
logger.warning(
"Failed to reset prefix cache because some "
"blocks (%d) are not freed yet",
num_used_blocks - 1,
num_used_blocks - expected_used_blocks,
)
return False

Expand Down Expand Up @@ -407,8 +426,9 @@ def get_usage(self) -> float:
The KV cache usage (between 0.0 and 1.0).
"""

# Subtract 1 to account for null block.
total_gpu_blocks = self.num_gpu_blocks - 1
# Subtract 1 to account for null block if it's allocated.
null_block_overhead = 1 if self._null_block is not None else 0
total_gpu_blocks = self.num_gpu_blocks - null_block_overhead
if not total_gpu_blocks:
return 0
return 1.0 - (self.get_num_free_blocks() / total_gpu_blocks)
Expand Down
Loading