Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions tests/v1/core/test_block_pool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for BlockPool lazy null block allocation."""

import pytest

from vllm.v1.core.block_pool import BlockPool


class TestBlockPoolLazyNullBlock:
    """Exercises lazy null-block allocation in BlockPool."""

    def test_null_block_not_allocated_initially(self):
        """A freshly built pool must not have consumed a block for null."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        # The backing slot starts empty and no block has been taken.
        assert bp._null_block is None
        assert bp.get_num_free_blocks() == 4

        # `null_block` must live on the class as a property, not as an
        # eagerly-created instance attribute.
        assert "null_block" not in bp.__dict__
        assert isinstance(type(bp).null_block, property)

    def test_null_block_lazy_allocation(self):
        """Accessing `null_block` for the first time triggers allocation."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        # Nothing allocated before the first access.
        assert bp._null_block is None
        assert bp.get_num_free_blocks() == 4

        block = bp.null_block  # first access performs the allocation

        # Exactly one block was consumed and it is flagged as null.
        assert bp._null_block is not None
        assert bp.get_num_free_blocks() == 3
        assert block.is_null is True
        assert block.block_id == 0

    def test_null_block_reuse(self):
        """Repeated accesses must hand back one and the same instance."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        first = bp.null_block
        free_after_first = bp.get_num_free_blocks()
        second = bp.null_block
        free_after_second = bp.get_num_free_blocks()

        # Same object, and no extra block consumed by the second access.
        assert first is second
        assert free_after_first == free_after_second == 3

    def test_null_block_allocation_when_no_blocks_available(self):
        """Exhausting the pool makes null-block allocation raise."""
        bp = BlockPool(num_gpu_blocks=2, enable_caching=True)

        # Drain every free block first.
        bp.get_new_blocks(2)
        assert bp.get_num_free_blocks() == 0

        with pytest.raises(RuntimeError, match="Cannot allocate null block"):
            _ = bp.null_block

    def test_get_usage_with_lazy_null_block(self):
        """get_usage() must treat the null block as overhead, not usage."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        # No blocks used and no null-block overhead yet.
        assert bp.get_usage() == 0.0

        # Materializing the null block alone keeps usage at zero: it is
        # bookkeeping overhead rather than workload consumption.
        _ = bp.null_block
        assert bp.get_usage() == 0.0

        # Two workload blocks out of the 3 usable ones: 1 - 1/3 = 2/3.
        _ = bp.get_new_blocks(2)
        assert bp.get_usage() == pytest.approx(2.0 / 3.0)

    def test_reset_prefix_cache_with_lazy_null_block(self):
        """reset_prefix_cache() succeeds with and without the null block."""
        bp = BlockPool(num_gpu_blocks=4, enable_caching=True)

        # Works before the null block exists...
        assert bp.reset_prefix_cache() is True

        # ...and still works once it has been lazily allocated.
        _ = bp.null_block
        assert bp.reset_prefix_cache() is True
125 changes: 124 additions & 1 deletion tests/v1/core/test_single_type_kv_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@
)
from vllm.v1.core.single_type_kv_cache_manager import (
ChunkedLocalAttentionManager,
FullAttentionManager,
SlidingWindowManager,
)
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowSpec
from vllm.v1.kv_cache_interface import (
ChunkedLocalAttentionSpec,
FullAttentionSpec,
SlidingWindowSpec,
)

pytestmark = pytest.mark.cpu_test

Expand Down Expand Up @@ -354,3 +359,121 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
assert (
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
)


def test_lazy_null_block_allocation_in_managers():
    """Managers differ in whether they ever touch the pool's null block."""
    # --- FullAttentionManager: never needs a placeholder block. ---
    pool_full = BlockPool(num_gpu_blocks=4, enable_caching=True)
    spec_full = FullAttentionSpec(
        block_size=32,
        num_kv_heads=8,
        head_size=64,
        dtype=torch.bfloat16,
        sliding_window=None,
    )
    manager_full = FullAttentionManager(spec_full, pool_full, kv_cache_group_id=0)

    # Construction alone must not allocate the null block.
    assert pool_full._null_block is None
    assert pool_full.get_num_free_blocks() == 4

    # Full attention keeps every block, so nothing changes afterwards.
    manager_full.remove_skipped_blocks("test", 100)
    assert pool_full._null_block is None
    assert pool_full.get_num_free_blocks() == 4

    # --- SlidingWindowManager: allocates the null block on demand. ---
    pool_sliding = BlockPool(num_gpu_blocks=4, enable_caching=True)
    spec_sliding = SlidingWindowSpec(
        block_size=32,
        num_kv_heads=8,
        head_size=64,
        dtype=torch.bfloat16,
        sliding_window=64,
    )
    manager_sliding = SlidingWindowManager(
        spec_sliding, pool_sliding, kv_cache_group_id=0
    )

    # Still untouched right after construction.
    assert pool_sliding._null_block is None
    assert pool_sliding.get_num_free_blocks() == 4

    # Hand the manager a request with two real blocks.
    req_blocks = pool_sliding.get_new_blocks(2)
    manager_sliding.req_to_blocks["test"] = req_blocks
    assert pool_sliding.get_num_free_blocks() == 2
    assert pool_sliding._null_block is None

    # Skipping blocks needs the placeholder (and frees one real block).
    manager_sliding.remove_skipped_blocks("test", 100)
    assert pool_sliding._null_block is not None
    assert pool_sliding.get_num_free_blocks() == 2

    # --- ChunkedLocalAttentionManager: same on-demand behavior. ---
    pool_chunked = BlockPool(num_gpu_blocks=4, enable_caching=True)
    spec_chunked = ChunkedLocalAttentionSpec(
        block_size=32,
        num_kv_heads=8,
        head_size=64,
        dtype=torch.bfloat16,
        attention_chunk_size=16,
    )
    manager_chunked = ChunkedLocalAttentionManager(
        spec_chunked, pool_chunked, kv_cache_group_id=0
    )

    # No null block right after construction.
    assert pool_chunked._null_block is None
    assert pool_chunked.get_num_free_blocks() == 4

    req_blocks = pool_chunked.get_new_blocks(2)
    manager_chunked.req_to_blocks["test_request"] = req_blocks
    assert pool_chunked.get_num_free_blocks() == 2
    assert pool_chunked._null_block is None

    # Skipping triggers the lazy allocation (and frees one real block).
    manager_chunked.remove_skipped_blocks("test_request", 48)
    assert pool_chunked._null_block is not None
    assert pool_chunked.get_num_free_blocks() == 2


def test_manager_null_block_property_behavior():
    """The manager's null_block property must be lazy at both levels."""
    pool = BlockPool(num_gpu_blocks=4, enable_caching=True)
    spec = SlidingWindowSpec(
        block_size=32,
        num_kv_heads=8,
        head_size=64,
        dtype=torch.bfloat16,
        sliding_window=64,
    )
    mgr = SlidingWindowManager(spec, pool, kv_cache_group_id=0)

    # Neither the manager nor the pool has allocated anything yet.
    assert mgr._null_block is None
    assert pool._null_block is None
    assert pool.get_num_free_blocks() == 4

    # First access through the manager materializes the block everywhere.
    mgr_block = mgr.null_block
    assert mgr._null_block is not None
    assert pool._null_block is not None
    assert mgr_block is pool._null_block
    assert pool.get_num_free_blocks() == 3

    # A second access is a pure cache hit: same object, no new block.
    assert mgr.null_block is mgr_block
    assert pool.get_num_free_blocks() == 3
32 changes: 26 additions & 6 deletions vllm/v1/core/block_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,29 @@ def __init__(
# To represent a placeholder block with block_id=0.
# The ref_cnt of null_block is not maintained, needs special care to
# avoid freeing it.
self.null_block = self.free_block_queue.popleft()
self.null_block.is_null = True
# Use lazy allocation to avoid consuming a block when not needed.
self._null_block: KVCacheBlock | None = None

self.enable_kv_cache_events = enable_kv_cache_events
self.kv_event_queue: list[KVCacheEvent] = []

@property
def null_block(self) -> KVCacheBlock:
    """Placeholder block with lazy, on-first-access allocation.

    Deferring the allocation means pools whose managers never need a
    placeholder (e.g. FullAttentionManager) keep every block available
    for real KV-cache data.

    Returns:
        The singleton placeholder block, flagged via ``is_null``.

    Raises:
        RuntimeError: if all blocks are already in use when the null
            block is first requested.
    """
    if self._null_block is not None:
        return self._null_block
    if self.free_block_queue.num_free_blocks == 0:
        raise RuntimeError(
            "Cannot allocate null block: no free blocks available. "
            "Consider increasing the number of GPU blocks."
        )
    # NOTE(review): popleft() hands back the LRU free block; if prefix
    # caching already registered a hash for it, that cache entry may now
    # alias the null block — confirm eviction is handled upstream.
    # NOTE(review): unlike the former eager scheme, block_id here is only
    # 0 when no other block was handed out first — verify no caller
    # assumes the null block has id 0.
    self._null_block = self.free_block_queue.popleft()
    self._null_block.is_null = True
    return self._null_block

def get_cached_block(
self, block_hash: BlockHash, kv_cache_group_ids: list[int]
) -> list[KVCacheBlock] | None:
Expand Down Expand Up @@ -370,11 +387,13 @@ def reset_prefix_cache(self) -> bool:
False otherwise.
"""
num_used_blocks = self.num_gpu_blocks - self.get_num_free_blocks()
if num_used_blocks != 1: # The null block is always marked as used
# Account for null block only if it's allocated
expected_used_blocks = 1 if self._null_block is not None else 0
if num_used_blocks != expected_used_blocks:
logger.warning(
"Failed to reset prefix cache because some "
"blocks (%d) are not freed yet",
num_used_blocks - 1,
num_used_blocks - expected_used_blocks,
)
return False

Expand Down Expand Up @@ -407,8 +426,9 @@ def get_usage(self) -> float:
The KV cache usage (between 0.0 and 1.0).
"""

# Subtract 1 to account for null block.
total_gpu_blocks = self.num_gpu_blocks - 1
# Subtract 1 to account for null block if it's allocated.
null_block_overhead = 1 if self._null_block is not None else 0
total_gpu_blocks = self.num_gpu_blocks - null_block_overhead
if not total_gpu_blocks:
return 0
return 1.0 - (self.get_num_free_blocks() / total_gpu_blocks)
Expand Down
Loading