
Commit 22ca459

WoosukKwon authored and lengrongfu committed
[V1][Minor] Minor simplification for get_computed_blocks (vllm-project#16139)
Signed-off-by: Woosuk Kwon <[email protected]>
1 parent 7e1ce00 · commit 22ca459

File tree

1 file changed: +31 -30 lines changed


vllm/v1/core/kv_cache_manager.py

Lines changed: 31 additions & 30 deletions
@@ -126,38 +126,39 @@ def get_computed_blocks(
         self.req_to_block_hashes[request.request_id] = block_hashes
 
         self.prefix_cache_stats.requests += 1
-        if request.sampling_params.prompt_logprobs is None:
-            if len(block_hashes) * self.block_size == request.num_tokens:
-                # When prompt length is divisible by the block size and all
-                # blocks are cached, we need to recompute the last token. This
-                # have to be achieved by re-computing an entire block because
-                # allocate_slots() assumes num_computed_tokens is always a
-                # multiple of the block size. To achieve this, remove the last
-                # block hash from the block_hashes for find_longest_cache_hit
-                # This limitation can potentially be removed in the future to
-                # slightly improve the performance.
-                last_block_hash = block_hashes.pop()
-            else:
-                last_block_hash = None
-
-            computed_blocks = (
-                self.specialized_manager.find_longest_cache_hit(block_hashes))
-
-            if last_block_hash is not None:
-                # Add back the last block hash if it was removed.
-                block_hashes.append(last_block_hash)
-
-            self.prefix_cache_stats.queries += len(block_hashes)
-            self.prefix_cache_stats.hits += len(computed_blocks)
+        # When the request requires prompt logprobs, we skip prefix caching.
+        if request.sampling_params.prompt_logprobs is not None:
+            return [], 0
 
-            # NOTE(woosuk): Since incomplete blocks are not eligible for
-            # sharing, `num_computed_tokens` is always a multiple of
-            # `block_size`.
-            num_computed_tokens = len(computed_blocks) * self.block_size
-            return computed_blocks, num_computed_tokens
+        if len(block_hashes) * self.block_size == request.num_tokens:
+            # When prompt length is divisible by the block size and all
+            # blocks are cached, we need to recompute the last token. This
+            # have to be achieved by re-computing an entire block because
+            # allocate_slots() assumes num_computed_tokens is always a
+            # multiple of the block size. To achieve this, remove the last
+            # block hash from the block_hashes for find_longest_cache_hit
+            # This limitation can potentially be removed in the future to
+            # slightly improve the performance.
+            last_block_hash = block_hashes.pop()
         else:
-            # Skip cache hits for prompt logprobs
-            return [], 0
+            last_block_hash = None
+
+        computed_blocks = (
+            self.specialized_manager.find_longest_cache_hit(block_hashes))
+        self.prefix_cache_stats.queries += len(block_hashes)
+        self.prefix_cache_stats.hits += len(computed_blocks)
+
+        if last_block_hash is not None:
+            # Add back the last block hash if it was removed.
+            # NOTE: Because block_hashes is cached in req_to_block_hashes,
+            # we shouldn't modify it directly.
+            block_hashes.append(last_block_hash)
+
+        # NOTE(woosuk): Since incomplete blocks are not eligible for
+        # sharing, `num_computed_tokens` is always a multiple of
+        # `block_size`.
+        num_computed_tokens = len(computed_blocks) * self.block_size
+        return computed_blocks, num_computed_tokens
 
     def allocate_slots(
         self,
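
For readers skimming the change, here is a minimal, self-contained sketch of the control flow get_computed_blocks follows after this commit. The plain function arguments, the BLOCK_SIZE constant, and the toy find_longest_cache_hit below are illustrative stand-ins, not the real vLLM classes or API; only the early return and the last-block-hash handling mirror the diff above.

# Standalone sketch under the assumptions stated above.

BLOCK_SIZE = 4


def find_longest_cache_hit(block_hashes, cached):
    # Stand-in for the specialized manager's lookup: return the longest
    # fully cached prefix of block_hashes.
    hit = []
    for h in block_hashes:
        if h not in cached:
            break
        hit.append(h)
    return hit


def get_computed_blocks(block_hashes, num_tokens, cached, prompt_logprobs=None):
    # Early return replaces the old nested if/else: requests that need
    # prompt logprobs skip prefix caching entirely.
    if prompt_logprobs is not None:
        return [], 0

    # If the prompt length is an exact multiple of the block size, drop the
    # last block hash so the last block is recomputed; the allocator expects
    # the computed-token count to stay a multiple of the block size.
    if len(block_hashes) * BLOCK_SIZE == num_tokens:
        last_block_hash = block_hashes.pop()
    else:
        last_block_hash = None

    computed_blocks = find_longest_cache_hit(block_hashes, cached)

    if last_block_hash is not None:
        # Restore the popped hash: the list may be cached elsewhere
        # (req_to_block_hashes in the real code), so it must not stay truncated.
        block_hashes.append(last_block_hash)

    # Only full blocks are shared, so the count is a multiple of the block size.
    return computed_blocks, len(computed_blocks) * BLOCK_SIZE


if __name__ == "__main__":
    cached = {"b0", "b1", "b2"}
    # 12 tokens == 3 full blocks: the last hash is popped, so only the
    # first two blocks count as a cache hit.
    print(get_computed_blocks(["b0", "b1", "b2"], 12, cached))  # (['b0', 'b1'], 8)
    # 10 tokens: the last block is partial, nothing is popped.
    print(get_computed_blocks(["b0", "b1"], 10, cached))        # (['b0', 'b1'], 8)
    # Prompt logprobs requested: prefix caching is skipped.
    print(get_computed_blocks(["b0", "b1"], 10, cached, prompt_logprobs=5))  # ([], 0)

The behavior is unchanged from the pre-commit code: the commit flattens the nesting by handling the prompt-logprobs case with an early return and adds a NOTE explaining why block_hashes is restored in place.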

0 commit comments
