@@ -126,38 +126,39 @@ def get_computed_blocks(
126126 self .req_to_block_hashes [request .request_id ] = block_hashes
127127
128128 self .prefix_cache_stats .requests += 1
129- if request .sampling_params .prompt_logprobs is None :
130- if len (block_hashes ) * self .block_size == request .num_tokens :
131- # When prompt length is divisible by the block size and all
132- # blocks are cached, we need to recompute the last token. This
133- # have to be achieved by re-computing an entire block because
134- # allocate_slots() assumes num_computed_tokens is always a
135- # multiple of the block size. To achieve this, remove the last
136- # block hash from the block_hashes for find_longest_cache_hit
137- # This limitation can potentially be removed in the future to
138- # slightly improve the performance.
139- last_block_hash = block_hashes .pop ()
140- else :
141- last_block_hash = None
142-
143- computed_blocks = (
144- self .specialized_manager .find_longest_cache_hit (block_hashes ))
145-
146- if last_block_hash is not None :
147- # Add back the last block hash if it was removed.
148- block_hashes .append (last_block_hash )
149-
150- self .prefix_cache_stats .queries += len (block_hashes )
151- self .prefix_cache_stats .hits += len (computed_blocks )
129+ # When the request requires prompt logprobs, we skip prefix caching.
130+ if request .sampling_params .prompt_logprobs is not None :
131+ return [], 0
152132
153- # NOTE(woosuk): Since incomplete blocks are not eligible for
154- # sharing, `num_computed_tokens` is always a multiple of
155- # `block_size`.
156- num_computed_tokens = len (computed_blocks ) * self .block_size
157- return computed_blocks , num_computed_tokens
133+ if len (block_hashes ) * self .block_size == request .num_tokens :
134+ # When prompt length is divisible by the block size and all
135+ # blocks are cached, we need to recompute the last token. This
136+ # has to be achieved by re-computing an entire block because
137+ # allocate_slots() assumes num_computed_tokens is always a
138+ # multiple of the block size. To achieve this, remove the last
139+ # block hash from the block_hashes for find_longest_cache_hit.
140+ # This limitation can potentially be removed in the future to
141+ # slightly improve the performance.
142+ last_block_hash = block_hashes .pop ()
158143 else :
159- # Skip cache hits for prompt logprobs
160- return [], 0
144+ last_block_hash = None
145+
146+ computed_blocks = (
147+ self .specialized_manager .find_longest_cache_hit (block_hashes ))
148+ self .prefix_cache_stats .queries += len (block_hashes )
149+ self .prefix_cache_stats .hits += len (computed_blocks )
150+
151+ if last_block_hash is not None :
152+ # Add back the last block hash if it was removed.
153+ # NOTE: block_hashes is the list cached in req_to_block_hashes, so
154+ # the pop() above must be undone to keep the cached entry intact.
155+ block_hashes .append (last_block_hash )
156+
157+ # NOTE(woosuk): Since incomplete blocks are not eligible for
158+ # sharing, `num_computed_tokens` is always a multiple of
159+ # `block_size`.
160+ num_computed_tokens = len (computed_blocks ) * self .block_size
161+ return computed_blocks , num_computed_tokens
161162
162163 def allocate_slots (
163164 self ,
0 commit comments