File tree Expand file tree Collapse file tree 1 file changed +6
-2
lines changed
vllm/distributed/kv_transfer/kv_connector Expand file tree Collapse file tree 1 file changed +6
-2
lines changed Original file line number Diff line number Diff line change @@ -118,6 +118,12 @@ def send_kv_caches_and_hidden_states(
118118 start_layer = model_executable .model .start_layer
119119 end_layer = model_executable .model .end_layer
120120
121+ model_config = model_executable .model .config
122+ num_heads = model_config .num_key_value_heads
123+ hidden_size = model_config .hidden_size
124+ num_attention_heads = model_config .num_attention_heads
125+ head_size = int (hidden_size / num_attention_heads )
126+
121127 # query_lens contains new KV caches that are added to vLLM,
122128 # so we will send them to the decode instance.
123129 # FIXME(Kuntai): This assumes that all requests are prefill.
@@ -131,8 +137,6 @@ def send_kv_caches_and_hidden_states(
131137 for layer_id in range (start_layer , end_layer ):
132138 kv_cache = kv_caches [layer_id - start_layer ]
133139
134- _ , _ , num_heads , head_size = kv_cache [0 ].shape
135-
136140 key_cache = kv_cache [0 ].reshape (- 1 , num_heads , head_size )
137141 value_cache = kv_cache [1 ].reshape (- 1 , num_heads , head_size )
138142
You can’t perform that action at this time.
0 commit comments