@@ -14608,8 +14608,6 @@ static int llama_decode_internal(
 
     const struct llama_hparams & hparams = model.hparams;
     const int64_t kv_head = kv_self.head;
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
     for (int i = 0; i < gf->n_nodes; i++) {
         ggml_tensor * node = gf->nodes[i];
@@ -14619,6 +14617,7 @@ static int llama_decode_internal(
             const char* k_prefix = "k_cache_view-";
             if (strncmp(node->src[1]->name, k_prefix, strlen(k_prefix)) == 0) {
                 int il = atoi(node->src[1]->name + strlen(k_prefix)); // Layer index from name
+                const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
                 ggml_tensor * tmp_tensor = kv_self.k_l[il];
                 size_t tmp_offset = (ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa))*kv_head;
                 node->src[1]->data = static_cast<char*>(tmp_tensor->data) + tmp_offset;
@@ -14628,6 +14627,7 @@ static int llama_decode_internal(
             const char* v_prefix = "v_cache_view-";
             if (strncmp(node->src[1]->name, v_prefix, strlen(v_prefix)) == 0) {
                 int il = atoi(node->src[1]->name + strlen(v_prefix)); // Layer index from name
+                const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
                 ggml_tensor * tmp_tensor = kv_self.v_l[il];
                 size_t tmp_offset;
                 if (cparams.flash_attn) {
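For reference, the offset arithmetic the patch moves to a per-layer lookup can be illustrated with a small standalone sketch. The row_size_f16() helper, the per-layer dimensions, and the kv_head value below are illustrative assumptions only; in llama.cpp the row size comes from ggml_row_size() (which also handles quantized block types) and the per-layer dimension from hparams.n_embd_k_gqa(il).

    // Minimal sketch: byte offset of cache cell `kv_head` in each layer's K cache,
    // analogous to ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa) * kv_head.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Bytes per "row" of a K cache layer (one row = keys of one cache cell).
    // Assumes f16 storage; the real code uses ggml_row_size() for the tensor's type.
    static size_t row_size_f16(int64_t n_embd_k_gqa) {
        return static_cast<size_t>(n_embd_k_gqa) * sizeof(uint16_t);
    }

    int main() {
        // Hypothetical per-layer K dimensions, mimicking hparams.n_embd_k_gqa(il)
        // for a model whose layers do not all share the same GQA width.
        std::vector<int64_t> n_embd_k_gqa_per_layer = { 1024, 1024, 512 };
        const int64_t kv_head = 37; // first free cell in the KV cache (illustrative)

        for (size_t il = 0; il < n_embd_k_gqa_per_layer.size(); ++il) {
            size_t offset = row_size_f16(n_embd_k_gqa_per_layer[il]) * kv_head;
            printf("layer %zu: k_cache_view offset = %zu bytes\n", il, offset);
        }
        return 0;
    }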