@@ -438,12 +438,16 @@ struct llama_server_context
                 return false;
             }
 
-            if (params.n_ctx < 2048) { // request larger context for the image embedding
+            if (params.n_ctx != 0 && params.n_ctx < 2048) { // request larger context for the image embedding
                 params.n_ctx = 2048;
             }
         }
 
+        // dedicate one sequence to the system prompt
+        params.n_parallel += 1;
+
         std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        params.n_parallel -= 1; // but be sneaky about it
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model", {{"model", params.model}});
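The hunks below all build on the sequence layout introduced here: sequence 0 holds the shared system prompt, and each slot decodes into its own sequence at slot.id + 1, which is why n_parallel is raised only for the call to llama_init_from_gpt_params() and restored immediately afterwards. A minimal sketch of that mapping (the helper name is illustrative, not part of the patch):

    // seq 0 is reserved for the system prompt; slot i uses sequence i + 1
    static int slot_seq_id(int slot_id) {
        return slot_id + 1;
    }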
@@ -923,9 +927,9 @@ struct llama_server_context
         }
 
         // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
+        for (int32_t i = 1; i <= params.n_parallel; ++i)
         {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
         }
     }
 
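Two things change in this loop: the bound becomes <= params.n_parallel because the extra sequence created during initialization is no longer counted in n_parallel, and passing -1 for both positions copies the whole of sequence 0 rather than an explicit [0, system_tokens.size()) range. A sketch of the same pair of calls used further down to rebuild a slot's sequence from the shared system prompt (the helper name is illustrative, assuming only llama.h):

    #include "llama.h"

    // wipe a slot's sequence and restore the shared system prompt from seq 0
    static void reset_slot_to_system_prompt(llama_context * ctx, int slot_id) {
        const int seq = slot_id + 1;                 // per-slot sequence
        llama_kv_cache_seq_rm(ctx, seq, -1, -1);     // clear everything the slot had cached
        llama_kv_cache_seq_cp(ctx, 0, seq, -1, -1);  // copy the full system-prompt range
    }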
@@ -1400,7 +1404,7 @@ struct llama_server_context
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id + 1 }, true);
                 slot.n_past += 1;
             }
         }
@@ -1636,8 +1640,8 @@ struct llama_server_context
16361640 {" n_system_tokens" , system_tokens.size ()},
16371641 {" n_cache_tokens" , slot.cache_tokens .size ()}
16381642 });
1639- llama_kv_cache_seq_rm (ctx, slot.id , n_keep , n_keep + n_discard);
1640- llama_kv_cache_seq_add (ctx, slot.id , n_keep + n_discard, system_tokens.size () + slot.n_past , -n_discard);
1643+ llama_kv_cache_seq_rm (ctx, slot.id + 1 , n_keep , n_keep + n_discard);
1644+ llama_kv_cache_seq_add (ctx, slot.id + 1 , n_keep + n_discard, system_tokens.size () + slot.n_past , -n_discard);
16411645
16421646 for (size_t i = n_keep + n_discard; i < slot.cache_tokens .size (); i++)
16431647 {
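For readers new to these two calls: the first drops n_discard cache cells after the first n_keep, and the second shifts the remaining cells left by n_discard so the sequence stays contiguous for the next decode. A worked sketch with made-up numbers (ctx and the per-slot sequence id seq are assumed to be in scope):

    const int n_keep    = 4;   // hypothetical
    const int n_discard = 8;   // hypothetical
    const int p_end     = 32;  // stands in for system_tokens.size() + slot.n_past
    llama_kv_cache_seq_rm (ctx, seq, n_keep, n_keep + n_discard);            // remove positions [4, 12)
    llama_kv_cache_seq_add(ctx, seq, n_keep + n_discard, p_end, -n_discard); // slide [12, 32) down to [4, 24)
    // the sequence is contiguous over [0, 24) again; the next token goes in at position 24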
@@ -1689,7 +1693,7 @@ struct llama_server_context
 
             // TODO: we always have to take into account the "system_tokens"
             //       this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
             slot.n_past += 1;
         }
 
@@ -1852,13 +1856,28 @@ struct llama_server_context
                     }
                 }
 
+                // keep only the common part
                 int p0 = (int) system_tokens.size() + slot.n_past;
                 LOG_INFO("kv cache rm [p0, end)", {
                     { "slot_id", slot.id },
                     { "task_id", slot.task_id },
                     { "p0",      p0 }
                 });
-                llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
+                if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
+                    // could not partially delete (likely using a non-Transformer model)
+                    // TODO: logging
+                    llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+                    llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
+
+                    // there is no common part left (except for the system prompt)
+                    // TODO: maybe find a way to refactor this to reuse the !cache_prompt case above
+                    slot.n_past = 0;
+                    slot.n_past_se = 0;
+                    slot.ga_i = 0;
+                    slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
+                    // TODO: is the system prompt ever in the sampling context?
+                    llama_sampling_reset(slot.ctx_sampling);
+                }
 
                 LOG_VERBOSE("prompt ingested", {
                     {"n_past",  slot.n_past},
@@ -1887,7 +1906,7 @@ struct llama_server_context
                            ga_i += ga_w/ga_n;
                        }
                    }
-                    llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
+                    llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
                     slot_npast++;
                 }
 
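Every llama_batch_add call touched in this diff follows the same convention: the KV position is absolute (system prompt length plus the slot-local position), while the sequence id is shifted by one because sequence 0 belongs to the system prompt. A small sketch, assuming the llama_batch_add helper from common.h; the wrapper name is illustrative:

    #include "common.h"

    // queue one token belonging to a slot into the shared batch
    static void add_slot_token(llama_batch & batch, llama_token tok,
                               int n_system_tokens, int slot_id, int slot_pos, bool need_logits) {
        // absolute position = system prompt length + slot-local position;
        // sequence id      = slot id + 1 (sequence 0 is the system prompt)
        llama_batch_add(batch, tok, n_system_tokens + slot_pos, { slot_id + 1 }, need_logits);
    }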
@@ -1941,9 +1960,9 @@ struct llama_server_context
                         LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
                         LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
 
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
-                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
+                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
+                        llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
+                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
 
                         slot.n_past_se -= bd;
 