@@ -2016,11 +2016,6 @@ struct server_context {
                 params_base.n_cache_reuse = 0;
                 SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
             }
-
-            if (!params_base.speculative.model.path.empty()) {
-                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
-                return false;
-            }
         }
 
         return true;
@@ -3215,8 +3210,14 @@ struct server_context {
 
             if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
                 const auto pos_min = llama_kv_self_seq_pos_min(ctx, slot.id);
-                if (pos_min > 0) {
-                    SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min);
+                if (pos_min == -1) {
+                    SLT_ERR(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min);
+                    GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237");
+                }
+
+                const auto n_swa = llama_model_n_swa(model);
+                if (pos_min > slot.n_past - n_swa) {
+                    SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
                     SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
                             "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
                     llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
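
As a side note on the second hunk: the short C++ sketch below restates the new reuse decision in isolation. It is an illustrative example only, not code from the server; the standalone function name, signature, and the numbers in main() are invented here, and only the relationship between pos_min, n_past, and n_swa is taken from the diff (pos_min being -1 when the sequence holds no cache cells follows from the abort message).

#include <cassert>
#include <cstdio>

// Illustrative sketch (not server code): decide whether a slot's cached prompt
// prefix can still be reused, or whether the whole prompt must be re-processed.
//
//   pos_min : smallest position still present in the KV cache for the sequence
//             (-1 when the sequence holds no cells at all)
//   n_past  : number of prompt tokens the server wants to reuse from the cache
//   n_swa   : sliding-window size reported for the model (0 for full attention)
static bool must_reprocess_full_prompt(int pos_min, int n_past, int n_swa) {
    // Mirrors the GGML_ABORT in the diff: a sequence with reusable tokens must
    // own at least one cache cell, so pos_min == -1 with n_past > 0 is a bug.
    assert(!(n_past > 0 && pos_min == -1));

    // With SWA, positions older than (n_past - n_swa) may already be evicted.
    // If the oldest cached position is newer than that bound, the cache cannot
    // serve the reused prefix and the prompt has to be re-processed in full.
    return pos_min > n_past - n_swa;
}

int main() {
    // Full-attention model (n_swa == 0), cache still starts at position 0: reuse.
    std::printf("full-attn: %s\n", must_reprocess_full_prompt(0, 128, 0) ? "reprocess" : "reuse");

    // SWA model with a 64-token window that already evicted positions 0..99:
    // continuing at n_past = 128 needs positions from 64 on, so re-process.
    std::printf("swa      : %s\n", must_reprocess_full_prompt(100, 128, 64) ? "reprocess" : "reuse");

    return 0;
}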