From 7a0a88d7cb6d6c9806e1e8c3bd1f394dfc7432dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 29 Mar 2025 12:47:35 +0100 Subject: [PATCH] server: apply grammar before other samplers --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 17a292da153c1..554e659478c76 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3249,7 +3249,7 @@ struct server_context { const int tok_idx = slot.i_batch - i; - llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); + llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx, true); slot.i_batch = -1; @@ -3347,7 +3347,7 @@ struct server_context { llama_decode(ctx, slot.batch_spec); // the accepted tokens from the speculation - const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft); + const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft, true); slot.n_past += ids.size(); slot.n_decoded += ids.size();