@@ -468,6 +468,9 @@ struct llama_server_context
     bool add_bos_token = true;
     bool has_eos_token = true;

+    bool grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_trigger_words;
+
     int32_t n_ctx; // total context for all clients / slots

     // system prompt
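For context, `common_grammar_trigger` comes from upstream llama.cpp's common code. The sketch below is a hypothetical reconstruction inferred only from how the fields are used later in this diff (`trigger.word` / `trigger.at_start`); the real definition may carry more fields and differs between versions.

```cpp
// Hypothetical sketch, inferred from the usage in this diff; not the verbatim upstream header.
#include <string>

struct common_grammar_trigger {
    std::string word;     // literal text that activates the lazy grammar once the model emits it
    bool        at_start; // restrict the match to the very beginning of the generation
};
```

With `grammar_lazy = true`, the grammar is not enforced from the first token; it only kicks in once one of the configured trigger words appears in the output.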
@@ -706,6 +709,8 @@ struct llama_server_context
         slot->sparams.grammar       = json_value(data, "grammar",       default_sparams.grammar);
         slot->sparams.n_probs       = json_value(data, "n_probs",       default_sparams.n_probs);
         slot->sparams.min_keep      = json_value(data, "min_keep",      default_sparams.min_keep);
+        slot->sparams.grammar_trigger_words = grammar_trigger_words;
+        slot->sparams.grammar_lazy          = grammar_lazy;

         if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
             // Might be better to reject the request with a 400 ?
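Note that the two new lines do not go through `json_value()`: the trigger settings are copied from the server-level configuration rather than read from the per-request JSON. For reference, the `json_value()` helper used on the surrounding lines behaves roughly like the simplified sketch below (the real helper in llama.cpp's server utils also adds type-error handling; this is an approximation, not the exact implementation).

```cpp
// Simplified sketch of the json_value() helper: read a key from the request JSON,
// fall back to the server default when the key is missing or null.
#include <nlohmann/json.hpp>
using json = nlohmann::json;

template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
    if (body.contains(key) && !body.at(key).is_null()) {
        return body.at(key).get<T>();
    }
    return default_value;
}
```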
@@ -2374,6 +2379,21 @@ static void params_parse(const backend::ModelOptions* request,
     if ( request->ropefreqscale() != 0.0f ) {
         params.rope_freq_scale = request->ropefreqscale();
     }
+
+    if (request->grammartriggers_size() > 0) {
+        LOG_INFO("configuring grammar triggers", {});
+        llama.grammar_lazy = true;
+        for (int i = 0; i < request->grammartriggers_size(); i++) {
+            common_grammar_trigger trigger;
+            trigger.word = request->grammartriggers(i).word();
+            trigger.at_start = request->grammartriggers(i).at_start();
+            llama.grammar_trigger_words.push_back(trigger);
+            LOG_INFO("grammar trigger", {
+                { "word",     trigger.word },
+                { "at_start", trigger.at_start }
+            });
+        }
+    }
 }

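`params_parse()` now walks the repeated `grammartriggers` field of the incoming `backend::ModelOptions` message. A caller-side sketch of populating that field is shown below; the `add_grammartriggers()` / `set_word()` / `set_at_start()` setters are inferred from the generated-protobuf getters used above (`grammartriggers_size()`, `grammartriggers(i).word()`, ...), so treat them as an assumption about the `.proto` definition rather than a copy of it.

```cpp
// Hypothetical caller-side sketch: attach a lazy-grammar trigger to a ModelOptions request.
backend::ModelOptions opts;

auto * trigger = opts.add_grammartriggers();
trigger->set_word("<tool_call>");   // grammar becomes active once the model emits this text
trigger->set_at_start(false);       // allow the trigger anywhere, not only at the start of the output
```

Once any trigger is present, the backend flips `grammar_lazy` to `true` for every slot, so the grammar constrains sampling only after a trigger word has been generated.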
@@ -2522,6 +2542,18 @@ class BackendServiceImpl final : public backend::Backend::Service {
         return grpc::Status::OK;
     }

+    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
+        json data = parse_options(false, request, llama);
+
+        std::vector<llama_token> tokens = llama.tokenize(data["prompt"], false);
+
+        for (int i = 0; i < tokens.size(); i++) {
+            response->add_tokens(tokens[i]);
+        }
+
+        return grpc::Status::OK;
+    }
+
     grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
         llama_client_slot* active_slot = llama.get_active_slot();

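The new `TokenizeString` RPC tokenizes the prompt (without adding a BOS token, per the `false` argument) and streams the token ids back in `TokenizationResponse`. A minimal client sketch is shown below, assuming standard gRPC/protobuf C++ codegen for the `backend` service; the `Prompt` field name on `PredictOptions` and the target address are assumptions for illustration only.

```cpp
// Hypothetical gRPC client sketch for the new TokenizeString RPC.
#include <cstdio>
#include <memory>
#include <grpcpp/grpcpp.h>
#include "backend.grpc.pb.h"

int main() {
    auto channel = grpc::CreateChannel("localhost:50051", grpc::InsecureChannelCredentials());
    std::unique_ptr<backend::Backend::Stub> stub = backend::Backend::NewStub(channel);

    backend::PredictOptions request;
    request.set_prompt("Hello, world!");   // assumed field name; check backend.proto for the real one

    backend::TokenizationResponse response;
    grpc::ClientContext ctx;
    grpc::Status status = stub->TokenizeString(&ctx, request, &response);

    if (status.ok()) {
        for (int i = 0; i < response.tokens_size(); i++) {
            std::printf("token[%d] = %d\n", i, response.tokens(i));
        }
    }
    return 0;
}
```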