
Commit 6050d80

manual merge

Signed-off-by: Dave Lee <[email protected]>
2 parents (6b8d969 + 28a1310); merge commit 6050d80

File tree: 25 files changed (+429 −32 lines)

.github/workflows/notify-models.yaml

Lines changed: 2 additions & 2 deletions

@@ -18,7 +18,7 @@ jobs:
         with:
           model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
       # Check the PR diff using the current branch and the base branch of the PR
-      - uses: GrantBirki/git-diff-action@v2.7.0
+      - uses: GrantBirki/git-diff-action@v2.8.0
         id: git-diff-action
         with:
           json_diff_file_output: diff.json
@@ -99,7 +99,7 @@ jobs:
           docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
           until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
       # Check the PR diff using the current branch and the base branch of the PR
-      - uses: GrantBirki/git-diff-action@v2.7.0
+      - uses: GrantBirki/git-diff-action@v2.8.0
         id: git-diff-action
         with:
           json_diff_file_output: diff.json

Makefile

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=53debe6f3c9cca87e9520a83ee8c14d88977afa4
+CPPLLAMA_VERSION?=3ec9fd4b77b6aca03a3c2bf678eae3f9517d6904
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp

backend/backend.proto

Lines changed: 7 additions & 0 deletions

@@ -163,6 +163,11 @@ message Reply {
   double timing_token_generation = 5;
 }
 
+message GrammarTrigger {
+  string word = 1;
+  bool at_start = 2;
+}
+
 message ModelOptions {
   string Model = 1;
   int32 ContextSize = 2;
@@ -247,6 +252,8 @@ message ModelOptions {
 
   string CacheTypeKey = 63;
   string CacheTypeValue = 64;
+
+  repeated GrammarTrigger GrammarTriggers = 65;
 }
 
 message Result {
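Taken together, the new GrammarTrigger message and the repeated GrammarTriggers field (tag 65) on ModelOptions let a caller name the words that should switch generation from free-form text into grammar-constrained decoding. A minimal sketch of a populated request, assuming the Go bindings generated from this proto (the import path and the trigger word are assumptions, not part of this commit):

package main

import (
    "fmt"

    // Assumed import path for the generated protobuf bindings.
    pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)

func main() {
    // Word/AtStart follow protoc-gen-go's mapping of the proto
    // fields word/at_start declared above.
    opts := &pb.ModelOptions{
        Model: "hermes-2-theta-llama-3-8b",
        GrammarTriggers: []*pb.GrammarTrigger{
            // Hypothetical trigger: enforce the grammar once "<tool_call>"
            // appears, and only when it opens the response.
            {Word: "<tool_call>", AtStart: true},
        },
    }
    fmt.Println(opts.GetGrammarTriggers()[0].GetWord())
}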

backend/cpp/llama/grpc-server.cpp

Lines changed: 32 additions & 0 deletions

@@ -468,6 +468,9 @@ struct llama_server_context
     bool add_bos_token = true;
     bool has_eos_token = true;
 
+    bool grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_trigger_words;
+
     int32_t n_ctx; // total context for all clients / slots
 
     // system prompt
@@ -706,6 +709,8 @@ struct llama_server_context
     slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
     slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
     slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+    slot->sparams.grammar_trigger_words = grammar_trigger_words;
+    slot->sparams.grammar_lazy = grammar_lazy;
 
     if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
         // Might be better to reject the request with a 400 ?
@@ -2374,6 +2379,21 @@ static void params_parse(const backend::ModelOptions* request,
     if ( request->ropefreqscale() != 0.0f ) {
         params.rope_freq_scale = request->ropefreqscale();
     }
+
+    if (request->grammartriggers_size() > 0) {
+        LOG_INFO("configuring grammar triggers", {});
+        llama.grammar_lazy = true;
+        for (int i = 0; i < request->grammartriggers_size(); i++) {
+            common_grammar_trigger trigger;
+            trigger.word = request->grammartriggers(i).word();
+            trigger.at_start = request->grammartriggers(i).at_start();
+            llama.grammar_trigger_words.push_back(trigger);
+            LOG_INFO("grammar trigger", {
+                { "word", trigger.word },
+                { "at_start", trigger.at_start }
+            });
+        }
+    }
 }
 
 
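Setting llama.grammar_lazy as soon as any trigger is configured matches the lazy-grammar behaviour these fields drive upstream in llama.cpp: the grammar is not enforced from the first token, but only after one of grammar_trigger_words is sampled (with at_start restricting the match to the beginning of the output). Until a trigger fires, decoding runs unconstrained, so ordinary replies are unaffected while tool-call style output is still forced to be well-formed.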
@@ -2522,6 +2542,18 @@ class BackendServiceImpl final : public backend::Backend::Service {
         return grpc::Status::OK;
     }
 
+    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
+        json data = parse_options(false, request, llama);
+
+        std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
+
+        for (int i=0 ; i< tokens.size(); i++){
+            response->add_tokens(tokens[i]);
+        }
+
+        return grpc::Status::OK;
+    }
+
     grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
         llama_client_slot* active_slot = llama.get_active_slot();
 
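The new handler implements the TokenizeString RPC already declared on the Backend service, returning the token ids for a prompt. A hedged sketch of a Go caller, assuming the generated Backend client stub and a backend already listening on localhost:50051 (address and import path are assumptions):

package main

import (
    "context"
    "fmt"
    "log"
    "time"

    // Assumed import path for the generated gRPC client.
    pb "github.com/mudler/LocalAI/pkg/grpc/proto"
    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials/insecure"
)

func main() {
    // Assumed address of a running llama backend exposing this service.
    conn, err := grpc.NewClient("localhost:50051",
        grpc.WithTransportCredentials(insecure.NewCredentials()))
    if err != nil {
        log.Fatal(err)
    }
    defer conn.Close()

    client := pb.NewBackendClient(conn)
    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()

    // TokenizeString reuses PredictOptions; only the prompt is consulted here.
    resp, err := client.TokenizeString(ctx, &pb.PredictOptions{Prompt: "hello world"})
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(resp.GetTokens()) // token ids produced by llama.tokenize
}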

backend/python/transformers/requirements-cpu.txt

Lines changed: 1 addition & 1 deletion

@@ -5,4 +5,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.4.0
+sentence-transformers==3.4.1

backend/python/transformers/requirements-cublas11.txt

Lines changed: 1 addition & 1 deletion

@@ -6,4 +6,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.4.0
+sentence-transformers==3.4.1

backend/python/transformers/requirements-cublas12.txt

Lines changed: 1 addition & 1 deletion

@@ -5,4 +5,4 @@ numba==0.60.0
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.4.0
+sentence-transformers==3.4.1

backend/python/transformers/requirements-hipblas.txt

Lines changed: 1 addition & 1 deletion

@@ -7,4 +7,4 @@ numba==0.60.0
 bitsandbytes
 outetts
 bitsandbytes
-sentence-transformers==3.4.0
+sentence-transformers==3.4.1

backend/python/transformers/requirements-intel.txt

Lines changed: 1 addition & 1 deletion

@@ -8,4 +8,4 @@ numba==0.60.0
 intel-extension-for-transformers
 bitsandbytes
 outetts
-sentence-transformers==3.4.0
+sentence-transformers==3.4.1

core/backend/options.go

Lines changed: 10 additions & 0 deletions

@@ -118,9 +118,19 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
         nGPULayers = *c.NGPULayers
     }
 
+    triggers := make([]*pb.GrammarTrigger, 0)
+    for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
+        triggers = append(triggers, &pb.GrammarTrigger{
+            Word:    t.Word,
+            AtStart: t.AtStart,
+        })
+
+    }
+
     return &pb.ModelOptions{
         CUDA:            c.CUDA || c.Diffusers.CUDA,
         SchedulerType:   c.Diffusers.SchedulerType,
+        GrammarTriggers: triggers,
         PipelineType:    c.Diffusers.PipelineType,
         CFGScale:        c.CFGScale,
         LoraAdapter:     c.LoraAdapter,
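This is the glue that carries triggers from a model's function configuration into every backend launch. A small sketch of the flow, assuming the config and functions packages export the types referenced in the diff (import paths and the GrammarTrigger type name are assumptions):

package main

import (
    "fmt"

    // Assumed import paths for LocalAI's config and functions packages.
    "github.com/mudler/LocalAI/core/config"
    "github.com/mudler/LocalAI/pkg/functions"
)

func main() {
    var c config.BackendConfig
    // Hypothetical trigger word; the GrammarTrigger config type is assumed
    // to expose the Word/AtStart fields read by grpcModelOpts.
    c.FunctionsConfig.GrammarConfig.GrammarTriggers = []functions.GrammarTrigger{
        {Word: "<tool_call>", AtStart: true},
    }
    // grpcModelOpts now copies these into pb.ModelOptions.GrammarTriggers,
    // so the llama backend receives them in params_parse (see grpc-server.cpp above).
    fmt.Printf("%+v\n", c.FunctionsConfig.GrammarConfig.GrammarTriggers)
}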
