Commit 259383c

chore(deps): bump llama.cpp to '45363632cbd593537d541e81b600242e0b3d47fc' (#6122)
Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 209c069 commit 259383c

File tree (2 files changed: +16 / -92 lines)

backend/cpp/llama-cpp/Makefile
backend/cpp/llama-cpp/grpc-server.cpp
backend/cpp/llama-cpp/Makefile (1 addition, 1 deletion)

```diff
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=cd36b5e5c7fed2a3ac671dd542d579ca40b48b54
+LLAMA_VERSION?=45363632cbd593537d541e81b600242e0b3d47fc
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
```

backend/cpp/llama-cpp/grpc-server.cpp (15 additions, 91 deletions)

```diff
@@ -437,24 +437,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
             }
         }
 
-        // process files
-        mtmd::bitmaps bitmaps;
         const bool has_mtmd = ctx_server.mctx != nullptr;
-        {
-            if (!has_mtmd && !files.empty()) {
-                throw std::runtime_error("This server does not support multimodal");
-            }
-            for (auto & file : files) {
-                mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
-                if (!bmp.ptr) {
-                    throw std::runtime_error("Failed to load image/audio");
-                }
-                // calculate bitmap hash (for KV caching)
-                std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-                bmp.set_id(hash.c_str());
-                bitmaps.entries.push_back(std::move(bmp));
-            }
-        }
 
         // process prompt
         std::vector<server_tokens> inputs;
```
```diff
@@ -464,32 +447,10 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
         if (has_mtmd) {
             // multimodal
-            std::string prompt_str = prompt.get<std::string>();
-            mtmd_input_text inp_txt = {
-                prompt_str.c_str(),
-                /* add_special */   true,
-                /* parse_special */ true,
-            };
-            mtmd::input_chunks chunks(mtmd_input_chunks_init());
-            auto bitmaps_c_ptr = bitmaps.c_ptr();
-            int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
-                                              chunks.ptr.get(),
-                                              &inp_txt,
-                                              bitmaps_c_ptr.data(),
-                                              bitmaps_c_ptr.size());
-            if (tokenized != 0) {
-                throw std::runtime_error("Failed to tokenize prompt");
-            }
-
-            server_tokens tmp(chunks, true);
-            inputs.push_back(std::move(tmp));
+            inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
         } else {
-            // non-multimodal version
-            auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
-            for (auto & p : tokenized_prompts) {
-                auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
-                inputs.push_back(std::move(tmp));
-            }
+            // Everything else, including multimodal completions.
+            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
         }
 
         tasks.reserve(inputs.size());
```
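The inline multimodal preprocessing deleted above (bitmap loading, FNV hashing for KV caching, and mtmd_tokenize) is now delegated to llama.cpp's process_mtmd_prompt() helper. The helper's exact upstream signature is not shown in this diff; the following is only a sketch reconstructed from the deleted lines, assuming the server's existing headers and treating the element type of `files` (raw byte buffers) as an assumption.

```cpp
// Sketch only: reconstructed from the code this commit deletes. The real
// process_mtmd_prompt() ships with llama.cpp's server utilities and may differ;
// the `files` element type (raw byte buffers) is an assumption.
static server_tokens process_mtmd_prompt_sketch(mtmd_context * mctx,
                                                const std::string & prompt,
                                                const std::vector<std::vector<uint8_t>> & files) {
    // Load each image/audio payload into an mtmd bitmap and tag it with an
    // FNV hash so repeated media can be recognised for KV caching.
    mtmd::bitmaps bitmaps;
    for (const auto & file : files) {
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
        if (!bmp.ptr) {
            throw std::runtime_error("Failed to load image/audio");
        }
        std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
        bmp.set_id(hash.c_str());
        bitmaps.entries.push_back(std::move(bmp));
    }

    // Tokenize the text prompt together with the media chunks.
    mtmd_input_text inp_txt = {
        prompt.c_str(),
        /* add_special   */ true,
        /* parse_special */ true,
    };
    mtmd::input_chunks chunks(mtmd_input_chunks_init());
    auto bitmaps_c_ptr = bitmaps.c_ptr();
    if (mtmd_tokenize(mctx, chunks.ptr.get(), &inp_txt,
                      bitmaps_c_ptr.data(), bitmaps_c_ptr.size()) != 0) {
        throw std::runtime_error("Failed to tokenize prompt");
    }
    return server_tokens(chunks, /* has_mtmd */ true);
}
```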
```diff
@@ -630,23 +591,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
         }
 
         // process files
-        mtmd::bitmaps bitmaps;
         const bool has_mtmd = ctx_server.mctx != nullptr;
-        {
-            if (!has_mtmd && !files.empty()) {
-                throw std::runtime_error("This server does not support multimodal");
-            }
-            for (auto & file : files) {
-                mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
-                if (!bmp.ptr) {
-                    throw std::runtime_error("Failed to load image/audio");
-                }
-                // calculate bitmap hash (for KV caching)
-                std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-                bmp.set_id(hash.c_str());
-                bitmaps.entries.push_back(std::move(bmp));
-            }
-        }
 
         // process prompt
         std::vector<server_tokens> inputs;
```
```diff
@@ -657,33 +602,10 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
         if (has_mtmd) {
             // multimodal
-            std::string prompt_str = prompt.get<std::string>();
-            mtmd_input_text inp_txt = {
-                prompt_str.c_str(),
-                /* add_special */   true,
-                /* parse_special */ true,
-            };
-            mtmd::input_chunks chunks(mtmd_input_chunks_init());
-            auto bitmaps_c_ptr = bitmaps.c_ptr();
-            int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
-                                              chunks.ptr.get(),
-                                              &inp_txt,
-                                              bitmaps_c_ptr.data(),
-                                              bitmaps_c_ptr.size());
-            if (tokenized != 0) {
-                std::cout << "[PREDICT] Failed to tokenize prompt" << std::endl;
-                throw std::runtime_error("Failed to tokenize prompt");
-            }
-
-            server_tokens tmp(chunks, true);
-            inputs.push_back(std::move(tmp));
+            inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
         } else {
-            // non-multimodal version
-            auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
-            for (auto & p : tokenized_prompts) {
-                auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
-                inputs.push_back(std::move(tmp));
-            }
+            // Everything else, including multimodal completions.
+            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
         }
 
         tasks.reserve(inputs.size());
```
```diff
@@ -774,7 +696,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
         json prompt = body.at("prompt");
 
 
-        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
         for (const auto & tokens : tokenized_prompts) {
             // this check is necessary for models that do not add BOS token to the input
             if (tokens.empty()) {
```
```diff
@@ -793,7 +715,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
             task.id = ctx_server.queue_tasks.get_new_id();
             task.index = i;
-            task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
+            task.prompt_tokens = std::move(tokenized_prompts[i]);
 
             // OAI-compat
             task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
```
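After this bump, tokenize_input_prompts() takes the multimodal context and yields server_tokens directly, so callers no longer wrap each prompt in server_tokens(..., ctx_server.mctx != nullptr). Below is a condensed sketch of the resulting embedding-task loop, assembled from the hunks above; SERVER_TASK_TYPE_EMBEDDING and the surrounding declarations are assumed from llama.cpp's server code rather than shown in this diff.

```cpp
// Condensed sketch of the embeddings flow after this bump; not a verbatim copy
// of grpc-server.cpp. SERVER_TASK_TYPE_EMBEDDING is assumed from upstream llama.cpp.
auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx,
                                                prompt, /* add_special */ true, /* parse_special */ true);
std::vector<server_task> tasks;
tasks.reserve(tokenized_prompts.size());
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
    server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING);
    task.id    = ctx_server.queue_tasks.get_new_id();
    task.index = i;
    // tokenize_input_prompts() already returns server_tokens, so the old
    // server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr) wrapper is gone.
    task.prompt_tokens = std::move(tokenized_prompts[i]);
    task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
    tasks.push_back(std::move(task));
}
```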
```diff
@@ -849,8 +771,10 @@ class BackendServiceImpl final : public backend::Backend::Service {
         }
 
         // Tokenize the query
-        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, request->query(), /* add_special */ false, true)[0];
-
+        auto tokenized_query = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, request->query(), /* add_special */ false, true);
+        if (tokenized_query.size() != 1) {
+            return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"query\" must contain only a single prompt");
+        }
         // Create and queue the task
         json responses = json::array();
         bool error = false;
```
```diff
@@ -862,14 +786,14 @@ class BackendServiceImpl final : public backend::Backend::Service {
             documents.push_back(request->documents(i));
         }
 
-        auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
+        auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
         tasks.reserve(tokenized_docs.size());
         for (size_t i = 0; i < tokenized_docs.size(); i++) {
-            auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
+            auto tmp = format_rerank(ctx_server.vocab, tokenized_query[0], tokenized_docs[i]);
             server_task task = server_task(SERVER_TASK_TYPE_RERANK);
             task.id = ctx_server.queue_tasks.get_new_id();
             task.index = i;
-            task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
+            task.prompt_tokens = std::move(tmp);
             tasks.push_back(std::move(task));
         }
 
```
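The rerank path now tokenizes the query through the same multimodal-aware helper; because that helper can return several prompts, the new code rejects anything but a single query before pairing it with each document via format_rerank. A condensed sketch of the flow after this change, stitched together from the two hunks above (surrounding declarations such as documents and tasks are assumed, not shown here):

```cpp
// Condensed sketch of the rerank flow after this bump, stitched from the diff
// hunks above; `request`, `documents`, `tasks`, and `ctx_server` come from the
// surrounding gRPC handler and are assumed to be in scope.
auto tokenized_query = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx,
                                              request->query(), /* add_special */ false, true);
if (tokenized_query.size() != 1) {
    return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"query\" must contain only a single prompt");
}

auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx,
                                             documents, /* add_special */ false, true);
tasks.reserve(tokenized_docs.size());
for (size_t i = 0; i < tokenized_docs.size(); i++) {
    // Pair the single query with each document and hand the combined tokens to the task.
    auto tmp = format_rerank(ctx_server.vocab, tokenized_query[0], tokenized_docs[i]);
    server_task task = server_task(SERVER_TASK_TYPE_RERANK);
    task.id    = ctx_server.queue_tasks.get_new_id();
    task.index = i;
    task.prompt_tokens = std::move(tmp);
    tasks.push_back(std::move(task));
}
```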