@@ -437,24 +437,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
         }
     }
 
-    // process files
-    mtmd::bitmaps bitmaps;
     const bool has_mtmd = ctx_server.mctx != nullptr;
-    {
-        if (!has_mtmd && !files.empty()) {
-            throw std::runtime_error("This server does not support multimodal");
-        }
-        for (auto & file : files) {
-            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
-            if (!bmp.ptr) {
-                throw std::runtime_error("Failed to load image/audio");
-            }
-            // calculate bitmap hash (for KV caching)
-            std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-            bmp.set_id(hash.c_str());
-            bitmaps.entries.push_back(std::move(bmp));
-        }
-    }
 
     // process prompt
     std::vector<server_tokens> inputs;
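The deleted block's hashing step is worth a gloss: each image/audio buffer gets a content hash as its bitmap ID so the KV cache can recognize the same media across requests. A minimal sketch of what a helper like `fnv_hash` computes here (64-bit FNV-1a over the raw bytes, returned as a decimal string; the exact upstream implementation may differ in details):

```cpp
#include <cstddef>
#include <cstdint>
#include <string>

// 64-bit FNV-1a over a byte buffer, rendered as a decimal string so it can
// serve as a stable bitmap ID (sketch, assuming the llama.cpp server convention).
static std::string fnv_hash(const uint8_t * data, size_t len) {
    uint64_t hash = 0xcbf29ce484222325ULL; // FNV offset basis
    for (size_t i = 0; i < len; ++i) {
        hash ^= data[i];                   // mix in one byte
        hash *= 0x100000001b3ULL;          // FNV prime
    }
    return std::to_string(hash);
}
```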
@@ -464,32 +447,10 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
     if (has_mtmd) {
         // multimodal
-        std::string prompt_str = prompt.get<std::string>();
-        mtmd_input_text inp_txt = {
-            prompt_str.c_str(),
-            /* add_special   */ true,
-            /* parse_special */ true,
-        };
-        mtmd::input_chunks chunks(mtmd_input_chunks_init());
-        auto bitmaps_c_ptr = bitmaps.c_ptr();
-        int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
-                                          chunks.ptr.get(),
-                                          &inp_txt,
-                                          bitmaps_c_ptr.data(),
-                                          bitmaps_c_ptr.size());
-        if (tokenized != 0) {
-            throw std::runtime_error("Failed to tokenize prompt");
-        }
-
-        server_tokens tmp(chunks, true);
-        inputs.push_back(std::move(tmp));
+        inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
     } else {
-        // non-multimodal version
-        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
-        for (auto & p : tokenized_prompts) {
-            auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
-            inputs.push_back(std::move(tmp));
-        }
+        // Everything else, including multimodal completions.
+        inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
     }
 
     tasks.reserve(inputs.size());
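Both branches now collapse into single calls. Judging from the removed lines, `process_mtmd_prompt` presumably just merges the former file-loading and `mtmd_tokenize` steps into one helper; a sketch under that assumption follows (the signature is inferred from the call site, and the `files` element type is assumed to be a raw byte buffer). Note that the old `!has_mtmd && !files.empty()` rejection no longer appears at the call sites, so presumably the helper or its caller now owns that check.

```cpp
// Sketch reconstructed from the removed inline code; not the verbatim helper.
static server_tokens process_mtmd_prompt(mtmd_context * mctx,
                                         const std::string & prompt,
                                         const std::vector<std::vector<uint8_t>> & files) {
    // Load every attached image/audio buffer and give it a content-hash ID
    // so the KV cache can recognize repeated media.
    mtmd::bitmaps bitmaps;
    for (const auto & file : files) {
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
        if (!bmp.ptr) {
            throw std::runtime_error("Failed to load image/audio");
        }
        std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
        bmp.set_id(hash.c_str());
        bitmaps.entries.push_back(std::move(bmp));
    }

    // Tokenize the text and the media together into multimodal chunks.
    mtmd_input_text inp_txt = {
        prompt.c_str(),
        /* add_special   */ true,
        /* parse_special */ true,
    };
    mtmd::input_chunks chunks(mtmd_input_chunks_init());
    auto bitmaps_c_ptr = bitmaps.c_ptr();
    if (mtmd_tokenize(mctx, chunks.ptr.get(), &inp_txt,
                      bitmaps_c_ptr.data(), bitmaps_c_ptr.size()) != 0) {
        throw std::runtime_error("Failed to tokenize prompt");
    }
    return server_tokens(chunks, /* has_mtmd */ true);
}
```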
@@ -630,23 +591,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
     }
 
     // process files
-    mtmd::bitmaps bitmaps;
     const bool has_mtmd = ctx_server.mctx != nullptr;
-    {
-        if (!has_mtmd && !files.empty()) {
-            throw std::runtime_error("This server does not support multimodal");
-        }
-        for (auto & file : files) {
-            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
-            if (!bmp.ptr) {
-                throw std::runtime_error("Failed to load image/audio");
-            }
-            // calculate bitmap hash (for KV caching)
-            std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-            bmp.set_id(hash.c_str());
-            bitmaps.entries.push_back(std::move(bmp));
-        }
-    }
 
     // process prompt
     std::vector<server_tokens> inputs;
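The counterpart to `process_mtmd_prompt` is the new `tokenize_input_prompts` overload that takes the multimodal context. The call sites show it returning `std::vector<server_tokens>` directly, which is what lets the `else` branches assign straight into `inputs` and the embedding/rerank paths move the results. A plausible shape, assuming it simply wraps the previous vocab-only overload:

```cpp
// Sketch inferred from the new call sites; the wrapped 4-argument overload
// is assumed to be the pre-existing tokenizer returning plain llama_tokens.
static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab * vocab,
                                                         mtmd_context * mctx,
                                                         const json & prompt,
                                                         bool add_special,
                                                         bool parse_special) {
    std::vector<server_tokens> result;
    for (auto & p : tokenize_input_prompts(vocab, prompt, add_special, parse_special)) {
        // has_mtmd mirrors the removed per-call-site flag: mctx != nullptr.
        result.emplace_back(p, mctx != nullptr);
    }
    return result;
}
```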
@@ -657,33 +602,10 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
     if (has_mtmd) {
         // multimodal
-        std::string prompt_str = prompt.get<std::string>();
-        mtmd_input_text inp_txt = {
-            prompt_str.c_str(),
-            /* add_special   */ true,
-            /* parse_special */ true,
-        };
-        mtmd::input_chunks chunks(mtmd_input_chunks_init());
-        auto bitmaps_c_ptr = bitmaps.c_ptr();
-        int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
-                                          chunks.ptr.get(),
-                                          &inp_txt,
-                                          bitmaps_c_ptr.data(),
-                                          bitmaps_c_ptr.size());
-        if (tokenized != 0) {
-            std::cout << "[PREDICT] Failed to tokenize prompt" << std::endl;
-            throw std::runtime_error("Failed to tokenize prompt");
-        }
-
-        server_tokens tmp(chunks, true);
-        inputs.push_back(std::move(tmp));
+        inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
     } else {
-        // non-multimodal version
-        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
-        for (auto & p : tokenized_prompts) {
-            auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
-            inputs.push_back(std::move(tmp));
-        }
+        // Everything else, including multimodal completions.
+        inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
     }
 
     tasks.reserve(inputs.size());
@@ -774,7 +696,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
     json prompt = body.at("prompt");
 
 
-    auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+    auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
     for (const auto & tokens : tokenized_prompts) {
         // this check is necessary for models that do not add BOS token to the input
         if (tokens.empty()) {
@@ -793,7 +715,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
     task.id = ctx_server.queue_tasks.get_new_id();
     task.index = i;
-    task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
+    task.prompt_tokens = std::move(tokenized_prompts[i]);
 
     // OAI-compat
     task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
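Since `tokenize_input_prompts` now hands back `server_tokens` directly, the embedding path moves each element into its task instead of re-wrapping a raw token list. The move also matters semantically: `server_tokens` owns per-chunk multimodal state and, in the llama.cpp server code, is movable but not copyable, so each `tokenized_prompts[i]` is consumed exactly once and must not be touched again after this line.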
@@ -849,8 +771,10 @@ class BackendServiceImpl final : public backend::Backend::Service {
     }
 
     // Tokenize the query
-    llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, request->query(), /* add_special */ false, true)[0];
-
+    auto tokenized_query = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, request->query(), /* add_special */ false, true);
+    if (tokenized_query.size() != 1) {
+        return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"query\" must contain only a single prompt");
+    }
     // Create and queue the task
     json responses = json::array();
     bool error = false;
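A note on the new guard: `tokenize_input_prompts` splits a JSON array into one prompt per element, so an unusual query could tokenize to zero or several prompts. The old code silently took element `[0]` of whatever came back; the refactor instead rejects anything but exactly one tokenized query with `INVALID_ARGUMENT`.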
@@ -862,14 +786,14 @@ class BackendServiceImpl final : public backend::Backend::Service {
         documents.push_back(request->documents(i));
     }
 
-    auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
+    auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
     tasks.reserve(tokenized_docs.size());
     for (size_t i = 0; i < tokenized_docs.size(); i++) {
-        auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
+        auto tmp = format_rerank(ctx_server.vocab, tokenized_query[0], tokenized_docs[i]);
         server_task task = server_task(SERVER_TASK_TYPE_RERANK);
         task.id = ctx_server.queue_tasks.get_new_id();
         task.index = i;
-        task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
+        task.prompt_tokens = std::move(tmp);
         tasks.push_back(std::move(task));
    }
 
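`format_rerank` itself now returns a value that can be moved straight into `task.prompt_tokens`. For orientation, the historical llama.cpp helper laid the pair out as `[BOS] query [EOS] [SEP] doc [EOS]`; a token-level sketch of that layout (the refactored helper presumably does the same over `server_tokens`, and the name below is illustrative, not the real symbol):

```cpp
// Illustrative token-level layout for a rerank prompt; the helper in this
// diff works on server_tokens, but the shape should be the same.
static llama_tokens format_rerank_tokens(const llama_vocab * vocab,
                                         const llama_tokens & query,
                                         const llama_tokens & doc) {
    llama_tokens result;
    result.reserve(query.size() + doc.size() + 4);
    result.push_back(llama_vocab_bos(vocab));
    result.insert(result.end(), query.begin(), query.end());
    result.push_back(llama_vocab_eos(vocab));
    result.push_back(llama_vocab_sep(vocab));
    result.insert(result.end(), doc.begin(), doc.end());
    result.push_back(llama_vocab_eos(vocab));
    return result;
}
```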