@@ -4228,56 +4228,15 @@ int main(int argc, char ** argv) {
             // TODO: this log can become very long, put it behind a flag or think about a more compact format
             // SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());

-            // process files
-            mtmd::bitmaps bitmaps;
-            const bool has_mtmd = ctx_server.mctx != nullptr;
-            {
-                if (!has_mtmd && !files.empty()) {
-                    throw std::runtime_error("This server does not support multimodal");
-                }
-                for (auto & file : files) {
-                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
-                    if (!bmp.ptr) {
-                        throw std::runtime_error("Failed to load image or audio file");
-                    }
-                    // calculate bitmap hash (for KV caching)
-                    std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-                    bmp.set_id(hash.c_str());
-                    bitmaps.entries.push_back(std::move(bmp));
-                }
-            }
-
             // process prompt
             std::vector<server_tokens> inputs;

-            if (oaicompat && has_mtmd) {
-                // multimodal
-                std::string prompt_str = prompt.get<std::string>();
-                mtmd_input_text inp_txt = {
-                    prompt_str.c_str(),
-                    /* add_special */   true,
-                    /* parse_special */ true,
-                };
-                mtmd::input_chunks chunks(mtmd_input_chunks_init());
-                auto bitmaps_c_ptr = bitmaps.c_ptr();
-                int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
-                                                  chunks.ptr.get(),
-                                                  &inp_txt,
-                                                  bitmaps_c_ptr.data(),
-                                                  bitmaps_c_ptr.size());
-                if (tokenized != 0) {
-                    throw std::runtime_error("Failed to tokenize prompt");
-                }
-
-                server_tokens tmp(chunks, true);
-                inputs.push_back(std::move(tmp));
+            if (oaicompat && ctx_server.mctx != nullptr) {
+                // This is the case used by the OAI-compatible chat path with MTMD. TODO: it can be moved to the path below.
+                inputs.push_back(std::move(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files)));
             } else {
-                // non-multimodal version
-                auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
-                for (auto & p : tokenized_prompts) {
-                    auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
-                    inputs.push_back(std::move(tmp));
-                }
+                // Everything else, including multimodal completions.
+                inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
             }

             tasks.reserve(inputs.size());
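The removed file-loading and MTMD tokenization logic is now hidden behind `process_mtmd_prompt()`. For reference, a minimal sketch of such a helper, reconstructed from the removed lines above; the actual helper introduced by this change may differ in signature and location:

```cpp
// Hypothetical consolidation of the removed logic; names are taken from the call
// site above, everything else is a reconstruction, not the verified implementation.
static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, const std::vector<raw_buffer> & files) {
    // load each uploaded image/audio file and hash it (the hash is used for KV caching)
    mtmd::bitmaps bitmaps;
    for (auto & file : files) {
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
        if (!bmp.ptr) {
            throw std::runtime_error("Failed to load image or audio file");
        }
        std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
        bmp.set_id(hash.c_str());
        bitmaps.entries.push_back(std::move(bmp));
    }

    // tokenize the text prompt together with the media bitmaps
    mtmd_input_text inp_txt = {
        prompt.c_str(),
        /* add_special   */ true,
        /* parse_special */ true,
    };
    mtmd::input_chunks chunks(mtmd_input_chunks_init());
    auto bitmaps_c_ptr = bitmaps.c_ptr();
    if (mtmd_tokenize(mctx, chunks.ptr.get(), &inp_txt, bitmaps_c_ptr.data(), bitmaps_c_ptr.size()) != 0) {
        throw std::runtime_error("Failed to tokenize prompt");
    }

    // wrap the resulting chunks in server_tokens (has_mtmd = true)
    return server_tokens(chunks, true);
}
```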
@@ -4369,7 +4328,12 @@ int main(int argc, char ** argv) {
 
     const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
         json data = json::parse(req.body);
-        std::vector<raw_buffer> files; // dummy
+        std::vector<raw_buffer> files;
+        if (data.find("multimodal_data") != data.end()) {
+            for (const auto & entry : data.at("multimodal_data")) {
+                files.push_back(base64_decode(entry));
+            }
+        }
         handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
             data,
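With `files` no longer a dummy, the plain completions endpoint can receive media directly in the request body as base64 strings. A sketch of the payload shape implied by the handler above; only the `multimodal_data` field name comes from the code, the prompt wording and helper function are illustrative assumptions:

```cpp
// Client-side sketch: "multimodal_data" is an array of base64-encoded media buffers
// that the handler above decodes into `files` via base64_decode().
#include <nlohmann/json.hpp>
#include <string>

using json = nlohmann::ordered_json;

json make_completion_request(const std::string & prompt, const std::string & image_base64) {
    return json {
        {"prompt",          prompt},
        {"multimodal_data", json::array({image_base64})},
    };
}
```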
@@ -4446,7 +4410,7 @@ int main(int argc, char ** argv) {
         data["input_extra"] = input_extra; // default to empty array if it's not exist

         std::string prompt = json_value(data, "prompt", std::string());
-        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
+        std::vector<server_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true);
         SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
         data["prompt"] = format_infill(
             ctx_server.vocab,
@@ -4457,7 +4421,7 @@ int main(int argc, char ** argv) {
             ctx_server.params_base.n_predict,
             ctx_server.slots[0].n_ctx, // TODO: there should be a better way
             ctx_server.params_base.spm_infill,
-            tokenized_prompts[0]
+            tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal.
         );

         std::vector<raw_buffer> files; // dummy
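The call sites in this diff imply that `tokenize_input_prompts()` now takes the multimodal context and returns `server_tokens` instead of raw `llama_tokens`. A sketch of the presumed declaration, inferred from the call sites only; parameter names and exact types are assumptions:

```cpp
// Presumed shape after this change: when mctx is non-null the returned entries can
// carry multimodal chunks, otherwise they wrap plain text tokens. get_text_tokens()
// (used by the infill path above) would then expose the text-only token sequence.
std::vector<server_tokens> tokenize_input_prompts(
        const llama_vocab * vocab,
        mtmd_context      * mctx,
        const json        & json_prompt,
        bool                add_special,
        bool                parse_special);
```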
@@ -4635,7 +4599,7 @@ int main(int argc, char ** argv) {
             }
         }

-        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
         for (const auto & tokens : tokenized_prompts) {
             // this check is necessary for models that do not add BOS token to the input
             if (tokens.empty()) {
@@ -4663,7 +4627,7 @@ int main(int argc, char ** argv) {
 
                 task.id            = ctx_server.queue_tasks.get_new_id();
                 task.index         = i;
-                task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
+                task.prompt_tokens = std::move(tokenized_prompts[i]);

                 // OAI-compat
                 task.params.oaicompat = oaicompat;
@@ -4750,22 +4714,22 @@ int main(int argc, char ** argv) {
             return;
         }

-        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
+        server_tokens tokenized_query = std::move(tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true)[0]);

         // create and queue the task
         json responses = json::array();
         bool error = false;
         std::unordered_set<int> task_ids;
         {
             std::vector<server_task> tasks;
-            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
+            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
             tasks.reserve(tokenized_docs.size());
             for (size_t i = 0; i < tokenized_docs.size(); i++) {
                 auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
                 server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                 task.id            = ctx_server.queue_tasks.get_new_id();
                 task.index         = i;
-                task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
+                task.prompt_tokens = std::move(tmp);
                 tasks.push_back(std::move(task));
             }
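Since its result is now moved straight into `task.prompt_tokens`, `format_rerank()` presumably operates on `server_tokens` for both query and documents. A declaration sketch inferred from the call above, not the verified signature:

```cpp
// Assumed shape: builds the rerank prompt (query + document plus the required
// special tokens) directly as server_tokens instead of llama_tokens.
server_tokens format_rerank(const llama_vocab   * vocab,
                            const server_tokens & query,
                            const server_tokens & doc);
```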