@@ -527,14 +527,6 @@ struct llama_server_context
         slot_params default_params;
         llama_sampling_params default_sparams;
 
-        if (data.count("__oaicompat") != 0) {
-            slot->oaicompat = true;
-            slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-        } else {
-            slot->oaicompat = false;
-            slot->oaicompat_model = "";
-        }
-
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@@ -2032,9 +2024,9 @@ static void params_parse(const backend::ModelOptions* request,
         std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
         std::vector<std::string> split_arg{ it, {} };
 
-        GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+        GGML_ASSERT(split_arg.size() <= llama_max_devices());
 
-        for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
+        for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
             if (i_device < split_arg.size()) {
                 params.tensor_split[i_device] = std::stof(split_arg[i_device]);
             }
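
The hunk above tracks an upstream llama.cpp API change: the compile-time LLAMA_MAX_DEVICES constant gives way to the runtime llama_max_devices() call, so the device count is queried when the process runs rather than fixed at build time. A minimal sketch of the same tensor-split parsing pattern; the helper name parse_tensor_split is purely illustrative and not part of this patch:

    #include <regex>
    #include <string>
    #include <vector>
    #include "llama.h"

    // Illustrative only: split a comma/slash-separated tensor-split string and
    // size the result by the runtime device count, mirroring the hunk above.
    static std::vector<float> parse_tensor_split(const std::string &arg_next) {
        const std::regex regex{ R"([,/]+)" };
        std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
        std::vector<std::string> split_arg{ it, {} };

        // llama_max_devices() replaces the old compile-time LLAMA_MAX_DEVICES.
        std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        for (size_t i_device = 0; i_device < tensor_split.size(); ++i_device) {
            if (i_device < split_arg.size()) {
                tensor_split[i_device] = std::stof(split_arg[i_device]);
            }
        }
        return tensor_split;
    }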
@@ -2116,10 +2108,12 @@ class BackendServiceImpl final : public backend::Backend::Service {
     }
     grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
         json data = parse_options(true, request, llama);
-        const int task_id = llama.request_completion(data, false, false, -1);
+        const int task_id = llama.queue_tasks.get_new_id();
+        llama.queue_results.add_waiting_task_id(task_id);
+        llama.request_completion(task_id, data, false, false, -1);
         while (true)
         {
-            task_result result = llama.next_result(task_id);
+            task_result result = llama.queue_results.recv(task_id);
             if (!result.error) {
                 const std::string str =
                     "data: " +
@@ -2152,9 +2146,11 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
     grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
         json data = parse_options(false, request, llama);
-        const int task_id = llama.request_completion(data, false, false, -1);
+        const int task_id = llama.queue_tasks.get_new_id();
+        llama.queue_results.add_waiting_task_id(task_id);
+        llama.request_completion(task_id, data, false, false, -1);
         std::string completion_text;
-        task_result result = llama.next_result(task_id);
+        task_result result = llama.queue_results.recv(task_id);
         if (!result.error && result.stop) {
             completion_text = result.result_json.value("content", "");
             reply->set_message(completion_text);
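
Both Predict and PredictStream now follow the same task-queue handshake instead of the old request_completion()/next_result() pair: reserve a task id, register it with the result queue, submit the completion, then block on recv(). A condensed sketch of that flow under the assumptions visible in this diff; the helper name run_completion and its early-return error handling are illustrative, not part of the patch:

    // Sketch of the new request lifecycle, assuming the llama_server_context
    // members shown above (queue_tasks, queue_results, and request_completion
    // taking an explicit task id). run_completion is a hypothetical helper.
    static std::string run_completion(llama_server_context &llama, const json &data) {
        const int task_id = llama.queue_tasks.get_new_id();    // reserve an id up front
        llama.queue_results.add_waiting_task_id(task_id);      // register interest in its result
        llama.request_completion(task_id, data, false, false, -1);

        task_result result = llama.queue_results.recv(task_id); // blocks until a result arrives
        if (result.error || !result.stop) {
            return "";                                           // illustrative error handling only
        }
        return result.result_json.value("content", "");
    }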