@@ -78,45 +78,45 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
                 api_server_logger.error(err_msg)
                 return ErrorResponse(message=err_msg, code=400)
 
-        if request.user is not None:
-            request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}"
-        else:
-            request_id = f"chatcmpl-{uuid.uuid4()}"
-        api_server_logger.info(f"create chat completion request: {request_id}")
-        text_after_process = None
-        try:
-            current_req_dict = request.to_dict_for_infer(request_id)
-            current_req_dict["arrival_time"] = time.time()
-            prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
-            text_after_process = current_req_dict.get("text_after_process")
-            if isinstance(prompt_token_ids, np.ndarray):
-                prompt_token_ids = prompt_token_ids.tolist()
-        except Exception as e:
-            return ErrorResponse(code=400, message=str(e))
-
-        del current_req_dict
-
         try:
-            api_server_logger.debug(f"{self.engine_client.semaphore.status()}")
             if self.max_waiting_time < 0:
                 await self.engine_client.semaphore.acquire()
             else:
                 await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
-        except Exception:
-            return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
+            api_server_logger.debug(f"current waiting request {self.engine_client.semaphore.status()}")
 
-        if request.stream:
-            return self.chat_completion_stream_generator(
-                request, request_id, request.model, prompt_token_ids, text_after_process
-            )
-        else:
+            if request.user is not None:
+                request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}"
+            else:
+                request_id = f"chatcmpl-{uuid.uuid4()}"
+            api_server_logger.info(f"create chat completion request: {request_id}")
+            text_after_process = None
             try:
-                return await self.chat_completion_full_generator(
-                    request, request_id, request.model, prompt_token_ids, text_after_process
-                )
+                current_req_dict = request.to_dict_for_infer(request_id)
+                current_req_dict["arrival_time"] = time.time()
+                prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
+                text_after_process = current_req_dict.get("text_after_process")
+                if isinstance(prompt_token_ids, np.ndarray):
+                    prompt_token_ids = prompt_token_ids.tolist()
             except Exception as e:
                 return ErrorResponse(code=400, message=str(e))
 
+            del current_req_dict
+
+            if request.stream:
+                return self.chat_completion_stream_generator(
+                    request, request_id, request.model, prompt_token_ids, text_after_process
+                )
+            else:
+                try:
+                    return await self.chat_completion_full_generator(
+                        request, request_id, request.model, prompt_token_ids, text_after_process
+                    )
+                except Exception as e:
+                    return ErrorResponse(code=400, message=str(e))
+        except Exception:
+            return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
+
     def _create_streaming_error_response(self, message: str) -> str:
         error_response = ErrorResponse(
             code=400,
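The net effect of this hunk is that request parsing, tokenization, and dispatch now run only after a concurrency slot has been acquired from the engine semaphore, and a request that waits in the queue longer than max_waiting_time is rejected with a 408-style error. Below is a minimal, self-contained sketch of that admission pattern; the names (handle_request, slots, do_inference) are illustrative assumptions, not the FastDeploy implementation.

import asyncio


async def do_inference() -> str:
    # Stand-in for the real per-request work (tokenize, schedule, generate).
    await asyncio.sleep(0.1)
    return "200: ok"


async def handle_request(slots: asyncio.Semaphore, max_waiting_time: float) -> str:
    try:
        if max_waiting_time < 0:
            # A negative value means "wait indefinitely" for a free slot.
            await slots.acquire()
        else:
            # Give up if no slot frees up within the allowed queueing time.
            await asyncio.wait_for(slots.acquire(), timeout=max_waiting_time)
    except asyncio.TimeoutError:
        return "408: request queued too long"

    try:
        # All per-request work happens while the slot is held, so every
        # failure path still releases it.
        return await do_inference()
    finally:
        slots.release()


async def main() -> None:
    slots = asyncio.Semaphore(2)
    # Four concurrent requests against two slots and a 0.05 s queue budget:
    # two are served, the two queued requests time out with 408.
    results = await asyncio.gather(*(handle_request(slots, 0.05) for _ in range(4)))
    print(results)


if __name__ == "__main__":
    asyncio.run(main())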