Commit 98e03fb
[feat] add metrics for yiyan adapter (#3219) (#3614)
* [feat] add metrics for yiyan adapter
* [fix] fix metrics num_requests_waiting and num_requests_running
* [fix] fix metrics gpu_cache_usage_perc
* [refactor] change where requests_number increases
* [chore] rename xxx_block_num as xxx_gpu_block_num, and update their values accordingly
* [chore] delete useless code
1 parent fe5d09f commit 98e03fb

File tree

7 files changed: +180 -17 lines changed

fastdeploy/cache_manager/prefix_cache_manager.py

Lines changed: 12 additions & 0 deletions

@@ -32,6 +32,7 @@
 from fastdeploy.cache_manager.cache_data import BlockNode, CacheStatus
 from fastdeploy.cache_manager.cache_metrics import CacheMetrics
 from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal
+from fastdeploy.metrics.metrics import main_process_metrics
 from fastdeploy.utils import get_logger

 logger = get_logger("prefix_cache_manager", "prefix_cache_manager.log")
@@ -110,6 +111,10 @@ def __init__(
             + f"{self.num_cpu_blocks}, bytes_per_layer_per_block {self.cache_config.bytes_per_layer_per_block}"
         )

+    @property
+    def available_gpu_resource(self):
+        return len(self.gpu_free_block_list) / self.num_gpu_blocks if self.num_gpu_blocks > 0 else 0.0
+
     def launch_cache_manager(
         self,
         cache_config,
@@ -229,6 +234,9 @@ def update_cache_config(self, cache_config):
         heapq.heapify(self.gpu_free_block_list)
         self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))

+        main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
+        main_process_metrics.available_gpu_resource.set(1.0)
+
     def _enable_cpu_cache(self):
         """
         _enable_cpu_cache function used to enable cpu cache.
@@ -264,6 +272,8 @@ def allocate_gpu_blocks(self, num_blocks):
         logger.info(
             f"allocate_gpu_blocks: {allocated_block_ids}, len(self.gpu_free_block_list) {len(self.gpu_free_block_list)}"
         )
+        main_process_metrics.free_gpu_block_num.set(len(self.gpu_free_block_list))
+        main_process_metrics.available_gpu_resource.set(self.available_gpu_resource)
         return allocated_block_ids

     def recycle_gpu_blocks(self, gpu_block_ids):
@@ -278,6 +288,8 @@ def recycle_gpu_blocks(self, gpu_block_ids):
             heapq.heappush(self.gpu_free_block_list, gpu_block_id)
         else:
             heapq.heappush(self.gpu_free_block_list, gpu_block_ids)
+        main_process_metrics.free_gpu_block_num.set(len(self.gpu_free_block_list))
+        main_process_metrics.available_gpu_resource.set(self.available_gpu_resource)

     def allocate_cpu_blocks(self, num_blocks):
         """

fastdeploy/engine/common_engine.py

Lines changed: 3 additions & 0 deletions

@@ -552,6 +552,8 @@ def _fetch_request():
             get_request_pool.submit(_fetch_request)
             # 2. Schedule requests
             tasks = self.resource_manager.schedule()
+            main_process_metrics.num_requests_waiting.dec(len(tasks))
+            main_process_metrics.num_requests_running.inc(len(tasks))
             # 3. Send to engine
             if tasks:
                 self.resource_manager.get_real_bsz()
@@ -597,6 +599,7 @@ def _insert_zmq_task_to_scheduler(self):
            try:
                request = Request.from_dict(data)
                start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER)
+               main_process_metrics.requests_number.inc()
                llm_logger.debug(f"Receive request: {request}")
            except Exception as e:
                llm_logger.error(f"Receive request error: {e}, {traceback.format_exc()!s}")

fastdeploy/engine/resource_manager.py

Lines changed: 33 additions & 16 deletions

@@ -51,14 +51,15 @@ def __init__(
         """
         self.cfg = config.cache_config
         self.max_num_seqs = max_num_seqs
-        self.stop_flags = [True] * max_num_seqs
+        self.stop_flags = [True] * max_num_seqs  # flag set to True if the slot has not been taken
         self.enable_prefix_cache = config.cache_config.enable_prefix_caching
         self.cache_manager = PrefixCacheManager(config, tensor_parallel_size, splitwise_role, local_data_parallel_id)
-        self.tasks_list = [None] * max_num_seqs
+        self.tasks_list = [None] * max_num_seqs  # task slots
         self.req_dict = dict()
         # current batch status of the engine
         self.real_bsz = 0
         llm_logger.info(f"{self.info()}")
+        main_process_metrics.max_batch_size.set(max_num_seqs)

     def reset_cache_config(self, cfg):
         """
@@ -222,18 +223,18 @@ def allocate_resources_for_new_tasks(self, tasks):
         Returns:
             list: processed task list
         """
-
-        allocated_position = 0
-        processing_task_index = 0
+        llm_logger.debug(f"Allocating resources for a batch of new tasks: {tasks}")
+        allocated_position = 0  # number of tasks that have been allocated, also the position in request slots
+        processing_task_index = 0  # current task
         processed_tasks = list()
-        while allocated_position < self.max_num_seqs:
-            if processing_task_index >= len(tasks):
+        while allocated_position < self.max_num_seqs:  # loop until all tasks have been allocated resources
+            if processing_task_index >= len(tasks):  # if all tasks have been tried, don't give a second chance
                 break

             can_insert = False
             while allocated_position < self.max_num_seqs:
                 if sum(self.stop_flags[allocated_position : allocated_position + 1]) == 1:
-                    can_insert = True
+                    can_insert = True  # if there is an empty slot, try to allocate resources for the current task
                     break
                 allocated_position += 1
             if can_insert:
@@ -243,7 +244,8 @@ def allocate_resources_for_new_tasks(self, tasks):
                 task.set("seed", random.randint(0, 9223372036854775807))
                 task.idx = allocated_position

-                if self.enable_prefix_cache:
+                if self.enable_prefix_cache:  # if prefix caching is enabled
+                    # 1. request enough blocks for the current task
                     cache_prepare_time = time.time()
                     common_block_ids, unique_block_ids, hit_info = self.cache_manager.request_block_ids(
                         task,
@@ -253,39 +255,42 @@ def allocate_resources_for_new_tasks(self, tasks):
                     if unique_block_ids is None:
                         llm_logger.warning("req_id: {0} not enough blocks available".format(task["req_id"]))
                         return
-
+                    # 2. record cache hit information, and return the number of tokens already in cache
                     cached_len = self._record_request_cache_info(task, common_block_ids, unique_block_ids, hit_info)
                     task.cache_prepare_time = time.time() - cache_prepare_time
-
+                    # 3. if prefill/decode disaggregation is enabled
                     if task.disaggregate_info is not None:
                         if task.disaggregate_info["role"] == "prefill":
+                            # record the slot position for the current task, indexed by request id
                             self.req_dict[task.request_id] = allocated_position
                             task.disaggregate_info["block_tables"] = task.block_tables
                             self._delete_cached_data(task, cached_len)
                         elif task.disaggregate_info["role"] == "decode":
                             self.req_dict[task.request_id] = allocated_position
                             task.disaggregate_info["block_tables"] = task.need_block_tables
                     else:
+                        # remove cached tokens from prompt token ids to avoid kv recomputation
                         self._delete_cached_data(task, cached_len)

-                else:
+                else:  # if prefix caching is disabled
+                    # 1. directly allocate empty blocks from the cache, if there are any
                     block_tables = self._get_block_tables(task.prompt_token_ids_len)
                     if not block_tables:
                         llm_logger.error(f"req_id: {task.request_id} block_tables is empty")
-                        continue
+                        continue  # retry
                     else:
                         task.block_tables = block_tables
                         task.need_block_tables = task.block_tables
-
+                    # 2. if prefill/decode disaggregation is enabled
                     if task.disaggregate_info is not None:
                         task.disaggregate_info["block_tables"] = block_tables
                         if task.disaggregate_info["role"] == "prefill":
                             self.req_dict[task.request_id] = allocated_position
                         elif task.disaggregate_info["role"] == "decode":
                             self.req_dict[task.request_id] = allocated_position

-                processed_tasks.append(task)
-                self.stop_flags[allocated_position] = False
+                processed_tasks.append(task)  # add the current task
+                self.stop_flags[allocated_position] = False  # mark the slot as occupied
                 task.inference_start_time = time.time()
                 task.inference_time_cost = -1.0
                 task.tokens_all_num = 0
@@ -299,11 +304,18 @@ def allocate_resources_for_new_tasks(self, tasks):
             processing_task_index += 1

         # batch size when the statistical engine is inferring
+        # determine batch size by index of the first slot that is not occupied
         for i in range(self.max_num_seqs - 1, -1, -1):
             if not self.stop_flags[i]:
                 self.real_bsz = i + 1
                 break

+        # record block usage and batch size metrics here
+        task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
+        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
+        main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
+        main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
+
         llm_logger.info(
             f"Number of allocated requests: {len(tasks)}, number of " f"running requests in worker: {self.real_bsz}"
         )
@@ -335,6 +347,11 @@ def _record_request_cache_info(self, task, common_block_ids, unique_block_ids, hit_info):
         task.cpu_cache_token_num = hit_info["cpu_cache_blocks"] * self.cfg.block_size
         task.cache_info = (cache_block_num, no_cache_block_num)

+        # Report the number of cached tokens to Prometheus metrics
+        main_process_metrics.prefix_cache_token_num.inc(task.num_cached_tokens)
+        main_process_metrics.prefix_gpu_cache_token_num.inc(task.gpu_cache_token_num)
+        main_process_metrics.prefix_cpu_cache_token_num.inc(task.cpu_cache_token_num)
+
         cached_len = len(common_block_ids) * self.cfg.block_size
         task.block_tables = common_block_ids + unique_block_ids
         task.need_block_tables = unique_block_ids
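
Both of the new metric blocks in this file derive their values from slot state the resource manager already keeps: tasks_list holds the task occupying each slot (or None), stop_flags marks free slots, and each task carries the block_tables it owns. The sketch below shows one way such batch-level gauges could be computed from that state; the helper name, the usage formula, and the assumption that tasks expose block_tables are illustrative, not the project's interface.

# Sketch of deriving batch-level gauges from slot state (assumed shapes).
from prometheus_client import Gauge

available_gpu_block_num = Gauge("available_gpu_block_num", "GPU blocks not owned by any running task")
batch_size = Gauge("batch_size", "Number of occupied slots")
gpu_cache_usage_perc = Gauge("gpu_cache_usage_perc", "Fraction of GPU KV-cache blocks in use")

def report_batch_metrics(tasks_list, stop_flags, total_block_num: int) -> None:
    # Blocks currently pinned by scheduled tasks (each task carries its block_tables).
    used_blocks = sum(len(task.block_tables) if task else 0 for task in tasks_list)
    occupied_slots = sum(1 for is_free in stop_flags if not is_free)

    available_gpu_block_num.set(total_block_num - used_blocks)
    batch_size.set(occupied_slots)
    # Assumed definition of usage: blocks in use over total blocks, guarding a zero-sized pool.
    gpu_cache_usage_perc.set(used_blocks / total_block_num if total_block_num > 0 else 0.0)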

fastdeploy/engine/sched/resource_manager_v1.py

Lines changed: 18 additions & 1 deletion

@@ -28,6 +28,7 @@

 from fastdeploy.engine.request import Request, RequestStatus, RequestType
 from fastdeploy.engine.resource_manager import ResourceManager
+from fastdeploy.metrics.metrics import main_process_metrics
 from fastdeploy.utils import llm_logger


@@ -77,6 +78,7 @@ def __init__(self, max_num_seqs, config, tensor_parallel_size, splitwise_role, local_data_parallel_id):
         self.finish_execution_pool = ThreadPoolExecutor(max_workers=1)
         self.lock = threading.Lock()
         self.to_be_rescheduled_request_id_set = set()
+        main_process_metrics.max_batch_size.set(max_num_seqs)

     def allocated_slots(self, request: Request):
         return len(request.block_tables) * self.config.cache_config.block_size
@@ -107,6 +109,9 @@ def reschedule_preempt_task(self, request_id):
             self.to_be_rescheduled_request_id_set.remove(request_id)

     def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_reqs):
+        """
+        If the request cannot be scheduled, preempt running requests one by one until it can be scheduled. Last in, first out.
+        """
         can_schedule = True
         while True:
             if not self.cache_manager.can_allocate_gpu_blocks(num_new_blocks):
@@ -244,6 +249,9 @@ def exist_prefill(self, scheduled_reqs):
         return False

     def schedule(self):
+        """
+        Try to pull a batch of requests from the waiting queue and schedule them.
+        """
         with self.lock:
             scheduled_reqs: list[Request] = []
             preempted_reqs: list[Request] = []
@@ -305,7 +313,7 @@ def schedule(self):
                         request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
                         # Prepare prefill task
                         scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
-                    else:
+                    else:  # Not enough blocks to allocate, trigger preemption
                         can_schedule = self._trigger_preempt(request, num_new_block, preempted_reqs, scheduled_reqs)
                         if not can_schedule:
                             break
@@ -371,6 +379,10 @@ def schedule(self):
                 else:
                     llm_logger.error("Unknown request status type")
             if scheduled_reqs:
+                task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
+                main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
+                main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
+                main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
                 llm_logger.debug(f"schedued_reqs: {scheduled_reqs}")
             return scheduled_reqs

@@ -412,6 +424,11 @@ def get_prefix_cached_blocks(self, request: Request):
             request.block_tables = common_block_ids
             request.skip_allocate = False

+            # Report the number of cached tokens to Prometheus metrics
+            main_process_metrics.prefix_cache_token_num.inc(matched_token_num)
+            main_process_metrics.prefix_gpu_cache_token_num.inc(request.gpu_cache_token_num)
+            main_process_metrics.prefix_cpu_cache_token_num.inc(request.cpu_cache_token_num)
+
             if matched_token_num == request.prompt_token_ids_len:
                 request.num_computed_tokens = matched_token_num - 1
                 request.skip_allocate = True
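
The new _trigger_preempt docstring describes a last-in, first-out policy: when a request cannot get enough KV-cache blocks, the most recently scheduled running requests are preempted until the allocation fits. Below is a hedged sketch of that policy in isolation; the running deque and the can_allocate/preempt callables are illustrative assumptions rather than the scheduler's real interface.

# Sketch of LIFO preemption until enough blocks can be allocated.
from collections import deque

def trigger_preempt(num_new_blocks: int, running: deque, can_allocate, preempt) -> bool:
    """Preempt the most recently scheduled requests until the new request fits."""
    while not can_allocate(num_new_blocks):
        if not running:
            return False          # nothing left to evict; the request cannot be scheduled
        victim = running.pop()    # LIFO: the newest running request is preempted first
        preempt(victim)           # assumed to release the victim's KV-cache blocks back to the pool
    return True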

fastdeploy/metrics/metrics.py

Lines changed: 101 additions & 0 deletions

@@ -154,6 +154,22 @@ class MetricsManager:
     spec_decode_num_emitted_tokens_total: "Counter"
     spec_decode_draft_single_head_acceptance_rate: "list[Gauge]"

+    # for YIYAN Adapter
+    prefix_cache_token_num: "Gauge"
+    prefix_gpu_cache_token_num: "Gauge"
+    prefix_cpu_cache_token_num: "Gauge"
+    prefix_ssd_cache_token_num: "Gauge"
+    batch_size: "Gauge"
+    max_batch_size: "Gauge"
+    available_gpu_block_num: "Gauge"
+    free_gpu_block_num: "Gauge"
+    max_gpu_block_num: "Gauge"
+    available_gpu_resource: "Gauge"
+    requests_number: "Counter"
+    send_cache_failed_num: "Counter"
+    first_token_latency: "Gauge"
+    infer_latency: "Gauge"
+
     # Define all metric configurations
     METRICS = {
         "num_requests_running": {
@@ -258,6 +274,91 @@ class MetricsManager:
             "description": "Total number of successfully processed requests",
             "kwargs": {},
         },
+        # for YIYAN Adapter
+        "prefix_cache_token_num": {
+            "type": Counter,
+            "name": "fastdeploy:prefix_cache_token_num",
+            "description": "Total number of cached tokens",
+            "kwargs": {},
+        },
+        "prefix_gpu_cache_token_num": {
+            "type": Counter,
+            "name": "fastdeploy:prefix_gpu_cache_token_num",
+            "description": "Total number of cached tokens on GPU",
+            "kwargs": {},
+        },
+        "prefix_cpu_cache_token_num": {
+            "type": Counter,
+            "name": "fastdeploy:prefix_cpu_cache_token_num",
+            "description": "Total number of cached tokens on CPU",
+            "kwargs": {},
+        },
+        "prefix_ssd_cache_token_num": {
+            "type": Counter,
+            "name": "fastdeploy:prefix_ssd_cache_token_num",
+            "description": "Total number of cached tokens on SSD",
+            "kwargs": {},
+        },
+        "batch_size": {
+            "type": Gauge,
+            "name": "fastdeploy:batch_size",
+            "description": "Real batch size during inference",
+            "kwargs": {},
+        },
+        "max_batch_size": {
+            "type": Gauge,
+            "name": "fastdeploy:max_batch_size",
+            "description": "Maximum batch size determined when service started",
+            "kwargs": {},
+        },
+        "available_gpu_block_num": {
+            "type": Gauge,
+            "name": "fastdeploy:available_gpu_block_num",
+            "description": "Number of available gpu blocks in cache, including prefix caching blocks that are not officially released",
+            "kwargs": {},
+        },
+        "free_gpu_block_num": {
+            "type": Gauge,
+            "name": "fastdeploy:free_gpu_block_num",
+            "description": "Number of free blocks in cache",
+            "kwargs": {},
+        },
+        "max_gpu_block_num": {
+            "type": Gauge,
+            "name": "fastdeploy:max_gpu_block_num",
+            "description": "Number of total blocks determined when service started",
+            "kwargs": {},
+        },
+        "available_gpu_resource": {
+            "type": Gauge,
+            "name": "fastdeploy:available_gpu_resource",
+            "description": "Available blocks percentage, i.e. available_gpu_block_num / max_gpu_block_num",
+            "kwargs": {},
+        },
+        "requests_number": {
+            "type": Counter,
+            "name": "fastdeploy:requests_number",
+            "description": "Total number of requests received",
+            "kwargs": {},
+        },
+        "send_cache_failed_num": {
+            "type": Counter,
+            "name": "fastdeploy:send_cache_failed_num",
+            "description": "Total number of failures of sending cache",
+            "kwargs": {},
+        },
+        "first_token_latency": {
+            "type": Gauge,
+            "name": "fastdeploy:first_token_latency",
+            "description": "Latest time to first token in seconds",
+            "kwargs": {},
+        },
+        "infer_latency": {
+            "type": Gauge,
+            "name": "fastdeploy:infer_latency",
+            "description": "Latest time to generate one token in seconds",
+            "kwargs": {},
+        },
     }
     SPECULATIVE_METRICS = {}

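
A table like METRICS suggests a dict-driven registration pattern: iterate over the entries, instantiate each prometheus_client metric from its type, name, and description, and attach it as an attribute so call sites can write manager.batch_size.set(...). The sketch below shows that pattern under those assumptions; SimpleMetricsManager and its two sample entries are illustrative, not the MetricsManager implementation.

# Sketch of building metric objects from a config table and exposing them.
from prometheus_client import Counter, Gauge, start_http_server

class SimpleMetricsManager:
    METRICS = {
        "requests_number": {
            "type": Counter,
            "name": "fastdeploy:requests_number",
            "description": "Total number of requests received",
            "kwargs": {},
        },
        "batch_size": {
            "type": Gauge,
            "name": "fastdeploy:batch_size",
            "description": "Real batch size during inference",
            "kwargs": {},
        },
    }

    def __init__(self):
        for attr, spec in self.METRICS.items():
            # e.g. Gauge("fastdeploy:batch_size", "Real batch size during inference")
            metric = spec["type"](spec["name"], spec["description"], **spec["kwargs"])
            setattr(self, attr, metric)

if __name__ == "__main__":
    manager = SimpleMetricsManager()
    start_http_server(8000)          # scrape endpoint at :8000/metrics
    manager.requests_number.inc()    # Counter: monotonically increasing
    manager.batch_size.set(4)        # Gauge: arbitrary up/down value

Note that prometheus_client appends a _total suffix to counters on exposition, so the received-request rate would typically be queried from fastdeploy:requests_number_total, while gauges such as fastdeploy:available_gpu_resource can be graphed directly.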
