94 changes: 47 additions & 47 deletions benchmarks/benchmark_serving.py
@@ -310,39 +310,39 @@ async def benchmark(
else:
raise ValueError(f"Unknown backend: {backend}")

print("Starting initial single prompt test run...")
test_prompt, test_prompt_len, test_output_len = input_requests[0]
test_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
raise ValueError(
"Initial test run failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {test_output.error}")
else:
print("Initial test run completed. Starting main benchmark run...")

if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
#print("Starting initial single prompt test run...")
#test_prompt, test_prompt_len, test_output_len = input_requests[0]
#test_input = RequestFuncInput(
# model=model_id,
# prompt=test_prompt,
# api_url=api_url,
# prompt_len=test_prompt_len,
# output_len=test_output_len,
# best_of=best_of,
# use_beam_search=use_beam_search,
#)
#test_output = await request_func(request_func_input=test_input)
#if not test_output.success:
# raise ValueError(
# "Initial test run failed - Please make sure benchmark arguments "
# f"are correctly specified. Error: {test_output.error}")
#else:
# print("Initial test run completed. Starting main benchmark run...")

#if profile:
# print("Starting profiler...")
# profile_input = RequestFuncInput(
# model=model_id,
# prompt=test_prompt,
# api_url=base_url + "/start_profile",
# prompt_len=test_prompt_len,
# output_len=test_output_len,
# best_of=best_of,
# use_beam_search=use_beam_search,
# )
# profile_output = await request_func(request_func_input=profile_input)
# if profile_output.success:
# print("Profiler started")

print(f"Traffic request rate: {request_rate}")

@@ -367,20 +367,20 @@ async def benchmark(
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
#if profile:
# print("Stopping profiler...")
# profile_input = RequestFuncInput(
# model=model_id,
# prompt=test_prompt,
# api_url=base_url + "/stop_profile",
# prompt_len=test_prompt_len,
# output_len=test_output_len,
# best_of=best_of,
# use_beam_search=use_beam_search,
# )
# profile_output = await request_func(request_func_input=profile_input)
# if profile_output.success:
# print("Profiler stopped")

if pbar is not None:
pbar.close()
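Note on the hunk above: the initial single-prompt test run and the profiler start/stop requests are commented out rather than removed. Below is a minimal sketch of an alternative that keeps the warm-up reachable behind a flag; the maybe_warmup helper and the skip_warmup argument are hypothetical names, not part of this change.

async def maybe_warmup(request_func, test_input, skip_warmup: bool) -> None:
    # Sketch only: `maybe_warmup` and `skip_warmup` are hypothetical; this PR
    # simply comments the warm-up block out.
    if skip_warmup:
        return
    print("Starting initial single prompt test run...")
    test_output = await request_func(request_func_input=test_input)
    if not test_output.success:
        raise ValueError(
            "Initial test run failed - Please make sure benchmark arguments "
            f"are correctly specified. Error: {test_output.error}")
    print("Initial test run completed. Starting main benchmark run...")

The same gating pattern would apply to the /start_profile and /stop_profile requests, which already check the existing profile flag.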
19 changes: 15 additions & 4 deletions csrc/ops.h
@@ -54,10 +54,21 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);

void gelu_quick(torch::Tensor& out, torch::Tensor& input);

void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
torch::Tensor& input_positions, torch::Tensor& seq_lens,
torch::Tensor& slot_mapping, torch::Tensor& block_tables);
void advance_step(
int64_t num_prefill_tokens, int64_t num_prefills, int64_t num_seqs,
int64_t num_queries, int64_t block_size, int64_t num_prefills_with_sampling,
torch::Tensor& input_tokens,
torch::Tensor& sampled_token_ids, torch::Tensor& input_positions,
torch::Tensor& seq_lens, torch::Tensor& slot_mapping,
torch::Tensor& block_tables, torch::Tensor& seq_start_loc,
c10::optional<torch::Tensor> context_lens,
c10::optional<torch::Tensor> const& prefill_steps_tokens,
c10::optional<torch::Tensor> const& prefill_steps_slot_mapping,
c10::optional<torch::Tensor> const& prefill_input_positions_update,
c10::optional<torch::Tensor> const& prefill_seq_start_loc_update,
c10::optional<torch::Tensor> const& prefill_advance_query,
c10::optional<torch::Tensor> const& prefill_advance_tokens,
c10::optional<torch::Tensor> const& prefill_token_chunk_sizes);

#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
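The widened advance_step declaration adds prefill bookkeeping counts plus a set of c10::optional tensors that only matter when prefill sequences are being advanced. Below is a minimal sketch of a decode-only call from Python, assuming the op stays exposed through the torch custom-op registry as torch.ops._C.advance_step; that binding name mirrors the existing registration and is an assumption, not shown in this diff.

import torch

def advance_step_decode_only(num_seqs, num_queries, block_size,
                             input_tokens, sampled_token_ids, input_positions,
                             seq_lens, slot_mapping, block_tables,
                             seq_start_loc):
    # Sketch only: a decode-only step, so every prefill-related argument is
    # zero or None. The binding name and argument order assume a 1:1 mapping
    # to the C++ declaration above.
    torch.ops._C.advance_step(
        0,  # num_prefill_tokens
        0,  # num_prefills
        num_seqs,
        num_queries,
        block_size,
        0,  # num_prefills_with_sampling
        input_tokens, sampled_token_ids, input_positions,
        seq_lens, slot_mapping, block_tables, seq_start_loc,
        None,  # context_lens
        None, None, None, None,  # prefill_steps_tokens .. prefill_seq_start_loc_update
        None, None, None)  # prefill_advance_query, prefill_advance_tokens, prefill_token_chunk_sizes

The c10::optional<torch::Tensor> parameters accept None from Python, so a decode-only caller does not need to allocate any of the prefill tensors.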