94 changes: 47 additions & 47 deletions benchmarks/benchmark_serving.py
@@ -310,39 +310,39 @@ async def benchmark(
else:
raise ValueError(f"Unknown backend: {backend}")

print("Starting initial single prompt test run...")
test_prompt, test_prompt_len, test_output_len = input_requests[0]
test_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
raise ValueError(
"Initial test run failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {test_output.error}")
else:
print("Initial test run completed. Starting main benchmark run...")

if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
#print("Starting initial single prompt test run...")
#test_prompt, test_prompt_len, test_output_len = input_requests[0]
#test_input = RequestFuncInput(
# model=model_id,
# prompt=test_prompt,
# api_url=api_url,
# prompt_len=test_prompt_len,
# output_len=test_output_len,
# best_of=best_of,
# use_beam_search=use_beam_search,
#)
#test_output = await request_func(request_func_input=test_input)
#if not test_output.success:
# raise ValueError(
# "Initial test run failed - Please make sure benchmark arguments "
# f"are correctly specified. Error: {test_output.error}")
#else:
# print("Initial test run completed. Starting main benchmark run...")

#if profile:
# print("Starting profiler...")
# profile_input = RequestFuncInput(
# model=model_id,
# prompt=test_prompt,
# api_url=base_url + "/start_profile",
# prompt_len=test_prompt_len,
# output_len=test_output_len,
# best_of=best_of,
# use_beam_search=use_beam_search,
# )
# profile_output = await request_func(request_func_input=profile_input)
# if profile_output.success:
# print("Profiler started")

print(f"Traffic request rate: {request_rate}")

@@ -367,20 +367,20 @@ async def benchmark(
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
#if profile:
# print("Stopping profiler...")
# profile_input = RequestFuncInput(
# model=model_id,
# prompt=test_prompt,
# api_url=base_url + "/stop_profile",
# prompt_len=test_prompt_len,
# output_len=test_output_len,
# best_of=best_of,
# use_beam_search=use_beam_search,
# )
# profile_output = await request_func(request_func_input=profile_input)
# if profile_output.success:
# print("Profiler stopped")

if pbar is not None:
pbar.close()
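Note on the hunk above: the initial single-prompt test run and the profiler start/stop requests are commented out rather than removed. Below is a minimal sketch of an alternative that keeps the warm-up reachable behind a flag; the maybe_warmup helper and the skip_warmup argument are hypothetical names, not part of this change.

async def maybe_warmup(request_func, test_input, skip_warmup: bool) -> None:
    # Sketch only: `maybe_warmup` and `skip_warmup` are hypothetical; this PR
    # simply comments the warm-up block out.
    if skip_warmup:
        return
    print("Starting initial single prompt test run...")
    test_output = await request_func(request_func_input=test_input)
    if not test_output.success:
        raise ValueError(
            "Initial test run failed - Please make sure benchmark arguments "
            f"are correctly specified. Error: {test_output.error}")
    print("Initial test run completed. Starting main benchmark run...")

The same gating pattern would apply to the /start_profile and /stop_profile requests, which already check the existing profile flag.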
19 changes: 15 additions & 4 deletions csrc/ops.h
@@ -54,10 +54,21 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);

void gelu_quick(torch::Tensor& out, torch::Tensor& input);

void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
torch::Tensor& input_positions, torch::Tensor& seq_lens,
torch::Tensor& slot_mapping, torch::Tensor& block_tables);
void advance_step(
int64_t num_prefill_tokens, int64_t num_prefills, int64_t num_seqs,
int64_t num_queries, int64_t block_size, int64_t num_prefills_with_sampling,
torch::Tensor& input_tokens,
torch::Tensor& sampled_token_ids, torch::Tensor& input_positions,
torch::Tensor& seq_lens, torch::Tensor& slot_mapping,
torch::Tensor& block_tables, torch::Tensor& seq_start_loc,
c10::optional<torch::Tensor> context_lens,
c10::optional<torch::Tensor> const& prefill_steps_tokens,
c10::optional<torch::Tensor> const& prefill_steps_slot_mapping,
c10::optional<torch::Tensor> const& prefill_input_positions_update,
c10::optional<torch::Tensor> const& prefill_seq_start_loc_update,
c10::optional<torch::Tensor> const& prefill_advance_query,
c10::optional<torch::Tensor> const& prefill_advance_tokens,
c10::optional<torch::Tensor> const& prefill_token_chunk_sizes);

#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
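The widened advance_step declaration adds prefill bookkeeping counts plus a set of c10::optional tensors that only matter when prefill sequences are being advanced. Below is a minimal sketch of a decode-only call from Python, assuming the op stays exposed through the torch custom-op registry as torch.ops._C.advance_step; that binding name mirrors the existing registration and is an assumption, not shown in this diff.

import torch

def advance_step_decode_only(num_seqs, num_queries, block_size,
                             input_tokens, sampled_token_ids, input_positions,
                             seq_lens, slot_mapping, block_tables,
                             seq_start_loc):
    # Sketch only: a decode-only step, so every prefill-related argument is
    # zero or None. The binding name and argument order assume a 1:1 mapping
    # to the C++ declaration above.
    torch.ops._C.advance_step(
        0,  # num_prefill_tokens
        0,  # num_prefills
        num_seqs,
        num_queries,
        block_size,
        0,  # num_prefills_with_sampling
        input_tokens, sampled_token_ids, input_positions,
        seq_lens, slot_mapping, block_tables, seq_start_loc,
        None,  # context_lens
        None, None, None, None,  # prefill_steps_tokens .. prefill_seq_start_loc_update
        None, None, None)  # prefill_advance_query, prefill_advance_tokens, prefill_token_chunk_sizes

The c10::optional<torch::Tensor> parameters accept None from Python, so a decode-only caller does not need to allocate any of the prefill tensors.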