From 185ade04a134952790c8e59b4be07274cafa4ce9 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 10 Feb 2025 16:54:14 -0800 Subject: [PATCH 1/4] Add BurstGPT to benchmark_serving Signed-off-by: Woosuk Kwon --- benchmarks/benchmark_serving.py | 35 ++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1044bef59417..3ed641ee20a6 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -38,6 +38,7 @@ from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple import numpy as np +import pandas as pd from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, RequestFuncOutput) from datasets import load_dataset @@ -131,6 +132,30 @@ def sample_sharegpt_requests( return filtered_dataset +def sample_burstgpt_requests( + dataset_path: str, + num_requests: int, + random_seed: int, + tokenizer: PreTrainedTokenizerBase, +) -> List[Tuple[str, int, int, None]]: + df = pd.read_csv(dataset_path) + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove the failed requests (i.e., response length is 0) + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Randomly sample num_requests from the dataset + gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed) + # Convert the dataframe to a list of tuples + dataset = gpt4_df.values.tolist() + input_requests = [] + for i in range(num_requests): + input_len = int(dataset[i][2]) + output_len = int(dataset[i][3]) + prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size + for j in range(input_len)]) + input_requests.append((prompt, input_len, output_len, None)) + return input_requests + + def sample_sonnet_requests( dataset_path: str, num_requests: int, @@ -830,6 +855,14 @@ def main(args: argparse.Namespace): fixed_output_len=args.sharegpt_output_len, ) + elif args.dataset_name == "burstgpt": + input_requests = sample_burstgpt_requests( 
dataset_path=args.dataset_path, + num_requests=args.num_prompts, + random_seed=args.seed, + tokenizer=tokenizer, + ) + elif args.dataset_name == "sonnet": # Do not format the prompt, pass to message directly if args.backend == "openai-chat": @@ -995,7 +1028,7 @@ def main(args: argparse.Namespace): "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "sonnet", "random", "hf"], + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], help="Name of the dataset to benchmark on.", ) parser.add_argument("--dataset-path", From 9f6aeb023f7dcc9ec1621f83b4427fb135079df8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 10 Feb 2025 16:57:23 -0800 Subject: [PATCH 2/4] Add to readme Signed-off-by: Woosuk Kwon --- benchmarks/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/README.md b/benchmarks/README.md index 890a2525bcfe..367ef93457f9 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -19,3 +19,11 @@ mkdir coco -p wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip unzip coco/train2017.zip -d coco/ ``` + +# Downloading the BurstGPT dataset + +You can download the BurstGPT v1.1 dataset by running: + +```bash +wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv +``` From 79b7ff35e635bf90d958f20b8c04e2ef7c2a745c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 10 Feb 2025 21:09:50 -0800 Subject: [PATCH 3/4] Update benchmarks/benchmark_serving.py Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- benchmarks/benchmark_serving.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3ed641ee20a6..e778539bcb27 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -143,7 +143,10 @@ def sample_burstgpt_requests( # Remove the failed requests (i.e., response length is 0) gpt4_df = gpt4_df[gpt4_df["Response 
tokens"] > 0] # Randomly sample num_requests from the dataset - gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed) + if num_requests <= len(gpt4_df): + gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed) + else: + gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed, replace=True) # Convert the dataframe to a list of tuples dataset = gpt4_df.values.tolist() input_requests = [] From e8a94ba77b4a5e38e0f29e9811d0da3c6c10b5b0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 10 Feb 2025 21:16:27 -0800 Subject: [PATCH 4/4] minor Signed-off-by: Woosuk Kwon --- benchmarks/benchmark_serving.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index e778539bcb27..0c892384236b 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -146,7 +146,9 @@ def sample_burstgpt_requests( if num_requests <= len(gpt4_df): gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed) else: - gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed, replace=True) + gpt4_df = gpt4_df.sample(n=num_requests, + random_state=random_seed, + replace=True) # Convert the dataframe to a list of tuples dataset = gpt4_df.values.tolist() input_requests = []