
Commit ab1f268

huydhn and clee2000 authored
Create vLLM v1 benchmark dashboard (#6306)
This is the initial version of the vLLM v1 benchmark dashboard. The benchmark runs periodically on vLLM main commits; the script that runs it is at https://github.com/pytorch/pytorch-integration-testing/tree/master/vllm-benchmarks. Besides all the custom logic for `vllm-project/vllm`, this PR also adds a new `extra` map to the query to store arbitrary information about how the benchmark is set up.

Some UX features are left for subsequent PRs:

* Provide more information about how the benchmark is set up, to be on par with the [v0 dashboard](https://simon-mo-workspace.observablehq.cloud/vllm-dashboard-v0/perf).
* Fix the issue where `request_rate` and `tensor_parallel_size` are missing when the former is set to `Inf`, which leads to invalid JSON. This fix needs to be done on the vLLM side.

### Preview

https://torchci-git-fork-huydhn-create-vllm-benchma-323632-fbopensource.vercel.app/benchmark/llms?repoName=vllm-project%2Fvllm

* [Last 1 day](https://torchci-git-fork-huydhn-create-vllm-benchma-323632-fbopensource.vercel.app/benchmark/llms?startTime=Wed%2C%2019%20Feb%202025%2005%3A14%3A18%20GMT&stopTime=Thu%2C%2020%20Feb%202025%2005%3A14%3A18%20GMT&granularity=hour&lBranch=main&lCommit=fbbe1fbac669a17f81c74c696f11a51167ed6a3c&rBranch=main&rCommit=512368e34a896fdfb64c16402107bcd3603369c7&repoName=vllm-project%2Fvllm&modelName=All%20Models&backendName=All%20Backends&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms) - All benchmarks are now running, with the exception of the [speculative decoding serving benchmark](https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/tests/serving-tests.json#L60), which is not yet supported in v1.
* [Last 7 days](https://torchci-git-fork-huydhn-create-vllm-benchma-323632-fbopensource.vercel.app/benchmark/llms?startTime=Thu%2C%2013%20Feb%202025%2005%3A14%3A44%20GMT&stopTime=Thu%2C%2020%20Feb%202025%2005%3A14%3A44%20GMT&granularity=hour&lBranch=main&lCommit=067fa2255b6687ccaa79391dc9d1a08c7632f605&rBranch=main&rCommit=512368e34a896fdfb64c16402107bcd3603369c7&repoName=vllm-project%2Fvllm&modelName=All%20Models&backendName=All%20Backends&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms) - The llama3-8b model was missing because of vllm-project/vllm#13392, which was fixed last weekend.

---------

Co-authored-by: clee2000 <[email protected]>
1 parent 0db4bab commit ab1f268
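For illustration, one row produced by the updated query might look like the sketch below. The shape follows only the fields touched in this PR, and the concrete values (including the model name) are hypothetical, not taken from a real run:

```ts
// Hypothetical row shape; only fields relevant to this PR are included.
interface BenchmarkRow {
  model: string;
  metric: string;
  device: string;
  arch: string;
  extra: { [key: string]: string };
}

const sampleRow: BenchmarkRow = {
  model: "meta-llama/Meta-Llama-3.1-8B-Instruct", // hypothetical model name
  metric: "median_ttft_ms",
  device: "cuda",
  arch: "NVIDIA H100 80GB HBM3",
  extra: {
    use_torch_compile: "true", // torchao-specific; the query defaults it to "true"
    request_rate: "", // empty when the rate was Inf (the invalid-JSON issue above)
    tensor_parallel_size: "1",
  },
};
```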

File tree

9 files changed (+203 lines, −50 lines)


torchci/clickhouse_queries/oss_ci_benchmark_llms/query.sql

Lines changed: 27 additions & 9 deletions
```diff
@@ -23,15 +23,32 @@ WITH benchmarks AS (
         tupleElement(o.benchmark, 'extra_info')['arch'],
         tupleElement(o.runners[1], 'type')
     ) AS arch,
-    IF(
-        tupleElement(o.benchmark, 'extra_info')['compile'] = '',
-        'true', -- Default to true
-        tupleElement(o.benchmark, 'extra_info')['compile']
-    ) AS use_torch_compile,
     DATE_TRUNC(
         {granularity: String },
         fromUnixTimestamp(o.timestamp)
-    ) AS granularity_bucket
+    ) AS granularity_bucket,
+    -- Repo-specific fields
+    map(
+        -- Used by torchao
+        'use_torch_compile',
+        IF(
+            tupleElement(o.benchmark, 'extra_info')['compile'] = '',
+            'true',
+            -- Default to true
+            tupleElement(o.benchmark, 'extra_info')['compile']
+        ),
+        -- Used by vLLM
+        'request_rate',
+        JSONExtractString(
+            tupleElement(o.benchmark, 'extra_info')['args'],
+            'request_rate'
+        ),
+        'tensor_parallel_size',
+        JSONExtractString(
+            tupleElement(o.benchmark, 'extra_info')['args'],
+            'tensor_parallel_size'
+        )
+    ) AS extra
 FROM
     benchmark.oss_ci_benchmark_v3 o
 WHERE
@@ -77,8 +94,8 @@ SELECT DISTINCT
     dtype,
     device,
     arch,
-    toBool(use_torch_compile) AS use_torch_compile,
-    granularity_bucket
+    granularity_bucket,
+    extra
 FROM
     benchmarks
 WHERE
@@ -101,4 +118,5 @@ ORDER BY
     backend,
     model,
     dtype,
-    device
+    device,
+    metric
```
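Note that ClickHouse `map()` with string arguments yields `Map(String, String)`, so every value in `extra` reaches the client as a string and must be parsed there. A minimal sketch of the parsing this forces on consumers — helper names here are illustrative, not from the codebase; see `aoUtils.ts` further down for the real call site:

```ts
type Extra = { [key: string]: string } | undefined;

// "true"/"false" arrive as strings, so a string comparison is required.
const parseCompile = (extra: Extra): boolean =>
  extra?.["use_torch_compile"] === "true";

// Numeric values also arrive as strings; "" means the value was missing.
const parseTensorParallel = (extra: Extra): number | null => {
  const raw = extra?.["tensor_parallel_size"];
  return raw ? Number(raw) : null;
};
```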

torchci/components/NavBar.tsx

Lines changed: 4 additions & 0 deletions
```diff
@@ -60,6 +60,10 @@ function NavBar() {
       name: "TorchAO LLMs",
       href: "/benchmark/llms?repoName=pytorch%2Fao",
     },
+    {
+      name: "vLLM v1",
+      href: "/benchmark/llms?repoName=vllm-project%2Fvllm",
+    },
   ];

   const devInfraDropdown = [
```

torchci/components/benchmark/CommitPanel.tsx

Lines changed: 31 additions & 16 deletions
```diff
@@ -18,22 +18,37 @@ export function CommitPanel({
 }) {
   return (
     <Stack direction="row" spacing={2} sx={{ mb: 2 }}>
-      <Typography fontSize={"1rem"} fontStyle={"italic"}>
-        *This report was generated by CI running on {repoName}{" "}
-        {lBranchAndCommit.branch} branch at commit{" "}
-        <a
-          href={`/${repoName}/commit/${lBranchAndCommit.commit}#${workflowName}`}
-        >
-          {lBranchAndCommit.commit.substring(0, SHA_DISPLAY_LENGTH)}
-        </a>{" "}
-        comparing with {rBranchAndCommit.branch} branch at commit{" "}
-        <a
-          href={`/${repoName}/commit/${rBranchAndCommit.commit}#${workflowName}`}
-        >
-          {rBranchAndCommit.commit.substring(0, SHA_DISPLAY_LENGTH)}
-        </a>
-        . {children}
-      </Typography>
+      {repoName !== "vllm-project/vllm" && (
+        <Typography fontSize={"1rem"} fontStyle={"italic"}>
+          *This report was generated by CI running on {repoName}{" "}
+          {lBranchAndCommit.branch} branch at commit{" "}
+          <a
+            href={`/${repoName}/commit/${lBranchAndCommit.commit}#${workflowName}`}
+          >
+            {lBranchAndCommit.commit.substring(0, SHA_DISPLAY_LENGTH)}
+          </a>{" "}
+          comparing with {rBranchAndCommit.branch} branch at commit{" "}
+          <a
+            href={`/${repoName}/commit/${rBranchAndCommit.commit}#${workflowName}`}
+          >
+            {rBranchAndCommit.commit.substring(0, SHA_DISPLAY_LENGTH)}
+          </a>
+          . {children}
+        </Typography>
+      )}
+      {repoName === "vllm-project/vllm" && (
+        <Typography fontSize={"1rem"} fontStyle={"italic"}>
+          This is vLLM v1 dashboard, please refer to{" "}
+          <a
+            href={
+              "https://simon-mo-workspace.observablehq.cloud/vllm-dashboard-v0/perf"
+            }
+          >
+            v0 dashboard
+          </a>{" "}
+          for the information about how the benchmark is setup
+        </Typography>
+      )}
     </Stack>
   );
 }
```

torchci/components/benchmark/llms/ModelGraphPanel.tsx

Lines changed: 39 additions & 9 deletions
```diff
@@ -141,14 +141,44 @@ export function GraphPanel({
       const model = record.model;
       const dtype = record.dtype;
       const device = record.device;
+      const metric = record.metric;

-      record.display = model.includes(dtype)
-        ? model.includes(device)
-          ? model
-          : `${model} (${device})`
-        : model.includes(device)
-        ? `${model} (${dtype})`
-        : `${model} (${dtype} / ${device})`;
+      if (repoName === "vllm-project/vllm") {
+        let requestRate = record.extra!["request_rate"];
+        // TODO (huydhn): Fix the invalid JSON on vLLM side
+        if (
+          metric.includes("itl") ||
+          metric.includes("tpot") ||
+          metric.includes("ttft")
+        ) {
+          requestRate = requestRate !== "" ? requestRate : "Inf";
+        }
+
+        let tensorParallel = record.extra!["tensor_parallel_size"];
+        // TODO (huydhn): Fix the passing of tensor_parallel_size to the benchmark
+        // script on vLLM side
+        if (model.includes("8B")) {
+          tensorParallel = tensorParallel !== "" ? tensorParallel : "1";
+        } else if (model.includes("70B")) {
+          tensorParallel = tensorParallel !== "" ? tensorParallel : "4";
+        } else if (model.includes("8x7B")) {
+          tensorParallel = tensorParallel !== "" ? tensorParallel : "2";
+        }
+
+        if (requestRate !== "") {
+          record.display = `${model} / tp${tensorParallel} / qps_${requestRate}`;
+        } else {
+          record.display = `${model} / tp${tensorParallel}`;
+        }
+      } else {
+        record.display = model.includes(dtype)
+          ? model.includes(device)
+            ? model
+            : `${model} (${device})`
+          : model.includes(device)
+          ? `${model} (${dtype})`
+          : `${model} (${dtype} / ${device})`;
+      }

       return record;
     });
@@ -177,7 +207,7 @@ export function GraphPanel({
         .filter((metric) => chartData[metric].length !== 0)
         .map((metric: string) => (
           <Grid2
-            size={{ xs: 12, lg: modelName === DEFAULT_MODEL_NAME ? 12 : 4 }}
+            size={{ xs: 12, lg: modelName === DEFAULT_MODEL_NAME ? 12 : 6 }}
             height={GRAPH_ROW_HEIGHT}
             key={metric}
           >
@@ -203,7 +233,7 @@ export function GraphPanel({
               },
             },
           }}
-          legendPadding={modelName === DEFAULT_MODEL_NAME ? 320 : 200}
+          legendPadding={320}
         />
       </Grid2>
     ))}
```
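Pulled out of the diff above, the vLLM display-name fallback amounts to the standalone sketch below. The tensor-parallel defaults (8B → 1, 70B → 4, 8x7B → 2) mirror the TODO workaround and should be read as assumptions tied to the current benchmark suite:

```ts
function vllmDisplayName(
  model: string,
  extra: { [key: string]: string },
  metric: string
): string {
  // Serving metrics (ITL/TPOT/TTFT) with a missing request_rate mean the
  // rate was set to Inf, which currently produces invalid JSON upstream.
  let requestRate = extra["request_rate"];
  if (["itl", "tpot", "ttft"].some((m) => metric.includes(m))) {
    requestRate = requestRate !== "" ? requestRate : "Inf";
  }

  // Fall back to per-model tensor-parallel defaults when the value is missing.
  let tensorParallel = extra["tensor_parallel_size"];
  if (model.includes("8B")) {
    tensorParallel = tensorParallel !== "" ? tensorParallel : "1";
  } else if (model.includes("70B")) {
    tensorParallel = tensorParallel !== "" ? tensorParallel : "4";
  } else if (model.includes("8x7B")) {
    tensorParallel = tensorParallel !== "" ? tensorParallel : "2";
  }

  return requestRate !== ""
    ? `${model} / tp${tensorParallel} / qps_${requestRate}`
    : `${model} / tp${tensorParallel}`;
}

// e.g. vllmDisplayName("Meta-Llama-3-70B", { request_rate: "", tensor_parallel_size: "" }, "median_ttft_ms")
//      returns "Meta-Llama-3-70B / tp4 / qps_Inf"
```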

torchci/components/benchmark/llms/SummaryPanel.tsx

Lines changed: 21 additions & 1 deletion
```diff
@@ -56,7 +56,7 @@ export function SummaryPanel({
   const rCommit = rPerfData.commit;
   const rData = rPerfData.data;

-  const data = combineLeftAndRight(lPerfData, rPerfData);
+  const data = combineLeftAndRight(repoName, lPerfData, rPerfData);
   const columns: any[] = [
     {
       field: "metadata",
@@ -138,6 +138,26 @@ export function SummaryPanel({
     });
   }

+  if (repoName === "vllm-project/vllm") {
+    columns.push({
+      field: "tensor_parallel_size",
+      headerName: "Tensor parallel",
+      flex: 1,
+      renderCell: (params: GridRenderCellParams<any>) => {
+        return `${params.value}`;
+      },
+    });
+
+    columns.push({
+      field: "request_rate",
+      headerName: "Request rate",
+      flex: 1,
+      renderCell: (params: GridRenderCellParams<any>) => {
+        return `${params.value}`;
+      },
+    });
+  }
+
   columns.push(
     ...[
       {
```
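The new columns read `tensor_parallel_size` and `request_rate` as top-level row fields, which implies `combineLeftAndRight` flattens them out of `extra`. A minimal sketch of that assumed step — the helper name and row shape are hypothetical, since the actual implementation is not shown in this diff:

```ts
// Hypothetical flattening inside combineLeftAndRight for vLLM rows.
function flattenVllmExtra(row: { extra?: { [k: string]: string } }) {
  return {
    ...row,
    tensor_parallel_size: row.extra?.["tensor_parallel_size"] ?? "",
    request_rate: row.extra?.["request_rate"] ?? "",
  };
}
```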

torchci/components/benchmark/llms/common.tsx

Lines changed: 29 additions & 2 deletions
```diff
@@ -5,8 +5,17 @@ export const REPO_TO_BENCHMARKS: { [k: string]: string[] } = {
   "pytorch/pytorch": ["PyTorch gpt-fast benchmark"],
   "pytorch/executorch": ["ExecuTorch"],
   "pytorch/ao": ["TorchAO benchmark"],
+  "vllm-project/vllm": ["vLLM benchmark"],
 };
-export const EXCLUDED_METRICS: string[] = ["load_status"];
+export const EXCLUDED_METRICS: string[] = [
+  "load_status",
+  "mean_itl_ms",
+  "mean_tpot_ms",
+  "mean_ttft_ms",
+  "std_itl_ms",
+  "std_tpot_ms",
+  "std_ttft_ms",
+];
 export const DEFAULT_MODEL_NAME = "All Models";
 export const SCALE = 2;
 export const METRIC_DISPLAY_HEADERS: { [k: string]: string } = {
@@ -17,6 +26,15 @@ export const METRIC_DISPLAY_HEADERS: { [k: string]: string } = {
   compile_vs_eager_speedup: "Compile vs eager speedup",
   autoquant_vs_compile_speedup: "Autoquant vs compile speedup",
   eager_speedup: "Eager speedup",
+  latency: "Latency (s)",
+  median_itl_ms: "Median ITL (ms)",
+  median_tpot_ms: "Median TPOT (ms)",
+  median_ttft_ms: "Median TTFT (ms)",
+  p99_itl_ms: "p99 ITL (ms)",
+  p99_tpot_ms: "p99 TPOT (ms)",
+  p99_ttft_ms: "p99 TTFT (ms)",
+  requests_per_second: "Requests/s",
+  tokens_per_second: "Tokens/s",
 };
 // The variable name is a bit dumb, but it tells if a higher metric value
 // is good or bad so that we can highlight it on the dashboard accordingly.
@@ -32,6 +50,15 @@ export const IS_INCREASING_METRIC_VALUE_GOOD: { [k: string]: boolean } = {
   "peak_inference_mem_usage(mb)": false,
   "peak_load_mem_usuage(mb)": false,
   "generate_time(ms)": false,
+  latency: false,
+  median_itl_ms: false,
+  median_tpot_ms: false,
+  median_ttft_ms: false,
+  p99_itl_ms: false,
+  p99_tpot_ms: false,
+  p99_ttft_ms: false,
+  requests_per_second: true,
+  tokens_per_second: true,
 };
 export const METRIC_DISPLAY_SHORT_HEADERS: { [k: string]: string } = {
   "memory_bandwidth(GB/s)": "Bandwidth",
@@ -71,7 +98,7 @@ export interface LLMsBenchmarkData {
   device: string;
   arch: string;
   display?: string;
-  use_torch_compile?: boolean;
+  extra?: { [key: string]: string };
 }

 export interface BranchAndCommitPerfData extends BranchAndCommit {
```
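The direction table exists so the dashboard can highlight whether a delta is good or bad. A hypothetical helper (not part of this PR) showing how it is meant to be read:

```ts
// Decide whether a metric change between two runs is an improvement.
function isImprovement(
  metric: string,
  oldValue: number,
  newValue: number
): boolean {
  const higherIsBetter = IS_INCREASING_METRIC_VALUE_GOOD[metric];
  if (higherIsBetter === undefined) {
    return false; // unknown metric: no signal either way
  }
  return higherIsBetter ? newValue > oldValue : newValue < oldValue;
}

// isImprovement("median_ttft_ms", 120, 95)       -> true  (lower TTFT is better)
// isImprovement("tokens_per_second", 4200, 3900) -> false (throughput dropped)
```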

torchci/lib/benchmark/aoUtils.ts

Lines changed: 4 additions & 5 deletions
```diff
@@ -84,10 +84,8 @@ export function computeSpeedup(
   const currentCommitBaseline: { [key: string]: LLMsBenchmarkData } = {};

   data.forEach((r: LLMsBenchmarkData) => {
-    if (
-      r.dtype !== TORCHAO_BASELINE ||
-      r.use_torch_compile !== useTorchCompile
-    ) {
+    const compile = r.extra?.use_torch_compile === "true";
+    if (r.dtype !== TORCHAO_BASELINE || compile !== useTorchCompile) {
       return;
     }

@@ -112,8 +110,9 @@ export function computeSpeedup(
   data.forEach((r: LLMsBenchmarkData) => {
     withSpeedup.push(r);

+    const compile = r.extra?.use_torch_compile === "true";
     // Compute eager speedup vs the base commit baseline
-    if (r.dtype === TORCHAO_BASELINE && r.use_torch_compile === false) {
+    if (r.dtype === TORCHAO_BASELINE && compile === false) {
       if (SPEEDUP_METRICS.includes(r.metric)) {
         const k = `${r.model} ${r.metric} ${r.device} ${r.arch}`;
         if (
```
