
Commit ab1f268

huydhn and clee2000 authored
Create vLLM v1 benchmark dashboard (#6306)
This is the initial version of the vLLM v1 benchmark dashboard. The benchmark runs periodically on vLLM main commits; the script that runs it is at https://github.com/pytorch/pytorch-integration-testing/tree/master/vllm-benchmarks. Besides all the custom logic for `vllm-project/vllm`, this PR also adds a new `extra` map to the query to store arbitrary information about how the benchmark is set up.

Some UX features are left for subsequent PRs:

* Provide more information about how the benchmark is set up, to be on par with the [v0 dashboard](https://simon-mo-workspace.observablehq.cloud/vllm-dashboard-v0/perf).
* Fix the issue where `request_rate` and `tensor_parallel_size` are missing when the former is set to `Inf`, which leads to invalid JSON. This fix needs to be done on the vLLM side.

### Preview

https://torchci-git-fork-huydhn-create-vllm-benchma-323632-fbopensource.vercel.app/benchmark/llms?repoName=vllm-project%2Fvllm

* [Last 1 day](https://torchci-git-fork-huydhn-create-vllm-benchma-323632-fbopensource.vercel.app/benchmark/llms?startTime=Wed%2C%2019%20Feb%202025%2005%3A14%3A18%20GMT&stopTime=Thu%2C%2020%20Feb%202025%2005%3A14%3A18%20GMT&granularity=hour&lBranch=main&lCommit=fbbe1fbac669a17f81c74c696f11a51167ed6a3c&rBranch=main&rCommit=512368e34a896fdfb64c16402107bcd3603369c7&repoName=vllm-project%2Fvllm&modelName=All%20Models&backendName=All%20Backends&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms) - All benchmarks are now running, with the exception of the [speculative decoding serving benchmark](https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/tests/serving-tests.json#L60), which is not yet supported in v1.
* [Last 7 days](https://torchci-git-fork-huydhn-create-vllm-benchma-323632-fbopensource.vercel.app/benchmark/llms?startTime=Thu%2C%2013%20Feb%202025%2005%3A14%3A44%20GMT&stopTime=Thu%2C%2020%20Feb%202025%2005%3A14%3A44%20GMT&granularity=hour&lBranch=main&lCommit=067fa2255b6687ccaa79391dc9d1a08c7632f605&rBranch=main&rCommit=512368e34a896fdfb64c16402107bcd3603369c7&repoName=vllm-project%2Fvllm&modelName=All%20Models&backendName=All%20Backends&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms) - The llama3-8b model was missing because of vllm-project/vllm#13392, which was fixed last weekend.

---------

Co-authored-by: clee2000 <[email protected]>
1 parent 0db4bab commit ab1f268
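For illustration, one row produced by the updated query might look like the sketch below. The shape follows only the fields touched in this PR, and the concrete values (including the model name) are hypothetical, not taken from a real run:

```ts
// Hypothetical row shape; only fields relevant to this PR are included.
interface BenchmarkRow {
  model: string;
  metric: string;
  device: string;
  arch: string;
  extra: { [key: string]: string };
}

const sampleRow: BenchmarkRow = {
  model: "meta-llama/Meta-Llama-3.1-8B-Instruct", // hypothetical model name
  metric: "median_ttft_ms",
  device: "cuda",
  arch: "NVIDIA H100 80GB HBM3",
  extra: {
    use_torch_compile: "true", // torchao-specific; the query defaults it to "true"
    request_rate: "", // empty when the rate was Inf (the invalid-JSON issue above)
    tensor_parallel_size: "1",
  },
};
```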

File tree

9 files changed (+203 lines, −50 lines)


torchci/clickhouse_queries/oss_ci_benchmark_llms/query.sql

Lines changed: 27 additions & 9 deletions
```diff
@@ -23,15 +23,32 @@ WITH benchmarks AS (
         tupleElement(o.benchmark, 'extra_info')['arch'],
         tupleElement(o.runners[1], 'type')
     ) AS arch,
-    IF(
-        tupleElement(o.benchmark, 'extra_info')['compile'] = '',
-        'true', -- Default to true
-        tupleElement(o.benchmark, 'extra_info')['compile']
-    ) AS use_torch_compile,
     DATE_TRUNC(
         {granularity: String },
         fromUnixTimestamp(o.timestamp)
-    ) AS granularity_bucket
+    ) AS granularity_bucket,
+    -- Repo-specific fields
+    map(
+        -- Used by torchao
+        'use_torch_compile',
+        IF(
+            tupleElement(o.benchmark, 'extra_info')['compile'] = '',
+            'true',
+            -- Default to true
+            tupleElement(o.benchmark, 'extra_info')['compile']
+        ),
+        -- Used by vLLM
+        'request_rate',
+        JSONExtractString(
+            tupleElement(o.benchmark, 'extra_info')['args'],
+            'request_rate'
+        ),
+        'tensor_parallel_size',
+        JSONExtractString(
+            tupleElement(o.benchmark, 'extra_info')['args'],
+            'tensor_parallel_size'
+        )
+    ) AS extra
 FROM
     benchmark.oss_ci_benchmark_v3 o
 WHERE
@@ -77,8 +94,8 @@ SELECT DISTINCT
     dtype,
     device,
     arch,
-    toBool(use_torch_compile) AS use_torch_compile,
-    granularity_bucket
+    granularity_bucket,
+    extra
 FROM
     benchmarks
 WHERE
@@ -101,4 +118,5 @@ ORDER BY
     backend,
     model,
     dtype,
-    device
+    device,
+    metric
```
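Note that ClickHouse `map()` with string arguments yields `Map(String, String)`, so every value in `extra` reaches the client as a string and must be parsed there. A minimal sketch of the parsing this forces on consumers — helper names here are illustrative, not from the codebase; see `aoUtils.ts` further down for the real call site:

```ts
type Extra = { [key: string]: string } | undefined;

// "true"/"false" arrive as strings, so a string comparison is required.
const parseCompile = (extra: Extra): boolean =>
  extra?.["use_torch_compile"] === "true";

// Numeric values also arrive as strings; "" means the value was missing.
const parseTensorParallel = (extra: Extra): number | null => {
  const raw = extra?.["tensor_parallel_size"];
  return raw ? Number(raw) : null;
};
```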

torchci/components/NavBar.tsx

Lines changed: 4 additions & 0 deletions
```diff
@@ -60,6 +60,10 @@ function NavBar() {
       name: "TorchAO LLMs",
       href: "/benchmark/llms?repoName=pytorch%2Fao",
     },
+    {
+      name: "vLLM v1",
+      href: "/benchmark/llms?repoName=vllm-project%2Fvllm",
+    },
   ];

   const devInfraDropdown = [
```

torchci/components/benchmark/CommitPanel.tsx

Lines changed: 31 additions & 16 deletions
```diff
@@ -18,22 +18,37 @@ export function CommitPanel({
 }) {
   return (
     <Stack direction="row" spacing={2} sx={{ mb: 2 }}>
-      <Typography fontSize={"1rem"} fontStyle={"italic"}>
-        *This report was generated by CI running on {repoName}{" "}
-        {lBranchAndCommit.branch} branch at commit{" "}
-        <a
-          href={`/${repoName}/commit/${lBranchAndCommit.commit}#${workflowName}`}
-        >
-          {lBranchAndCommit.commit.substring(0, SHA_DISPLAY_LENGTH)}
-        </a>{" "}
-        comparing with {rBranchAndCommit.branch} branch at commit{" "}
-        <a
-          href={`/${repoName}/commit/${rBranchAndCommit.commit}#${workflowName}`}
-        >
-          {rBranchAndCommit.commit.substring(0, SHA_DISPLAY_LENGTH)}
-        </a>
-        . {children}
-      </Typography>
+      {repoName !== "vllm-project/vllm" && (
+        <Typography fontSize={"1rem"} fontStyle={"italic"}>
+          *This report was generated by CI running on {repoName}{" "}
+          {lBranchAndCommit.branch} branch at commit{" "}
+          <a
+            href={`/${repoName}/commit/${lBranchAndCommit.commit}#${workflowName}`}
+          >
+            {lBranchAndCommit.commit.substring(0, SHA_DISPLAY_LENGTH)}
+          </a>{" "}
+          comparing with {rBranchAndCommit.branch} branch at commit{" "}
+          <a
+            href={`/${repoName}/commit/${rBranchAndCommit.commit}#${workflowName}`}
+          >
+            {rBranchAndCommit.commit.substring(0, SHA_DISPLAY_LENGTH)}
+          </a>
+          . {children}
+        </Typography>
+      )}
+      {repoName === "vllm-project/vllm" && (
+        <Typography fontSize={"1rem"} fontStyle={"italic"}>
+          This is vLLM v1 dashboard, please refer to{" "}
+          <a
+            href={
+              "https://simon-mo-workspace.observablehq.cloud/vllm-dashboard-v0/perf"
+            }
+          >
+            v0 dashboard
+          </a>{" "}
+          for the information about how the benchmark is setup
+        </Typography>
+      )}
     </Stack>
   );
 }
```

torchci/components/benchmark/llms/ModelGraphPanel.tsx

Lines changed: 39 additions & 9 deletions
```diff
@@ -141,14 +141,44 @@ export function GraphPanel({
       const model = record.model;
       const dtype = record.dtype;
       const device = record.device;
+      const metric = record.metric;

-      record.display = model.includes(dtype)
-        ? model.includes(device)
-          ? model
-          : `${model} (${device})`
-        : model.includes(device)
-        ? `${model} (${dtype})`
-        : `${model} (${dtype} / ${device})`;
+      if (repoName === "vllm-project/vllm") {
+        let requestRate = record.extra!["request_rate"];
+        // TODO (huydhn): Fix the invalid JSON on vLLM side
+        if (
+          metric.includes("itl") ||
+          metric.includes("tpot") ||
+          metric.includes("ttft")
+        ) {
+          requestRate = requestRate !== "" ? requestRate : "Inf";
+        }
+
+        let tensorParallel = record.extra!["tensor_parallel_size"];
+        // TODO (huydhn): Fix the passing of tensor_parallel_size to the benchmark
+        // script on vLLM side
+        if (model.includes("8B")) {
+          tensorParallel = tensorParallel !== "" ? tensorParallel : "1";
+        } else if (model.includes("70B")) {
+          tensorParallel = tensorParallel !== "" ? tensorParallel : "4";
+        } else if (model.includes("8x7B")) {
+          tensorParallel = tensorParallel !== "" ? tensorParallel : "2";
+        }
+
+        if (requestRate !== "") {
+          record.display = `${model} / tp${tensorParallel} / qps_${requestRate}`;
+        } else {
+          record.display = `${model} / tp${tensorParallel}`;
+        }
+      } else {
+        record.display = model.includes(dtype)
+          ? model.includes(device)
+            ? model
+            : `${model} (${device})`
+          : model.includes(device)
+          ? `${model} (${dtype})`
+          : `${model} (${dtype} / ${device})`;
+      }

       return record;
     });
@@ -177,7 +207,7 @@ export function GraphPanel({
         .filter((metric) => chartData[metric].length !== 0)
         .map((metric: string) => (
           <Grid2
-            size={{ xs: 12, lg: modelName === DEFAULT_MODEL_NAME ? 12 : 4 }}
+            size={{ xs: 12, lg: modelName === DEFAULT_MODEL_NAME ? 12 : 6 }}
             height={GRAPH_ROW_HEIGHT}
             key={metric}
           >
@@ -203,7 +233,7 @@ export function GraphPanel({
               },
             },
           }}
-          legendPadding={modelName === DEFAULT_MODEL_NAME ? 320 : 200}
+          legendPadding={320}
         />
       </Grid2>
     ))}
```
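Pulled out of the diff above, the vLLM display-name fallback amounts to the standalone sketch below. The tensor-parallel defaults (8B → 1, 70B → 4, 8x7B → 2) mirror the TODO workaround and should be read as assumptions tied to the current benchmark suite:

```ts
function vllmDisplayName(
  model: string,
  extra: { [key: string]: string },
  metric: string
): string {
  // Serving metrics (ITL/TPOT/TTFT) with a missing request_rate mean the
  // rate was set to Inf, which currently produces invalid JSON upstream.
  let requestRate = extra["request_rate"];
  if (["itl", "tpot", "ttft"].some((m) => metric.includes(m))) {
    requestRate = requestRate !== "" ? requestRate : "Inf";
  }

  // Fall back to per-model tensor-parallel defaults when the value is missing.
  let tensorParallel = extra["tensor_parallel_size"];
  if (model.includes("8B")) {
    tensorParallel = tensorParallel !== "" ? tensorParallel : "1";
  } else if (model.includes("70B")) {
    tensorParallel = tensorParallel !== "" ? tensorParallel : "4";
  } else if (model.includes("8x7B")) {
    tensorParallel = tensorParallel !== "" ? tensorParallel : "2";
  }

  return requestRate !== ""
    ? `${model} / tp${tensorParallel} / qps_${requestRate}`
    : `${model} / tp${tensorParallel}`;
}

// e.g. vllmDisplayName("Meta-Llama-3-70B", { request_rate: "", tensor_parallel_size: "" }, "median_ttft_ms")
//      returns "Meta-Llama-3-70B / tp4 / qps_Inf"
```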

torchci/components/benchmark/llms/SummaryPanel.tsx

Lines changed: 21 additions & 1 deletion
```diff
@@ -56,7 +56,7 @@ export function SummaryPanel({
   const rCommit = rPerfData.commit;
   const rData = rPerfData.data;

-  const data = combineLeftAndRight(lPerfData, rPerfData);
+  const data = combineLeftAndRight(repoName, lPerfData, rPerfData);
   const columns: any[] = [
     {
       field: "metadata",
@@ -138,6 +138,26 @@ export function SummaryPanel({
     });
   }

+  if (repoName === "vllm-project/vllm") {
+    columns.push({
+      field: "tensor_parallel_size",
+      headerName: "Tensor parallel",
+      flex: 1,
+      renderCell: (params: GridRenderCellParams<any>) => {
+        return `${params.value}`;
+      },
+    });
+
+    columns.push({
+      field: "request_rate",
+      headerName: "Request rate",
+      flex: 1,
+      renderCell: (params: GridRenderCellParams<any>) => {
+        return `${params.value}`;
+      },
+    });
+  }
+
   columns.push(
     ...[
       {
```
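The new columns read `tensor_parallel_size` and `request_rate` as top-level row fields, which implies `combineLeftAndRight` flattens them out of `extra`. A minimal sketch of that assumed step — the helper name and row shape are hypothetical, since the actual implementation is not shown in this diff:

```ts
// Hypothetical flattening inside combineLeftAndRight for vLLM rows.
function flattenVllmExtra(row: { extra?: { [k: string]: string } }) {
  return {
    ...row,
    tensor_parallel_size: row.extra?.["tensor_parallel_size"] ?? "",
    request_rate: row.extra?.["request_rate"] ?? "",
  };
}
```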

torchci/components/benchmark/llms/common.tsx

Lines changed: 29 additions & 2 deletions
```diff
@@ -5,8 +5,17 @@ export const REPO_TO_BENCHMARKS: { [k: string]: string[] } = {
   "pytorch/pytorch": ["PyTorch gpt-fast benchmark"],
   "pytorch/executorch": ["ExecuTorch"],
   "pytorch/ao": ["TorchAO benchmark"],
+  "vllm-project/vllm": ["vLLM benchmark"],
 };
-export const EXCLUDED_METRICS: string[] = ["load_status"];
+export const EXCLUDED_METRICS: string[] = [
+  "load_status",
+  "mean_itl_ms",
+  "mean_tpot_ms",
+  "mean_ttft_ms",
+  "std_itl_ms",
+  "std_tpot_ms",
+  "std_ttft_ms",
+];
 export const DEFAULT_MODEL_NAME = "All Models";
 export const SCALE = 2;
 export const METRIC_DISPLAY_HEADERS: { [k: string]: string } = {
@@ -17,6 +26,15 @@ export const METRIC_DISPLAY_HEADERS: { [k: string]: string } = {
   compile_vs_eager_speedup: "Compile vs eager speedup",
   autoquant_vs_compile_speedup: "Autoquant vs compile speedup",
   eager_speedup: "Eager speedup",
+  latency: "Latency (s)",
+  median_itl_ms: "Median ITL (ms)",
+  median_tpot_ms: "Median TPOT (ms)",
+  median_ttft_ms: "Median TTFT (ms)",
+  p99_itl_ms: "p99 ITL (ms)",
+  p99_tpot_ms: "p99 TPOT (ms)",
+  p99_ttft_ms: "p99 TTFT (ms)",
+  requests_per_second: "Requests/s",
+  tokens_per_second: "Tokens/s",
 };
 // The variable name is a bit dumb, but it tells if a higher metric value
 // is good or bad so that we can highlight it on the dashboard accordingly.
@@ -32,6 +50,15 @@ export const IS_INCREASING_METRIC_VALUE_GOOD: { [k: string]: boolean } = {
   "peak_inference_mem_usage(mb)": false,
   "peak_load_mem_usuage(mb)": false,
   "generate_time(ms)": false,
+  latency: false,
+  median_itl_ms: false,
+  median_tpot_ms: false,
+  median_ttft_ms: false,
+  p99_itl_ms: false,
+  p99_tpot_ms: false,
+  p99_ttft_ms: false,
+  requests_per_second: true,
+  tokens_per_second: true,
 };
 export const METRIC_DISPLAY_SHORT_HEADERS: { [k: string]: string } = {
   "memory_bandwidth(GB/s)": "Bandwidth",
@@ -71,7 +98,7 @@ export interface LLMsBenchmarkData {
   device: string;
   arch: string;
   display?: string;
-  use_torch_compile?: boolean;
+  extra?: { [key: string]: string };
 }

 export interface BranchAndCommitPerfData extends BranchAndCommit {
```
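The direction table exists so the dashboard can highlight whether a delta is good or bad. A hypothetical helper (not part of this PR) showing how it is meant to be read:

```ts
// Decide whether a metric change between two runs is an improvement.
function isImprovement(
  metric: string,
  oldValue: number,
  newValue: number
): boolean {
  const higherIsBetter = IS_INCREASING_METRIC_VALUE_GOOD[metric];
  if (higherIsBetter === undefined) {
    return false; // unknown metric: no signal either way
  }
  return higherIsBetter ? newValue > oldValue : newValue < oldValue;
}

// isImprovement("median_ttft_ms", 120, 95)       -> true  (lower TTFT is better)
// isImprovement("tokens_per_second", 4200, 3900) -> false (throughput dropped)
```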

torchci/lib/benchmark/aoUtils.ts

Lines changed: 4 additions & 5 deletions
```diff
@@ -84,10 +84,8 @@ export function computeSpeedup(
   const currentCommitBaseline: { [key: string]: LLMsBenchmarkData } = {};

   data.forEach((r: LLMsBenchmarkData) => {
-    if (
-      r.dtype !== TORCHAO_BASELINE ||
-      r.use_torch_compile !== useTorchCompile
-    ) {
+    const compile = r.extra?.use_torch_compile === "true";
+    if (r.dtype !== TORCHAO_BASELINE || compile !== useTorchCompile) {
       return;
     }

@@ -112,8 +110,9 @@ export function computeSpeedup(
   data.forEach((r: LLMsBenchmarkData) => {
     withSpeedup.push(r);

+    const compile = r.extra?.use_torch_compile === "true";
     // Compute eager speedup vs the base commit baseline
-    if (r.dtype === TORCHAO_BASELINE && r.use_torch_compile === false) {
+    if (r.dtype === TORCHAO_BASELINE && compile === false) {
       if (SPEEDUP_METRICS.includes(r.metric)) {
         const k = `${r.model} ${r.metric} ${r.device} ${r.arch}`;
         if (
```
