@@ -16,6 +16,7 @@
 import platform
 import re
 import shutil
+import statistics
 import sys
 import tempfile
 import textwrap
@@ -169,6 +170,77 @@ class GraphPartitionMap:
     constant_names: list[str]


+def fp8_bench(fn: Callable[[], Any], warmup: int = 25, rep: int = 100) -> float:
+    """
+    Return the mean runtime of fn in milliseconds, using torch profiler
+    events to exclude the fp8 amax ("fused_abs_max") kernels from the
+    measurement. Examining profiler events can be more accurate as it
+    doesn't count CPU-side overhead. However, it also requires manually
+    excluding irrelevant events, e.g. the vectorized_elementwise_kernel
+    used to fill the L2 cache, various CUDA events, etc., so it can also
+    be fragile.
+    """
+
+    # Run once before estimating so one-time compilation cost is not timed
+    fn()
+    torch.cuda.synchronize()
+    # ~128 MB fp16 buffer; zeroing it before each call flushes the L2 cache
+    cache = torch.empty(int(256e6 // 4), dtype=torch.float16, device="cuda")
+
+    # Estimate the runtime of the function
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(5):
+        cache.zero_()
+        fn()
+    end_event.record()
+    torch.cuda.synchronize()
+    estimate_ms = start_event.elapsed_time(end_event) / 5
+
+    # Compute the number of warmup and repeat iterations so that each phase
+    # runs for roughly `warmup` ms and `rep` ms respectively
+    n_warmup = max(1, int(warmup / estimate_ms))
+    n_repeat = max(1, int(rep / estimate_ms))
+
+    # Warm-up
+    for _ in range(n_warmup):
+        fn()
+
+    start_event = [torch.cuda.Event(enable_timing=True) for _ in range(n_repeat)]
+    end_event = [torch.cuda.Event(enable_timing=True) for _ in range(n_repeat)]
+    # Time each iteration with CUDA events while the profiler records
+    # kernel-level activity for the filtering step below
+    with torch.profiler.profile(
+        activities=[
+            torch.profiler.ProfilerActivity.CUDA,
+        ]
+    ) as p:
+        torch.cuda.synchronize()
+        for i in range(n_repeat):
+            cache.zero_()
+            start_event[i].record()
+            with torch.cuda.nvtx.range("RunCudaModule"):
+                fn()
+            end_event[i].record()
+        torch.cuda.synchronize()
+    times = torch.tensor(
+        [s.elapsed_time(e) for s, e in zip(start_event, end_event)]
+    )
+
+    res = torch.mean(times).item()
+    log.debug("raw events")
+    log.debug(p.key_averages().table(sort_by="self_device_time_total", row_limit=-1))
+    # Keep only the CUDA-side amax kernels and deduct their mean device time
+    # (device_time_total is in us, hence the division by 1000) from the result
+    filtered_events = EventList(
+        [
+            event
+            for event in p.events()
+            if event.device_type == DeviceType.CUDA and "fused_abs_max_0" in event.name
+        ]
+    )
+    if filtered_events:
+        res -= (
+            statistics.mean(event.device_time_total for event in filtered_events)
+            / 1000.0
+        )
+
+    log.debug("profiling results: %s ms", res)
+    return res
+
+
 def do_bench_using_profiling(
     fn: Callable[[], Any], warmup: int = 25, rep: int = 100
 ) -> float:
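For context, a sketch of how fp8_bench might be invoked. The workload below is hypothetical; fp8_bench itself only assumes a zero-argument callable that launches CUDA work, and torch._scaled_mm with float8_e4m3fn inputs is just one plausible fp8 kernel to measure:

    import torch

    # Hypothetical fp8 matmul workload (any zero-argument CUDA callable works)
    a = torch.randn(4096, 4096, device="cuda").to(torch.float8_e4m3fn)
    b = torch.randn(4096, 4096, device="cuda").to(torch.float8_e4m3fn).t()
    scale = torch.tensor(1.0, device="cuda")

    def run() -> torch.Tensor:
        return torch._scaled_mm(
            a, b, scale_a=scale, scale_b=scale, out_dtype=torch.bfloat16
        )

    ms = fp8_bench(run)  # mean ms per call, minus any fused_abs_max kernel time

Subtracting the amax kernels only matters for workloads whose compiled graph actually contains a "fused_abs_max_0" kernel; for anything else the filtered event list is empty and the function degenerates to plain CUDA-event timing.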