From 5af6f15f0dd5a71a618614636aa317030d4048c9 Mon Sep 17 00:00:00 2001
From: Yukun He <23156053+hyukn@users.noreply.github.com>
Date: Mon, 28 Jul 2025 01:07:30 -0700
Subject: [PATCH] chore: Improve the AutoTuner log information.

* Change the fallback alert from DEBUG to WARNING level and emit it only
  once per custom op.
* Dump the profiling cache at DEBUG level right after the warmup phase.
* Lower the exception message emitted during tactic profiling from ERROR
  to WARNING; full exception details are pushed to DEBUG.
* Other minor refinements.

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
---
 tensorrt_llm/_torch/autotuner.py              | 59 +++++++++++++------
 .../_torch/pyexecutor/model_engine.py         |  9 +--
 2 files changed, 47 insertions(+), 21 deletions(-)
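Reviewer note: the hunks below consistently split diagnostics into a short, actionable message at WARNING and the full detail at DEBUG (selected through TLLM_LOG_LEVEL). As a rough illustration of that split, here is a minimal sketch built on Python's standard logging module; it is not the tensorrt_llm logger, and profile_tactic_safely / run_tactic are made-up names:

# Illustration only: WARNING-summary / DEBUG-detail split, using the standard
# logging module rather than tensorrt_llm's logger.
import logging

demo_logger = logging.getLogger("autotuner_demo")
logging.basicConfig(level=logging.WARNING)  # switch to logging.DEBUG to surface details

def profile_tactic_safely(run_tactic, tactic_id):
    """Run one (hypothetical) tactic; never let a profiling failure abort tuning."""
    try:
        return run_tactic()
    except Exception as e:
        # Short, actionable message at WARNING level ...
        demo_logger.warning("Profiling failed for tactic %s; enable DEBUG for details.", tactic_id)
        # ... full exception detail only at DEBUG level.
        demo_logger.debug("Exception captured for tactic %s: %r", tactic_id, e)
        return None

The same idea applies to the except branch in choose_one below: the WARNING line points the user at DEBUG, and the captured exception is only emitted there.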
diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py
index 090bcd42b2f..89d866ee9a1 100644
--- a/tensorrt_llm/_torch/autotuner.py
+++ b/tensorrt_llm/_torch/autotuner.py
@@ -197,13 +197,13 @@ def autotune(tune_mode: bool = True):
     AutoTuner.get().is_tuning_mode = tune_mode
     autotune_enabled = tune_mode and not old_mode
     if autotune_enabled:
-        logger.info("[Autotuner]: Autotuning process starts ...")
+        logger.info("[Autotuner] Autotuning process starts ...")
     try:
         yield
     finally:
         AutoTuner.get().is_tuning_mode = old_mode
         if autotune_enabled:
-            logger.info("[Autotuner]: Autotuning process ends")
+            logger.info("[Autotuner] Autotuning process ends")
 
 
 @dataclass
@@ -350,16 +350,11 @@ def choose_one(self, custom_op: str, runners: List[TunableRunner],
             runner = runners[runner_id]
             # TODO: check the stored runner and tactic can implement this shape here
             # Should not directly try (runner, tactic) here, or it will hurt a lot of inference perf.
-
-            # Record the cache miss config.
-            # Expect no cache miss in inference. Thus, any cache miss should be recorded.
-            if not is_cache_hit:
-                logger.debug(
-                    f"[AutoTunner]: Using fallback tactic for {custom_op} with input shapes {input_shapes}"
-                )
-                logger.debug(
-                    f"[AutoTunner]: Generated key{AutoTuner._get_cache_key(custom_op, runners[0], input_shapes, tuning_config)}"
-                )
+            if not is_cache_hit and len(self.profiling_cache) > 0:
+                # Only log once for each custom op, and only when the cache is not empty
+                logger.warning_once(
+                    f"[Autotuner] Using the fallback tactic due to a cache miss on input shapes={input_shapes}",
+                    key=custom_op)
             return runner, tactic
 
         assert len(runners) > 0, "At least one runner is required"
@@ -370,6 +365,8 @@ def choose_one(self, custom_op: str, runners: List[TunableRunner],
         # Record the total configs to try
         self.stats.tuned_op_total_configs[custom_op] = len(profiles)
 
+        new_tuning_failure_occurred = False
+
         for p in profiles:
             tensors = self._prepare_input_tensors(p, inputs)
             is_cache_hit, runner_id, tactic, _ = self.search_cache(
@@ -396,11 +393,13 @@ def choose_one(self, custom_op: str, runners: List[TunableRunner],
                 except Exception as e:
                     shapes = self._get_input_sizes(tensors)
 
-                    logger.error(
-                        f"[Autotuner]: Failed when profiling {r} {tac}, shapes={shapes}. Error occurred: {e}"
+                    logger.warning(
+                        f"[Autotuner] Failed when profiling runner={r}, tactic={tac}, shapes={shapes}. Set TLLM_LOG_LEVEL=DEBUG for more details."
                     )
+                    logger.debug(f"[Autotuner] Exception captured: {e}")
 
                     # Record the failed profiling combinations
+                    new_tuning_failure_occurred = True
                     if custom_op not in self.stats.failed_profiling_count:
                         self.stats.failed_profiling_count[
                             custom_op] = set()
@@ -426,9 +425,26 @@ def choose_one(self, custom_op: str, runners: List[TunableRunner],
                     custom_op] = self.stats.tuned_op_successful_configs.get(
                         custom_op, 0) + 1
                 logger.debug(
-                    f"[Autotuner]: profiling chosen runner: {runners[runner_id]} {tactic} for {cache_key}"
+                    f"[Autotuner] Chose runner={runners[runner_id]}, tactic={tactic} for cache_key={cache_key}."
+                )
+            else:
+                logger.warning(
+                    f"[Autotuner] No valid runner/tactic was found for custom_op={custom_op}, input_shapes={input_shapes}. "
+                    f"At least one valid (runner, tactic) pair is required. "
+                    f"If get_valid_tactics is intended to return an empty list, please ensure that this profile is not valid for the custom_op "
+                    f"and does not occur during the inference stage, or that a fallback tactic is implemented. Otherwise, the tuning process will crash."
                 )
 
+        # If any tactic profiling failed, log a summary warning.
+        if new_tuning_failure_occurred:
+            logger.warning(
+                f"[Autotuner] New tuning failures occurred: "
+                f"{len(self.stats.failed_profiling_count[custom_op])} (runner, tactic) combinations failed during profiling for custom_op={custom_op}. "
+                f"This will not block the tuning process. "
+                f"Set TLLM_LOG_LEVEL=WARNING to see each profiling failure as it happens, "
+                f"or TLLM_LOG_LEVEL=DEBUG for full details of the failures."
+            )
+
         # Get the best runner and tactic from cache
         # If no valid tactic is found, the fallback runner and tactic will be used
         _, runner_id, tactic, _ = self.search_cache(custom_op, runners,
@@ -487,7 +503,7 @@ def _profile_single_kernel(self, runner: TunableRunner,
 
         shapes = self._get_input_sizes(inputs)
         logger.debug(
-            f"[Autotuner]: profiling {runner} {tactic}, shapes={shapes}, avg_time {avg_time}"
+            f"[Autotuner] Profiled runner={runner}, tactic={tactic}, shapes={shapes}: {avg_time}ms."
         )
 
         return avg_time
@@ -557,7 +573,7 @@ def _optimization_profiles(
                 p.shapes[spec.input_idx][spec.dim_idx] = DynamicDim(
                     min_value, opt_value, max_value)
             generated_profiles.append(p)
-            logger.debug(f"[Autotuner]: generated profile: {p}")
+            logger.debug(f"[Autotuner] Generated profile: {p}")
         return generated_profiles
 
     @classmethod
@@ -649,3 +665,12 @@ def clear_cache(self) -> None:
     def reset_statistics(self) -> None:
         """Reset all statistics counters."""
         self.stats = AutoTunerStatistics()
+
+    def print_profiling_cache(self):
+        logger.debug(f"[Autotuner] Profiling cache entries:")
+        logger.debug(
+            f"[Autotuner] Cache contents: (custom_op, runner, hash(attributes), shape_profiles) -> (runner_id, tactic, shape_profile(ignored))"
+        )
+        for key, value in self.profiling_cache.items():
+            runner_id, tactic, _ = value
+            logger.debug(f"[Autotuner] {key}: ({runner_id}, {tactic})")
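Reviewer note: the cache-miss hunk above relies on logger.warning_once(..., key=custom_op) so that the fallback alert fires at most once per custom op. A minimal sketch of that kind of keyed deduplication, assuming a plain logging.Logger rather than the tensorrt_llm logger, could look like this:

# Sketch of a once-per-key warning, mirroring how warning_once(msg, key=...) is
# used above. Illustration only; not the tensorrt_llm logger implementation.
import logging

class OncePerKeyLogger:
    def __init__(self, base: logging.Logger):
        self._base = base
        self._seen = set()

    def warning_once(self, msg: str, *, key) -> None:
        """Emit `msg` at WARNING level only the first time `key` is seen."""
        if key not in self._seen:
            self._seen.add(key)
            self._base.warning(msg)

# Usage: the second call with the same key is suppressed.
log = OncePerKeyLogger(logging.getLogger("autotuner_demo"))
log.warning_once("[Autotuner] Using the fallback tactic due to a cache miss.", key="my_op")
log.warning_once("[Autotuner] Using the fallback tactic due to a cache miss.", key="my_op")  # suppressed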
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 3ac132767fd..527b32259da 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -693,15 +693,16 @@ def disable_optimization(backend: Backend):
                     # No KV cache space!
                     pass
                 else:
-                    logger.info(
-                        f"Run autotuning warmup for batch size={1}")
                     self.forward(batch,
                                  new_tensors_device=None,
                                  resource_manager=resource_manager)
                     torch.cuda.synchronize()
 
-                    logger.info(f"Autotuner Cache size after warmup " +
-                                str(len(AutoTuner.get().profiling_cache)))
+                    logger.info(
+                        f"[Autotuner] Cache size after warmup is {len(AutoTuner.get().profiling_cache)}"
+                    )
+
+                    AutoTuner.get().print_profiling_cache()
 
             if not (self._run_cuda_graphs
                     or self._torch_compile_piecewise_cuda_graph):
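Reviewer note: to put the pieces together, here is a rough sketch of the warmup-and-report flow this patch instruments, assuming warmup runs under the autotune() context manager so tuning mode is active. run_one_warmup_batch is a hypothetical stand-in for the engine's forward pass; autotune(), AutoTuner.get(), profiling_cache and print_profiling_cache() are the APIs touched above, and the cache dump itself is only visible with TLLM_LOG_LEVEL=DEBUG:

# Rough sketch of the warmup/reporting flow instrumented by this patch.
# `run_one_warmup_batch` is a hypothetical stand-in for the engine's forward
# pass; autotune(), AutoTuner.get(), profiling_cache and print_profiling_cache()
# are the AutoTuner APIs referenced in the diff.
from tensorrt_llm._torch.autotuner import AutoTuner, autotune

def warmup_and_report(run_one_warmup_batch) -> None:
    with autotune():               # tuning mode: tactics are profiled and cached here
        run_one_warmup_batch()     # hypothetical stand-in for self.forward(...)

    tuner = AutoTuner.get()
    # Mirrors the INFO/DEBUG reporting added in model_engine.py above.
    print(f"[Autotuner] Cache size after warmup is {len(tuner.profiling_cache)}")
    tuner.print_profiling_cache()  # entries are logged at DEBUG level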