From 72726b78e028a14bd9a7d9bf2cbd2b4b55ec7373 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 17 Jul 2025 17:25:06 +0100 Subject: [PATCH 1/5] Remove unnecessary file from root dir Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .yapfignore | 1 - pyproject.toml | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 .yapfignore diff --git a/.yapfignore b/.yapfignore deleted file mode 100644 index 2d6dcf8380ca..000000000000 --- a/.yapfignore +++ /dev/null @@ -1 +0,0 @@ -collect_env.py diff --git a/pyproject.toml b/pyproject.toml index 85a112ff51cf..7bd1c59a0931 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ ignore_patterns = [ "benchmarks/**", "build/**", "examples/**", + "vllm/collect_env.py", ] [tool.ruff] From ceb7cc265fbe5a9a0147467a85f59c852d5e0eb3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 17 Jul 2025 17:25:06 +0100 Subject: [PATCH 2/5] Migrate `tests/` from `yapf` to `ruff-format` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 17 +++++++++++-- pyproject.toml | 2 ++ tests/pyproject.toml | 54 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 tests/pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5197820fb402..43418dd8fd79 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,14 +12,27 @@ repos: - id: yapf args: [--in-place, --verbose] # Keep the same list from yapfignore here to avoid yapf failing without any inputs - exclude: '(.buildkite|benchmarks|build|examples)/.*' + exclude: | + (?x)^( + .buildkite| + benchmarks| + build| + examples| + tests + )/.*$ - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.11.7 hooks: - id: ruff args: [--output-format, github, --fix] - id: ruff-format - files: ^(.buildkite|benchmarks|examples)/.* + files: | + (?x)^( + .buildkite| + benchmarks| + examples| + tests + )/.*$ - repo: https://github.com/crate-ci/typos rev: v1.34.0 hooks: diff --git a/pyproject.toml b/pyproject.toml index 7bd1c59a0931..3ce2bb1c09a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ ignore_patterns = [ "benchmarks/**", "build/**", "examples/**", + "tests/**", "vllm/collect_env.py", ] @@ -143,6 +144,7 @@ skip_glob = [ ".buildkite/*", "benchmarks/*", "examples/*", + "tests/*", ] use_parentheses = true skip_gitignore = true diff --git a/tests/pyproject.toml b/tests/pyproject.toml new file mode 100644 index 000000000000..f825cb203269 --- /dev/null +++ b/tests/pyproject.toml @@ -0,0 +1,54 @@ +# This local pyproject file is part of the migration from yapf to ruff format. 
+# It uses the same core rules as the main pyproject.toml file, but with the +# following differences: +# - ruff line length is overridden to 88 +# - deprecated typing ignores (UP006, UP035) have been removed + +[tool.ruff] +line-length = 88 +exclude = [ + # External file, leaving license intact + "examples/other/fp8/quantizer/quantize.py", + "vllm/vllm_flash_attn/flash_attn_interface.pyi" +] + +[tool.ruff.lint.per-file-ignores] +"vllm/third_party/**" = ["ALL"] +"vllm/version.py" = ["F401"] +"vllm/_version.py" = ["ALL"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-logging-format + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # Can remove once 3.10+ is the minimum Python version + "UP007", +] + +[tool.ruff.lint.isort] +known-first-party = ["vllm"] + +[tool.ruff.format] +docstring-code-format = true \ No newline at end of file From e4c86294e6962379ca3f9fe671d9dedbe5076689 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 28 Jul 2025 11:31:47 +0100 Subject: [PATCH 3/5] `pre-commit run -a` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/async_engine/api_server_async_engine.py | 14 +- tests/async_engine/conftest.py | 2 +- tests/async_engine/test_api_server.py | 31 +- tests/async_engine/test_async_llm_engine.py | 60 +- tests/async_engine/test_request_tracker.py | 3 +- .../test_basic_correctness.py | 133 +- .../basic_correctness/test_chunked_prefill.py | 136 +- tests/basic_correctness/test_cpu_offload.py | 5 +- tests/basic_correctness/test_cumem.py | 23 +- tests/basic_correctness/test_preemption.py | 87 +- tests/benchmarks/test_latency_cli.py | 14 +- tests/benchmarks/test_serve_cli.py | 4 +- tests/benchmarks/test_throughput_cli.py | 14 +- tests/build_cython.py | 15 +- tests/compile/backend.py | 17 +- .../compile/piecewise/test_full_cudagraph.py | 108 +- tests/compile/piecewise/test_simple.py | 59 +- tests/compile/piecewise/test_toy_llama.py | 290 ++-- tests/compile/test_async_tp.py | 139 +- tests/compile/test_basic_correctness.py | 42 +- tests/compile/test_config.py | 55 +- tests/compile/test_full_graph.py | 111 +- tests/compile/test_functionalization.py | 62 +- tests/compile/test_fusion.py | 80 +- tests/compile/test_fusion_all_reduce.py | 118 +- tests/compile/test_fusion_attn.py | 57 +- tests/compile/test_pass_manager.py | 9 +- tests/compile/test_sequence_parallelism.py | 213 +-- tests/compile/test_silu_mul_quant_fusion.py | 50 +- tests/compile/test_wrapper.py | 13 +- tests/config/test_config_generation.py | 10 +- tests/config/test_mp_reducer.py | 14 +- tests/conftest.py | 414 +++--- tests/core/block/e2e/conftest.py | 36 +- tests/core/block/e2e/test_correctness.py | 385 +++--- .../e2e/test_correctness_sliding_window.py | 86 +- tests/core/block/test_block_manager.py | 182 +-- tests/core/block/test_block_table.py | 201 +-- .../block/test_cpu_gpu_block_allocator.py | 31 +- tests/core/block/test_naive_block.py | 96 +- tests/core/block/test_prefix_caching_block.py | 382 +++--- tests/core/conftest.py | 2 +- tests/core/test_chunked_prefill_scheduler.py | 173 ++- tests/core/test_num_computed_tokens_update.py | 49 +- tests/core/test_scheduler.py | 518 ++++---- tests/core/test_scheduler_encoder_decoder.py | 51 +- tests/core/test_serialization.py 
| 14 +- tests/core/utils.py | 109 +- tests/cuda/test_cuda_context.py | 45 +- .../test_disable_detokenization.py | 20 +- tests/detokenizer/test_stop_checker.py | 32 +- tests/detokenizer/test_stop_reason.py | 37 +- tests/detokenizer/test_stop_strings.py | 100 +- tests/distributed/conftest.py | 6 +- tests/distributed/test_ca_buffer_sharing.py | 6 +- tests/distributed/test_comm_ops.py | 106 +- tests/distributed/test_custom_all_reduce.py | 45 +- tests/distributed/test_distributed_oot.py | 3 +- tests/distributed/test_eplb_algo.py | 178 +-- tests/distributed/test_eplb_execute.py | 181 +-- tests/distributed/test_events.py | 53 +- tests/distributed/test_expert_parallel.py | 51 +- .../distributed/test_multi_node_assignment.py | 10 +- tests/distributed/test_node_count.py | 15 +- tests/distributed/test_pipeline_parallel.py | 215 +-- tests/distributed/test_pipeline_partition.py | 4 +- tests/distributed/test_pp_cudagraph.py | 20 +- tests/distributed/test_pynccl.py | 234 ++-- tests/distributed/test_quick_all_reduce.py | 74 +- tests/distributed/test_same_node.py | 6 +- tests/distributed/test_sequence_parallel.py | 139 +- tests/distributed/test_shm_broadcast.py | 24 +- tests/distributed/test_torchrun_example.py | 30 +- tests/distributed/test_utils.py | 67 +- tests/encoder_decoder/test_e2e_correctness.py | 52 +- tests/engine/conftest.py | 2 +- tests/engine/test_arg_utils.py | 175 +-- tests/engine/test_computed_prefix_blocks.py | 12 +- tests/engine/test_executor.py | 22 +- .../test_multi_step_output_processor.py | 79 +- tests/engine/test_multiproc_workers.py | 20 +- tests/engine/test_options.py | 15 +- tests/engine/test_short_mm_context.py | 21 +- tests/entrypoints/conftest.py | 139 +- tests/entrypoints/llm/test_accuracy.py | 19 +- tests/entrypoints/llm/test_chat.py | 85 +- tests/entrypoints/llm/test_collective_rpc.py | 12 +- tests/entrypoints/llm/test_encode.py | 43 +- tests/entrypoints/llm/test_generate.py | 39 +- .../llm/test_generate_multiple_loras.py | 23 +- tests/entrypoints/llm/test_gpu_utilization.py | 5 +- tests/entrypoints/llm/test_guided_generate.py | 377 +++--- tests/entrypoints/llm/test_lazy_outlines.py | 40 +- .../entrypoints/llm/test_prompt_validation.py | 4 +- .../offline_mode/test_offline_mode.py | 10 +- .../openai/correctness/test_lmeval.py | 26 +- .../openai/correctness/test_mteb_embed.py | 14 +- .../openai/correctness/test_mteb_score.py | 22 +- .../test_transcription_api_correctness.py | 58 +- .../openai/test_async_tokenization.py | 50 +- tests/entrypoints/openai/test_audio.py | 274 ++-- tests/entrypoints/openai/test_basic.py | 58 +- tests/entrypoints/openai/test_chat.py | 802 ++++++------ tests/entrypoints/openai/test_chat_echo.py | 21 +- .../openai/test_chat_logit_bias_validation.py | 10 +- .../entrypoints/openai/test_chat_template.py | 74 +- .../openai/test_chat_with_tool_reasoning.py | 101 +- .../entrypoints/openai/test_chunked_prompt.py | 27 +- .../entrypoints/openai/test_classification.py | 76 +- tests/entrypoints/openai/test_cli_args.py | 119 +- tests/entrypoints/openai/test_completion.py | 399 +++--- .../test_completion_with_function_calling.py | 80 +- .../test_completion_with_prompt_embeds.py | 61 +- .../openai/test_default_mm_loras.py | 43 +- tests/entrypoints/openai/test_embedding.py | 230 ++-- .../openai/test_embedding_dimensions.py | 36 +- .../openai/test_encoder_decoder.py | 10 +- .../entrypoints/openai/test_lora_adapters.py | 175 ++- .../entrypoints/openai/test_lora_resolvers.py | 58 +- tests/entrypoints/openai/test_metrics.py | 137 +- 
tests/entrypoints/openai/test_models.py | 4 +- .../openai/test_oot_registration.py | 11 +- .../entrypoints/openai/test_openai_schema.py | 19 +- .../openai/test_optional_middleware.py | 12 +- tests/entrypoints/openai/test_pooling.py | 175 +-- .../openai/test_prompt_validation.py | 35 +- tests/entrypoints/openai/test_rerank.py | 67 +- .../openai/test_return_tokens_as_ids.py | 42 +- tests/entrypoints/openai/test_root_path.py | 37 +- tests/entrypoints/openai/test_run_batch.py | 124 +- tests/entrypoints/openai/test_score.py | 146 ++- tests/entrypoints/openai/test_serving_chat.py | 168 +-- .../entrypoints/openai/test_serving_models.py | 61 +- tests/entrypoints/openai/test_shutdown.py | 9 +- tests/entrypoints/openai/test_sleep.py | 27 +- .../openai/test_tensorizer_entrypoint.py | 29 +- tests/entrypoints/openai/test_tokenization.py | 182 ++- .../openai/test_transcription_validation.py | 107 +- .../openai/test_translation_validation.py | 60 +- tests/entrypoints/openai/test_truncation.py | 23 +- tests/entrypoints/openai/test_video.py | 276 ++-- tests/entrypoints/openai/test_vision.py | 294 ++--- .../openai/test_vision_embedding.py | 45 +- .../test_hunyuan_a13b_tool_parser.py | 190 +-- .../test_llama4_pythonic_tool_parser.py | 249 ++-- .../tool_parsers/test_pythonic_tool_parser.py | 189 ++- .../entrypoints/openai/tool_parsers/utils.py | 92 +- .../test_api_server_process_manager.py | 68 +- tests/entrypoints/test_chat_utils.py | 1162 +++++++---------- tests/entrypoints/test_ssl_cert_refresher.py | 3 +- .../test_fastsafetensors_loader.py | 3 +- .../test_weight_utils.py | 26 +- tests/kernels/allclose_default.py | 6 +- tests/kernels/attention/conftest.py | 3 +- tests/kernels/attention/test_attention.py | 199 ++- .../attention/test_attention_selector.py | 205 +-- .../attention/test_blocksparse_attention.py | 83 +- tests/kernels/attention/test_cache.py | 410 +++--- .../attention/test_cascade_flash_attn.py | 62 +- .../attention/test_encoder_decoder_attn.py | 427 +++--- tests/kernels/attention/test_flash_attn.py | 132 +- tests/kernels/attention/test_flashinfer.py | 285 ++-- ...test_flashinfer_trtllm_decode_attention.py | 56 +- tests/kernels/attention/test_flashmla.py | 60 +- .../kernels/attention/test_lightning_attn.py | 100 +- .../attention/test_merge_attn_states.py | 218 ++-- tests/kernels/attention/test_mha_attn.py | 19 +- .../kernels/attention/test_mla_decode_cpu.py | 34 +- .../kernels/attention/test_prefix_prefill.py | 394 +++--- .../attention/test_rocm_attention_selector.py | 48 +- .../attention/test_triton_decode_attention.py | 12 +- .../test_triton_unified_attention.py | 62 +- tests/kernels/core/test_activation.py | 43 +- .../core/test_fused_quant_layernorm.py | 118 +- tests/kernels/core/test_layernorm.py | 72 +- tests/kernels/core/test_opcheck.py | 1 - tests/kernels/core/test_permute_cols.py | 8 +- tests/kernels/core/test_pos_encoding.py | 227 ++-- tests/kernels/core/test_rotary_embedding.py | 80 +- tests/kernels/core/test_uva.py | 18 +- tests/kernels/mamba/test_causal_conv1d.py | 260 ++-- tests/kernels/mamba/test_mamba_mixer2.py | 85 +- tests/kernels/mamba/test_mamba_ssm.py | 595 +++++---- tests/kernels/mamba/test_mamba_ssm_ssd.py | 215 ++- .../moe/modular_kernel_tools/cli_args.py | 95 +- .../moe/modular_kernel_tools/common.py | 258 ++-- .../make_feature_matrix.py | 124 +- .../moe/modular_kernel_tools/mk_objects.py | 83 +- .../modular_kernel_tools/parallel_utils.py | 28 +- .../profile_modular_kernel.py | 53 +- .../kernels/moe/modular_kernel_tools/utils.py | 56 +- tests/kernels/moe/parallel_utils.py 
| 105 +- tests/kernels/moe/test_batched_moe.py | 106 +- tests/kernels/moe/test_block_fp8.py | 126 +- tests/kernels/moe/test_block_int8.py | 50 +- .../moe/test_count_expert_num_tokens.py | 114 +- .../kernels/moe/test_cutlass_grouped_gemm.py | 71 +- tests/kernels/moe/test_cutlass_moe.py | 352 ++--- tests/kernels/moe/test_deepep_deepgemm_moe.py | 306 +++-- tests/kernels/moe/test_deepep_moe.py | 284 ++-- tests/kernels/moe/test_deepgemm.py | 48 +- .../moe/test_modular_kernel_combinations.py | 119 +- tests/kernels/moe/test_moe.py | 440 ++++--- .../kernels/moe/test_moe_align_block_size.py | 54 +- .../kernels/moe/test_moe_permute_unpermute.py | 243 ++-- tests/kernels/moe/test_mxfp4_moe.py | 42 +- tests/kernels/moe/test_nvfp4_moe.py | 110 +- tests/kernels/moe/test_pplx_cutlass_moe.py | 204 +-- tests/kernels/moe/test_pplx_moe.py | 320 +++-- tests/kernels/moe/test_rocm_aiter_topk.py | 199 +-- .../moe/test_silu_mul_fp8_quant_deep_gemm.py | 16 +- tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 42 +- tests/kernels/moe/utils.py | 141 +- tests/kernels/quant_utils.py | 162 +-- tests/kernels/quantization/nvfp4_utils.py | 14 +- .../quantization/test_allspark_gemm.py | 80 +- tests/kernels/quantization/test_aqlm.py | 43 +- tests/kernels/quantization/test_awq.py | 52 +- tests/kernels/quantization/test_awq_triton.py | 105 +- tests/kernels/quantization/test_block_fp8.py | 56 +- tests/kernels/quantization/test_block_int8.py | 23 +- .../quantization/test_cutlass_2of4_sparse.py | 179 ++- .../quantization/test_cutlass_scaled_mm.py | 589 +++++---- tests/kernels/quantization/test_fp8_quant.py | 97 +- tests/kernels/quantization/test_ggml.py | 53 +- tests/kernels/quantization/test_gguf.py | 108 +- tests/kernels/quantization/test_gptq.py | 29 +- .../kernels/quantization/test_int8_kernel.py | 44 +- tests/kernels/quantization/test_int8_quant.py | 98 +- tests/kernels/quantization/test_machete_mm.py | 300 +++-- .../kernels/quantization/test_marlin_gemm.py | 322 +++-- .../kernels/quantization/test_nvfp4_quant.py | 44 +- .../quantization/test_nvfp4_scaled_mm.py | 85 +- .../quantization/test_rocm_skinny_gemms.py | 22 +- .../quantization/test_triton_scaled_mm.py | 55 +- .../test_apply_repetition_penalties.py | 64 +- tests/kernels/test_cutlass_mla_decode.py | 47 +- tests/kernels/test_flex_attention.py | 13 +- tests/kernels/test_fused_quant_activation.py | 20 +- tests/kernels/test_triton_flash_attention.py | 530 ++++---- tests/kernels/utils.py | 649 ++++----- tests/kv_transfer/test_disagg.py | 47 +- tests/kv_transfer/test_lookup_buffer.py | 23 +- tests/kv_transfer/test_module.py | 25 +- tests/kv_transfer/test_send_recv.py | 30 +- tests/lora/conftest.py | 131 +- tests/lora/test_add_lora.py | 47 +- tests/lora/test_baichuan.py | 81 +- tests/lora/test_chatglm3_tp.py | 66 +- tests/lora/test_default_mm_loras.py | 13 +- tests/lora/test_layers.py | 837 ++++++------ tests/lora/test_llama_tp.py | 179 ++- tests/lora/test_lora_allowed_token_ids.py | 32 +- tests/lora/test_lora_checkpoints.py | 42 +- tests/lora/test_lora_functions.py | 44 +- tests/lora/test_lora_huggingface.py | 11 +- tests/lora/test_lora_manager.py | 462 ++++--- tests/lora/test_minicpmv_tp.py | 53 +- tests/lora/test_mixtral.py | 29 +- tests/lora/test_peft_helper.py | 31 +- tests/lora/test_phi.py | 39 +- tests/lora/test_punica_ops.py | 236 ++-- tests/lora/test_quant_model.py | 80 +- tests/lora/test_qwen2vl.py | 127 +- tests/lora/test_resolver.py | 11 +- tests/lora/test_tokenizer_group.py | 30 +- tests/lora/test_transformers_model.py | 76 +- tests/lora/test_utils.py | 115 +- 
tests/lora/test_worker.py | 45 +- tests/lora/utils.py | 92 +- tests/metrics/test_metrics.py | 166 +-- tests/mistral_tool_use/conftest.py | 10 +- .../test_mistral_tool_calls.py | 6 +- tests/mistral_tool_use/utils.py | 13 +- tests/model_executor/conftest.py | 41 +- .../model_executor/test_enabled_custom_ops.py | 122 +- .../model_executor/test_guided_processors.py | 154 +-- tests/model_executor/test_logits_processor.py | 49 +- .../test_model_load_with_params.py | 59 +- tests/model_executor/test_weight_utils.py | 39 +- tests/models/language/generation/test_bart.py | 97 +- .../models/language/generation/test_common.py | 60 +- .../models/language/generation/test_gemma.py | 12 +- .../language/generation/test_granite.py | 6 +- .../models/language/generation/test_hybrid.py | 125 +- .../language/generation/test_mistral.py | 333 +++-- .../models/language/generation/test_phimoe.py | 78 +- tests/models/language/pooling/embed_utils.py | 31 +- tests/models/language/pooling/mteb_utils.py | 154 +-- tests/models/language/pooling/test_baai.py | 118 +- .../pooling/test_bge_reranker_v2_gemma.py | 72 +- .../language/pooling/test_classification.py | 20 +- .../language/pooling/test_cross_encoder.py | 17 +- .../models/language/pooling/test_embedding.py | 51 +- tests/models/language/pooling/test_gritlm.py | 25 +- tests/models/language/pooling/test_gte.py | 106 +- .../models/language/pooling/test_intfloat.py | 54 +- tests/models/language/pooling/test_jina.py | 78 +- .../language/pooling/test_mxbai_rerank.py | 56 +- tests/models/language/pooling/test_nomic.py | 40 +- .../pooling/test_nomic_max_model_len.py | 69 +- .../language/pooling/test_qwen3_reranker.py | 69 +- tests/models/language/pooling/test_reward.py | 22 +- tests/models/language/pooling/test_scoring.py | 63 +- .../pooling/test_snowflake_arctic_embed.py | 92 +- .../pooling/test_truncation_control.py | 52 +- .../multimodal/generation/test_common.py | 197 ++- .../multimodal/generation/test_florence2.py | 83 +- .../generation/test_granite_speech.py | 73 +- .../multimodal/generation/test_interleaved.py | 38 +- .../multimodal/generation/test_mllama.py | 482 ++++--- .../multimodal/generation/test_phi4mm.py | 177 +-- .../multimodal/generation/test_pixtral.py | 171 +-- .../multimodal/generation/test_qwen2_vl.py | 279 ++-- .../multimodal/generation/test_ultravox.py | 146 ++- .../multimodal/generation/test_voxtral.py | 70 +- .../multimodal/generation/test_whisper.py | 33 +- .../generation/vlm_utils/builders.py | 168 +-- .../generation/vlm_utils/case_filtering.py | 94 +- .../multimodal/generation/vlm_utils/core.py | 63 +- .../generation/vlm_utils/custom_inputs.py | 35 +- .../generation/vlm_utils/model_utils.py | 257 ++-- .../generation/vlm_utils/runners.py | 99 +- .../multimodal/generation/vlm_utils/types.py | 25 +- .../multimodal/pooling/test_dse_qwen2_vl.py | 117 +- .../multimodal/pooling/test_intern_vit.py | 26 +- .../pooling/test_jinavl_reranker.py | 78 +- .../multimodal/pooling/test_llava_next.py | 53 +- tests/models/multimodal/pooling/test_phi3v.py | 47 +- .../multimodal/processing/test_common.py | 74 +- .../multimodal/processing/test_h2ovl.py | 26 +- .../multimodal/processing/test_idefics3.py | 14 +- .../multimodal/processing/test_internvl.py | 13 +- .../multimodal/processing/test_llama4.py | 37 +- .../multimodal/processing/test_llava_next.py | 47 +- .../processing/test_llava_onevision.py | 56 +- .../processing/test_minimax_vl_01.py | 26 +- .../multimodal/processing/test_mllama.py | 10 +- .../multimodal/processing/test_nemotron_vl.py | 16 +- 
.../multimodal/processing/test_phi3v.py | 4 +- .../multimodal/processing/test_phi4mm.py | 10 +- .../multimodal/processing/test_qwen2_vl.py | 6 +- .../multimodal/processing/test_smolvlm.py | 14 +- tests/models/multimodal/test_mapping.py | 7 +- tests/models/quantization/test_aqlm.py | 37 +- tests/models/quantization/test_awq.py | 87 +- tests/models/quantization/test_bitblas.py | 26 +- .../models/quantization/test_bitsandbytes.py | 236 ++-- tests/models/quantization/test_fp8.py | 107 +- tests/models/quantization/test_gguf.py | 72 +- .../models/quantization/test_gptq_bitblas.py | 20 +- tests/models/quantization/test_gptq_marlin.py | 50 +- .../quantization/test_gptq_marlin_24.py | 41 +- tests/models/quantization/test_modelopt.py | 37 +- tests/models/quantization/test_mxfp4.py | 21 +- tests/models/quantization/test_nvfp4.py | 43 +- tests/models/registry.py | 6 +- tests/models/test_initialization.py | 85 +- tests/models/test_oot_registration.py | 41 +- tests/models/test_registry.py | 75 +- tests/models/test_transformers.py | 79 +- tests/models/test_utils.py | 38 +- tests/models/test_vision.py | 24 +- tests/models/utils.py | 127 +- tests/mq_llm_engine/conftest.py | 2 +- tests/mq_llm_engine/test_abort.py | 18 +- tests/mq_llm_engine/test_error_handling.py | 158 +-- tests/mq_llm_engine/test_load.py | 18 +- tests/mq_llm_engine/utils.py | 36 +- .../multi_step/test_correctness_async_llm.py | 75 +- tests/multi_step/test_correctness_llm.py | 152 ++- tests/multimodal/test_inputs.py | 34 +- tests/multimodal/test_processing.py | 34 +- tests/multimodal/test_utils.py | 164 ++- tests/multimodal/test_video.py | 21 +- tests/multimodal/utils.py | 4 +- tests/neuron/1_core/test_activation.py | 16 +- tests/neuron/1_core/test_block_table.py | 35 +- tests/neuron/1_core/test_cache.py | 34 +- tests/neuron/1_core/test_layernorm.py | 27 +- tests/neuron/1_core/test_logits_processor.py | 45 +- .../neuron/1_core/test_neuron_model_runner.py | 35 +- tests/neuron/1_core/test_neuron_quant.py | 3 +- tests/neuron/1_core/test_prefix_prefill.py | 177 ++- tests/neuron/1_core/test_rotary_embedding.py | 56 +- tests/neuron/2_core/test_comm_ops.py | 50 +- tests/neuron/2_core/test_eagle.py | 29 +- tests/neuron/2_core/test_mistral.py | 29 +- tests/neuron/2_core/test_multi_lora.py | 100 +- .../test_filesystem_resolver.py | 5 +- tests/plugins/vllm_add_dummy_model/setup.py | 15 +- .../vllm_add_dummy_model/__init__.py | 3 +- .../my_gemma_embedding.py | 15 +- .../vllm_add_dummy_model/my_llava.py | 23 +- .../vllm_add_dummy_model/my_opt.py | 5 +- .../plugins/vllm_add_dummy_platform/setup.py | 16 +- .../dummy_attention_backend.py | 4 +- .../dummy_custom_ops.py | 3 +- .../vllm_add_dummy_platform/dummy_platform.py | 14 +- tests/plugins_tests/conftest.py | 2 +- tests/plugins_tests/test_platform_plugins.py | 17 +- tests/plugins_tests/test_scheduler_plugins.py | 10 +- .../test_disable_sliding_window.py | 28 +- tests/prefix_caching/test_prefix_caching.py | 75 +- tests/prompt_adapter/test_bloom.py | 38 +- .../test_multi_adapter_inference.py | 60 +- tests/prompt_adapter/test_pa_lora.py | 46 +- tests/quantization/reference_mxfp4.py | 125 +- tests/quantization/test_auto_round.py | 21 +- tests/quantization/test_compressed_tensors.py | 215 +-- tests/quantization/test_configs.py | 24 +- tests/quantization/test_cpu_offload.py | 183 +-- tests/quantization/test_experts_int8.py | 13 +- tests/quantization/test_fp8.py | 95 +- tests/quantization/test_gptq_dynamic.py | 51 +- tests/quantization/test_ipex_quant.py | 18 +- tests/quantization/test_lm_head.py | 29 +- 
tests/quantization/test_ptpc_fp8.py | 32 +- tests/quantization/test_quark.py | 105 +- .../test_register_quantization_config.py | 54 +- tests/quantization/test_rtn.py | 10 +- tests/quantization/test_torchao.py | 58 +- .../test_deepseekr1_reasoning_parser.py | 17 +- .../test_granite_reasoning_parser.py | 48 +- .../test_hunyuan_reasoning_parser.py | 35 +- .../reasoning/test_qwen3_reasoning_parser.py | 14 +- tests/reasoning/utils.py | 14 +- .../test_weight_utils.py | 15 +- tests/samplers/test_beam_search.py | 41 +- tests/samplers/test_ignore_eos.py | 6 +- tests/samplers/test_logits_processor.py | 5 +- tests/samplers/test_logprobs.py | 105 +- tests/samplers/test_no_bad_words.py | 106 +- tests/samplers/test_ranks.py | 24 +- tests/samplers/test_rejection_sampler.py | 442 ++++--- tests/samplers/test_sampler.py | 301 +++-- tests/samplers/test_seeded_generate.py | 20 +- .../test_typical_acceptance_sampler.py | 248 ++-- tests/spec_decode/conftest.py | 2 +- tests/spec_decode/e2e/conftest.py | 208 +-- tests/spec_decode/e2e/test_compatibility.py | 21 +- .../spec_decode/e2e/test_eagle_correctness.py | 709 +++++----- tests/spec_decode/e2e/test_integration.py | 192 +-- .../e2e/test_integration_dist_tp2.py | 459 ++++--- .../e2e/test_integration_dist_tp4.py | 156 ++- tests/spec_decode/e2e/test_logprobs.py | 345 +++-- .../e2e/test_medusa_correctness.py | 567 ++++---- tests/spec_decode/e2e/test_mlp_correctness.py | 720 +++++----- tests/spec_decode/e2e/test_mtp_correctness.py | 467 ++++--- .../e2e/test_multistep_correctness.py | 1050 ++++++++------- .../spec_decode/e2e/test_ngram_correctness.py | 567 ++++---- tests/spec_decode/e2e/test_seed.py | 45 +- tests/spec_decode/test_batch_expansion.py | 68 +- tests/spec_decode/test_dynamic_spec_decode.py | 64 +- tests/spec_decode/test_memory_usage.py | 33 +- tests/spec_decode/test_metrics.py | 143 +- tests/spec_decode/test_multi_step_worker.py | 468 ++++--- tests/spec_decode/test_ngram_worker.py | 76 +- tests/spec_decode/test_scorer.py | 119 +- tests/spec_decode/test_spec_decode_worker.py | 840 ++++++------ tests/spec_decode/test_utils.py | 70 +- tests/spec_decode/utils.py | 181 +-- tests/standalone_tests/lazy_imports.py | 3 +- tests/tensorizer_loader/conftest.py | 21 +- tests/tensorizer_loader/test_tensorizer.py | 248 ++-- tests/test_cache_block_hashing.py | 43 +- tests/test_config.py | 189 +-- tests/test_embedded_commit.py | 16 +- tests/test_inputs.py | 29 +- tests/test_logger.py | 68 +- tests/test_outputs.py | 16 +- tests/test_regression.py | 25 +- tests/test_sampling_params.py | 69 +- tests/test_scalartype.py | 33 +- tests/test_seed_behavior.py | 48 +- tests/test_sequence.py | 20 +- tests/test_sharded_state_loader.py | 88 +- tests/test_triton_utils.py | 7 +- tests/test_utils.py | 314 +++-- tests/test_version.py | 3 +- tests/test_vllm_port.py | 13 +- tests/tokenization/test_cached_tokenizer.py | 19 +- tests/tokenization/test_detokenize.py | 281 ++-- tests/tokenization/test_get_eos.py | 7 +- tests/tokenization/test_mistral_tokenizer.py | 335 ++--- tests/tokenization/test_tokenizer.py | 2 +- tests/tokenization/test_tokenizer_group.py | 15 +- tests/tokenization/test_tokenizer_registry.py | 30 +- tests/tool_use/conftest.py | 30 +- ...est_chat_completion_request_validations.py | 104 +- tests/tool_use/test_chat_completions.py | 38 +- tests/tool_use/test_jamba_tool_parser.py | 248 ++-- tests/tool_use/test_kimi_k2_tool_parser.py | 110 +- tests/tool_use/test_minimax_tool_parser.py | 173 +-- tests/tool_use/test_parallel_tool_calls.py | 73 +- tests/tool_use/test_tool_calls.py | 
37 +- tests/tool_use/test_tool_choice_required.py | 347 +++-- tests/tool_use/test_xlam_tool_parser.py | 178 ++- tests/tool_use/utils.py | 410 +++--- tests/tools/test_config_validator.py | 23 +- tests/tpu/lora/test_lora.py | 73 +- tests/tpu/test_compilation.py | 29 +- tests/tpu/test_custom_dispatcher.py | 31 +- tests/tpu/test_moe_pallas.py | 9 +- tests/tpu/test_quantization_accuracy.py | 14 +- tests/tracing/test_tracing.py | 162 ++- tests/utils.py | 564 ++++---- tests/v1/attention/test_attention_backends.py | 269 ++-- tests/v1/attention/utils.py | 133 +- tests/v1/core/test_async_scheduler.py | 55 +- tests/v1/core/test_kv_cache_utils.py | 385 +++--- tests/v1/core/test_prefix_caching.py | 632 +++++---- tests/v1/core/test_scheduler.py | 521 ++++---- tests/v1/core/test_scheduler_e2e.py | 16 +- tests/v1/core/test_specialized_manager.py | 95 +- tests/v1/core/utils.py | 65 +- tests/v1/e2e/test_cascade_attention.py | 3 +- .../v1/e2e/test_correctness_sliding_window.py | 48 +- tests/v1/e2e/test_spec_decode.py | 52 +- tests/v1/engine/conftest.py | 44 +- tests/v1/engine/test_async_llm.py | 132 +- tests/v1/engine/test_engine_args.py | 12 +- tests/v1/engine/test_engine_core.py | 55 +- tests/v1/engine/test_engine_core_client.py | 173 ++- .../v1/engine/test_fast_incdec_prefix_err.py | 145 +- tests/v1/engine/test_llm_engine.py | 52 +- tests/v1/engine/test_output_processor.py | 504 +++---- tests/v1/engine/utils.py | 73 +- tests/v1/entrypoints/conftest.py | 114 +- .../llm/test_struct_output_generate.py | 345 ++--- .../entrypoints/openai/responses/conftest.py | 1 - .../openai/responses/test_basic.py | 43 +- .../openai/responses/test_image.py | 117 +- .../openai/responses/test_stateful.py | 18 +- .../responses/test_structured_output.py | 24 +- .../openai/test_chat_completion.py | 73 +- .../v1/entrypoints/openai/test_completion.py | 377 +++--- .../openai/test_multi_api_servers.py | 89 +- tests/v1/executor/test_multiproc_executor.py | 77 +- .../nixl_integration/test_accuracy.py | 33 +- .../nixl_integration/test_edge_cases.py | 39 +- .../nixl_integration/toy_proxy_server.py | 166 ++- .../kv_connector/unit/test_multi_connector.py | 114 +- .../kv_connector/unit/test_nixl_connector.py | 230 ++-- .../unit/test_remote_decode_lifecycle.py | 45 +- .../unit/test_remote_prefill_lifecycle.py | 136 +- tests/v1/kv_connector/unit/utils.py | 116 +- tests/v1/metrics/test_ray_metrics.py | 8 +- tests/v1/sample/test_logits_processors.py | 392 +++--- tests/v1/sample/test_logprobs.py | 184 +-- tests/v1/sample/test_logprobs_e2e.py | 30 +- tests/v1/sample/test_rejection_sampler.py | 250 ++-- tests/v1/sample/test_sampler.py | 178 +-- tests/v1/sample/test_sampling_params_e2e.py | 22 +- tests/v1/sample/test_topk_topp_sampler.py | 43 +- tests/v1/sample/utils.py | 42 +- tests/v1/shutdown/test_delete.py | 50 +- tests/v1/shutdown/test_forward_error.py | 54 +- tests/v1/shutdown/test_processor_error.py | 19 +- tests/v1/shutdown/test_startup_error.py | 47 +- tests/v1/spec_decode/test_eagle.py | 160 +-- tests/v1/spec_decode/test_ngram.py | 104 +- tests/v1/structured_output/test_utils.py | 153 +-- tests/v1/test_async_llm_dp.py | 81 +- tests/v1/test_external_lb_dp.py | 149 ++- tests/v1/test_metrics_reader.py | 44 +- tests/v1/test_oracle.py | 12 +- tests/v1/test_serial_utils.py | 80 +- tests/v1/test_utils.py | 54 +- tests/v1/tpu/test_basic.py | 90 +- tests/v1/tpu/test_kv_cache_update_kernel.py | 85 +- tests/v1/tpu/test_mha_attn.py | 23 +- tests/v1/tpu/test_multimodal.py | 44 +- tests/v1/tpu/test_pallas.py | 20 +- tests/v1/tpu/test_perf.py | 68 +- 
tests/v1/tpu/test_sampler.py | 59 +- .../v1/tpu/test_spmd_model_weight_loading.py | 23 +- tests/v1/tpu/test_topk_topp_sampler.py | 85 +- tests/v1/tpu/test_tpu_qkv_linear.py | 13 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 127 +- tests/v1/worker/test_gpu_input_batch.py | 147 ++- tests/v1/worker/test_gpu_model_runner.py | 207 +-- tests/vllm_test_utils/setup.py | 6 +- .../vllm_test_utils/vllm_test_utils/blame.py | 4 +- .../vllm_test_utils/monitor.py | 27 +- tests/weight_loading/test_weight_loading.py | 32 +- tests/worker/conftest.py | 2 +- .../test_encoder_decoder_model_runner.py | 163 ++- tests/worker/test_model_input.py | 105 +- tests/worker/test_model_runner.py | 121 +- tests/worker/test_profile.py | 19 +- tests/worker/test_swap.py | 15 +- vllm/benchmarks/datasets.py | 34 +- vllm/benchmarks/serve.py | 30 +- 594 files changed, 33881 insertions(+), 28948 deletions(-) diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index ec6b20f5e04b..57d1fe4256cb 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """vllm.entrypoints.api_server with some extra logging for testing.""" + from collections.abc import Iterable from typing import Any @@ -17,7 +18,6 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._num_aborts = 0 @@ -47,8 +47,10 @@ def stats() -> Response: engine_args = AsyncEngineArgs.from_cli_args(args) engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) vllm.entrypoints.api_server.engine = engine - uvicorn.run(app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level="debug", + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, + ) diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py index 375b248ebeda..a6a8b33e19d3 100644 --- a/tests/async_engine/conftest.py +++ b/tests/async_engine/conftest.py @@ -9,4 +9,4 @@ def use_v0_only(monkeypatch): Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the module. 
""" - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 76c94bdf80ca..f6c35a118e66 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -13,13 +13,15 @@ def _query_server(prompt: str, max_tokens: int = 5) -> dict: - response = requests.post("http://localhost:8000/generate", - json={ - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": 0, - "ignore_eos": True - }) + response = requests.post( + "http://localhost:8000/generate", + json={ + "prompt": prompt, + "max_tokens": max_tokens, + "temperature": 0, + "ignore_eos": True, + }, + ) response.raise_for_status() return response.json() @@ -30,8 +32,9 @@ def _query_server_long(prompt: str) -> dict: @pytest.fixture def api_server(distributed_executor_backend: str): - script_path = Path(__file__).parent.joinpath( - "api_server_async_engine.py").absolute() + script_path = ( + Path(__file__).parent.joinpath("api_server_async_engine.py").absolute() + ) commands = [ sys.executable, "-u", @@ -80,8 +83,9 @@ def test_api_server(api_server, distributed_executor_backend: str): for result in pool.map(_query_server, prompts): assert result - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] + num_aborted_requests = requests.get("http://localhost:8000/stats").json()[ + "num_aborted_requests" + ] assert num_aborted_requests == 0 # Try with 100 prompts @@ -101,8 +105,9 @@ def test_api_server(api_server, distributed_executor_backend: str): # give it some times to update the stats time.sleep(1) - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] + num_aborted_requests = requests.get("http://localhost:8000/stats").json()[ + "num_aborted_requests" + ] assert num_aborted_requests > 0 # check that server still runs after cancellations diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 0eb7a6eb52aa..c1ed64abd6e7 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -36,7 +36,6 @@ class MockModelConfig: class MockEngine: - def __init__(self): self.step_calls = 0 self.add_request_calls = 0 @@ -49,8 +48,7 @@ def __init__(self): async def step_async(self, virtual_engine): # PP size is 1, ignore virtual engine self.step_calls += 1 - return [RequestOutput( - request_id=self.request_id)] if self.request_id else [] + return [RequestOutput(request_id=self.request_id)] if self.request_id else [] async def process_model_inputs_async(self, *args, **kwargs): pass @@ -67,7 +65,7 @@ def stop_generating(self): def add_request(self, **kwargs): del kwargs # Unused self.add_request_calls += 1 - print(f'Request calls: {self.add_request_calls}') + print(f"Request calls: {self.add_request_calls}") async def add_request_async(self, **kwargs): self.add_request_calls += 1 @@ -142,9 +140,12 @@ def start_engine(): print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}") return AsyncLLMEngine.from_engine_args( - AsyncEngineArgs(model="facebook/opt-125m", - enforce_eager=True, - num_scheduler_steps=num_scheduler_steps)) + AsyncEngineArgs( + model="facebook/opt-125m", + enforce_eager=True, + num_scheduler_steps=num_scheduler_steps, + ) + ) def uid() -> str: @@ -157,8 +158,9 @@ async def async_engine(): # scoped fixture and monkeypatch is function scoped. 
previous_value = os.getenv("VLLM_USE_V1", None) os.environ["VLLM_USE_V1"] = "0" - engine = await asyncio.get_event_loop().run_in_executor(executor=None, - func=start_engine) + engine = await asyncio.get_event_loop().run_in_executor( + executor=None, func=start_engine + ) try: yield engine finally: @@ -182,7 +184,6 @@ def should_do_global_cleanup_after_test(request) -> bool: @pytest.mark.asyncio(scope="module") @pytest.mark.parametrize("stop", [None, ["a stop string"]]) async def test_asyncio_run(async_engine, stop): - scheduler_config = await async_engine.get_scheduler_config() num_scheduler_steps = scheduler_config.num_scheduler_steps @@ -196,9 +197,9 @@ async def run(prompt: str): output_count = 0 final_output = None - async for output in async_engine.generate(prompt, - sampling_params, - request_id=uid()): + async for output in async_engine.generate( + prompt, sampling_params, request_id=uid() + ): output_count += 1 final_output = output return final_output, output_count @@ -247,18 +248,19 @@ async def run(prompt: str, kind: RequestOutputKind): output_count = 0 final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): + async for output in async_engine.generate(prompt, params, request_id=uid()): output_count += 1 final_output = output assert final_output is not None assert final_output.finished - return (final_output.prompt_token_ids, - final_output.outputs[0].token_ids, - final_output.outputs[0].text, output_count) + return ( + final_output.prompt_token_ids, + final_output.outputs[0].token_ids, + final_output.outputs[0].text, + output_count, + ) async def run_deltas(prompt: str): params = copy(sampling_params) @@ -269,9 +271,7 @@ async def run_deltas(prompt: str): output_text = "" output_count = 0 final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): + async for output in async_engine.generate(prompt, params, request_id=uid()): token_ids = output.outputs[0].token_ids text = output.outputs[0].text final_output = output @@ -298,7 +298,8 @@ async def run_deltas(prompt: str): results = await asyncio.gather( run("common input prompt", RequestOutputKind.CUMULATIVE), run("common input prompt", RequestOutputKind.FINAL_ONLY), - run_deltas("common input prompt")) + run_deltas("common input prompt"), + ) # Make sure outputs are the same prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results) @@ -342,9 +343,9 @@ async def test_cancellation(async_engine, stop): i = 0 with pytest.raises(CancelledError): - async for output in async_engine.generate("test2", - sampling_params, - request_id=request_id): + async for output in async_engine.generate( + "test2", sampling_params, request_id=request_id + ): assert not output.finished i += 1 if i == stop_at: @@ -402,8 +403,7 @@ async def test_invalid_argument(async_engine): # Targeting specific DP rank only supported in v1 multi-instance DP with pytest.raises(ValueError): - async for _ in async_engine.generate("test", - sampling_params, - request_id=uid(), - data_parallel_rank=0): + async for _ in async_engine.generate( + "test", sampling_params, request_id=uid(), data_parallel_rank=0 + ): pass diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 1851eeeda790..784d6dbb796d 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -60,7 +60,8 @@ async def test_request_tracker(): stream_5 = tracker.add_request("5") assert 
tracker.new_requests_event.is_set() tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) + RequestOutput("2", "output", [], [], [], finished=True) + ) await tracker.wait_for_new_requests() new, aborted = tracker.get_new_and_aborted_requests() assert not tracker.new_requests_event.is_set() diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 2e103019f7af..c75defeda4da 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -4,6 +4,7 @@ Run `pytest tests/basic_correctness/test_basic_correctness.py`. """ + import os import weakref from unittest.mock import Mock @@ -46,16 +47,21 @@ def test_vllm_gc_ed(): def _fix_prompt_embed_outputs( - vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner, - example_prompts: list[str]) -> list[tuple[list[int], str]]: + vllm_outputs: list[tuple[list[int], str]], + hf_model: HfRunner, + example_prompts: list[str], +) -> list[tuple[list[int], str]]: fixed_vllm_outputs = [] for vllm_output, hf_input, prompt in zip( - vllm_outputs, hf_model.get_inputs(example_prompts), - example_prompts): + vllm_outputs, hf_model.get_inputs(example_prompts), example_prompts + ): hf_input_ids = hf_input["input_ids"].tolist()[0] fixed_vllm_outputs.append( - (hf_input_ids + vllm_output[0][len(hf_input_ids):], - prompt + vllm_output[1])) + ( + hf_input_ids + vllm_output[0][len(hf_input_ids) :], + prompt + vllm_output[1], + ) + ) return fixed_vllm_outputs @@ -73,18 +79,14 @@ def test_models( enforce_eager: bool, enable_prompt_embeds: bool, ) -> None: - - if enable_prompt_embeds and envs.is_set( - "VLLM_USE_V1") and envs.VLLM_USE_V1: + if enable_prompt_embeds and envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: pytest.skip("enable_prompt_embeds is not supported in v1.") if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend in ("XFORMERS", - "FLASHINFER") and model == "google/gemma-2-2b-it": - pytest.skip( - f"{backend} does not support gemma2 with full context length.") + if backend in ("XFORMERS", "FLASHINFER") and model == "google/gemma-2-2b-it": + pytest.skip(f"{backend} does not support gemma2 with full context length.") with monkeypatch.context() as m: m.setenv("VLLM_ATTENTION_BACKEND", backend) @@ -92,30 +94,33 @@ def test_models( # 5042 tokens for gemma2 # gemma2 has alternating sliding window size of 4096 # we need a prompt with more than 4096 tokens to test the sliding window - prompt = "The following numbers of the sequence " + ", ".join( - str(i) for i in range(1024)) + " are:" + prompt = ( + "The following numbers of the sequence " + + ", ".join(str(i) for i in range(1024)) + + " are:" + ) example_prompts = [prompt] with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) if enable_prompt_embeds: with torch.no_grad(): - prompt_embeds = hf_model.get_prompt_embeddings( - example_prompts) - - with VllmRunner(model, - max_model_len=8192, - enforce_eager=enforce_eager, - enable_prompt_embeds=enable_prompt_embeds, - gpu_memory_utilization=0.7) as vllm_model: + prompt_embeds = hf_model.get_prompt_embeddings(example_prompts) + + with VllmRunner( + model, + max_model_len=8192, + enforce_eager=enforce_eager, + enable_prompt_embeds=enable_prompt_embeds, + gpu_memory_utilization=0.7, + ) as vllm_model: if enable_prompt_embeds: - vllm_outputs = vllm_model.generate_greedy( - prompt_embeds, 
max_tokens) + vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens) vllm_outputs = _fix_prompt_embed_outputs( - vllm_outputs, hf_model, example_prompts) + vllm_outputs, hf_model, example_prompts + ) else: - vllm_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) check_outputs_equal( outputs_0_lst=hf_outputs, @@ -127,23 +132,20 @@ def test_models( @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( - "model, distributed_executor_backend, attention_backend, " - "test_suite, extra_env", [ + "model, distributed_executor_backend, attention_backend, test_suite, extra_env", + [ ("distilbert/distilgpt2", "ray", "", "L4", {}), ("distilbert/distilgpt2", "mp", "", "L4", {}), - ("distilbert/distilgpt2", "ray", "", "L4", { - "VLLM_SLEEP_WHEN_IDLE": "1" - }), - ("distilbert/distilgpt2", "mp", "", "L4", { - "VLLM_SLEEP_WHEN_IDLE": "1" - }), + ("distilbert/distilgpt2", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}), + ("distilbert/distilgpt2", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}), ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}), ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}), ("distilbert/distilgpt2", "ray", "", "A100", {}), ("distilbert/distilgpt2", "mp", "", "A100", {}), ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100", {}), ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100", {}), - ]) + ], +) @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) def test_models_distributed( monkeypatch: pytest.MonkeyPatch, @@ -157,20 +159,21 @@ def test_models_distributed( extra_env: dict[str, str], enable_prompt_embeds: bool, ) -> None: - - if enable_prompt_embeds and envs.is_set( - "VLLM_USE_V1") and envs.VLLM_USE_V1: + if enable_prompt_embeds and envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: pytest.skip("enable_prompt_embeds is not supported in v1.") if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") with monkeypatch.context() as monkeypatch_context: - if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa + if ( + model == "meta-llama/Llama-3.2-1B-Instruct" + and distributed_executor_backend == "ray" + and attention_backend == "" + and test_suite == "L4" + ): # noqa if enable_prompt_embeds: - pytest.skip( - "enable_prompt_embeds does not work with ray compiled dag." - ) + pytest.skip("enable_prompt_embeds does not work with ray compiled dag.") monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") @@ -192,30 +195,26 @@ def test_models_distributed( # will hurt multiprocessing backend with fork method # (the default method). 
with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend, - enable_prompt_embeds=enable_prompt_embeds, - gpu_memory_utilization=0.7, + model, + dtype=dtype, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend, + enable_prompt_embeds=enable_prompt_embeds, + gpu_memory_utilization=0.7, ) as vllm_model: if enable_prompt_embeds: with hf_runner(model, dtype=dtype) as hf_model: with torch.no_grad(): - prompt_embeds = hf_model.get_prompt_embeddings( - example_prompts) - vllm_outputs = vllm_model.generate_greedy( - prompt_embeds, max_tokens) + prompt_embeds = hf_model.get_prompt_embeddings(example_prompts) + vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens) vllm_outputs = _fix_prompt_embed_outputs( - vllm_outputs, hf_model, example_prompts) - hf_outputs = hf_model.generate_greedy( - example_prompts, max_tokens) + vllm_outputs, hf_model, example_prompts + ) + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) else: - vllm_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy( - example_prompts, max_tokens) + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) check_outputs_equal( outputs_0_lst=hf_outputs, @@ -226,27 +225,23 @@ def test_models_distributed( def test_failed_model_execution(vllm_runner, monkeypatch) -> None: - from vllm.envs import VLLM_USE_V1 if not VLLM_USE_V1: pytest.skip("Skipping V0 test, dump input not supported") # Needed to mock an error in the same process - monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") - with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model: + with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model: if isinstance(vllm_model.model.llm_engine, LLMEngineV1): v1_test_failed_model_execution(vllm_model) def v1_test_failed_model_execution(vllm_model): - engine = vllm_model.model.llm_engine - mocked_execute_model = Mock( - side_effect=RuntimeError("Mocked Critical Error")) - engine.engine_core.engine_core.model_executor.execute_model =\ - mocked_execute_model + mocked_execute_model = Mock(side_effect=RuntimeError("Mocked Critical Error")) + engine.engine_core.engine_core.model_executor.execute_model = mocked_execute_model with pytest.raises(RuntimeError) as exc_info: prompts = [ diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 4816b76996fc..597155497c6e 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -37,7 +37,7 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch): all tests in the file. """ with monkeypatch.context() as m: - m.setenv('VLLM_USE_V1', '0') + m.setenv("VLLM_USE_V1", "0") yield @@ -49,13 +49,18 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch): # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. 
@pytest.mark.parametrize("tensor_parallel_size", [1]) -@pytest.mark.parametrize("attention_backend", [ - pytest.param("FLASHINFER", - marks=pytest.mark.skipif( - current_platform.is_rocm(), - reason="FLASHINFER isn't supported on ROCm")), - "FLASH_ATTN" -]) +@pytest.mark.parametrize( + "attention_backend", + [ + pytest.param( + "FLASHINFER", + marks=pytest.mark.skipif( + current_platform.is_rocm(), reason="FLASHINFER isn't supported on ROCm" + ), + ), + "FLASH_ATTN", + ], +) def test_models( hf_runner: HfRunner, vllm_runner: VllmRunner, @@ -83,16 +88,15 @@ def test_models( hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) check_outputs_equal( outputs_0_lst=hf_outputs, @@ -105,13 +109,18 @@ def test_models( @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("attention_backend", [ - pytest.param("FLASHINFER", - marks=pytest.mark.skipif( - current_platform.is_rocm(), - reason="FLASHINFER isn't supported on ROCm")), - "FLASH_ATTN" -]) +@pytest.mark.parametrize( + "attention_backend", + [ + pytest.param( + "FLASHINFER", + marks=pytest.mark.skipif( + current_platform.is_rocm(), reason="FLASHINFER isn't supported on ROCm" + ), + ), + "FLASH_ATTN", + ], +) def test_models_distributed( hf_runner: HfRunner, vllm_runner: VllmRunner, @@ -123,8 +132,10 @@ def test_models_distributed( ) -> None: with monkeypatch.context() as m: m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - if (model == "meta-llama/Llama-3.2-1B-Instruct" - and distributed_executor_backend == "ray"): + if ( + model == "meta-llama/Llama-3.2-1B-Instruct" + and distributed_executor_backend == "ray" + ): # test Ray Compiled Graph m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") @@ -146,13 +157,13 @@ def test_models_distributed( # fork method (the default method). 
with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - max_num_seqs=max_num_seqs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, + model, + dtype=dtype, + tensor_parallel_size=2, + max_num_seqs=max_num_seqs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy( example_prompts, @@ -172,8 +183,8 @@ def test_models_distributed( @pytest.mark.parametrize( "kv_cache_dtype,model", - [("fp8_e4m3", - "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")]) + [("fp8_e4m3", "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")], +) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("chunked_prefill_token_size", [4, 16]) @@ -184,8 +195,9 @@ def test_models_distributed( # Due to low-precision numerical divergence, this test is too sensitive to # the async postprocessor @pytest.mark.parametrize("disable_async_output_proc", [True]) -@pytest.mark.skipif(current_platform.is_rocm(), - reason="machete_prepack_B isn't supported on ROCm") +@pytest.mark.skipif( + current_platform.is_rocm(), reason="machete_prepack_B isn't supported on ROCm" +) def test_models_with_fp8_kv_cache( vllm_runner: VllmRunner, example_prompts, @@ -208,28 +220,30 @@ def test_models_with_fp8_kv_cache( max_num_batched_tokens = chunked_prefill_token_size with vllm_runner( - model, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, + model, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, ) as vllm_model: no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + example_prompts, max_tokens, NUM_LOG_PROBS + ) with vllm_runner( - model, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, + model, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, ) as vllm_model: chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + example_prompts, max_tokens, NUM_LOG_PROBS + ) check_logprobs_close( outputs_0_lst=no_chunked_prefill_outputs, @@ -272,14 +286,14 @@ def test_with_prefix_caching( outputs = {} # type: ignore for enable in (True, False): with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - enable_prefix_caching=enable, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + 
enable_prefix_caching=enable, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, ) as vllm_model: outputs[enable] = [] for prompt in full_prompts: diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index 28bfe9e7c802..3c1e01d072b9 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -5,5 +5,6 @@ def test_cpu_offload(): - compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], - ["--cpu-offload-gb", "1"]) + compare_two_settings( + "meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"] + ) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 34f9389c82a9..f05dd9244b31 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -23,13 +23,13 @@ def test_python_error(): tensors = [] with allocator.use_memory_pool(): # allocate 70% of the total memory - x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda') + x = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda") tensors.append(x) # release the memory allocator.sleep() # allocate more memory than the total memory - y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda') + y = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda") tensors.append(y) with pytest.raises(RuntimeError): # when the allocator is woken up, it should raise an error @@ -41,17 +41,17 @@ def test_python_error(): def test_basic_cumem(): # some tensors from default memory pool shape = (1024, 1024) - x = torch.empty(shape, device='cuda') + x = torch.empty(shape, device="cuda") x.zero_() # some tensors from custom memory pool allocator = CuMemAllocator.get_instance() with allocator.use_memory_pool(): # custom memory pool - y = torch.empty(shape, device='cuda') + y = torch.empty(shape, device="cuda") y.zero_() y += 1 - z = torch.empty(shape, device='cuda') + z = torch.empty(shape, device="cuda") z.zero_() z += 2 @@ -74,16 +74,16 @@ def test_basic_cumem(): def test_cumem_with_cudagraph(): allocator = CuMemAllocator.get_instance() with allocator.use_memory_pool(): - weight = torch.eye(1024, device='cuda') + weight = torch.eye(1024, device="cuda") with allocator.use_memory_pool(tag="discard"): - cache = torch.empty(1024, 1024, device='cuda') + cache = torch.empty(1024, 1024, device="cuda") def model(x): out = x @ weight - cache[:out.size(0)].copy_(out) + cache[: out.size(0)].copy_(out) return out + 1 - x = torch.empty(128, 1024, device='cuda') + x = torch.empty(128, 1024, device="cuda") # warmup model(x) @@ -109,7 +109,7 @@ def model(x): model_graph.replay() # cache content is as expected - assert torch.allclose(x, cache[:x.size(0)]) + assert torch.allclose(x, cache[: x.size(0)]) # output content is as expected assert torch.allclose(y, x + 1) @@ -123,7 +123,8 @@ def model(x): ("meta-llama/Llama-3.2-1B", True), # sleep mode with pytorch checkpoint ("facebook/opt-125m", False), - ]) + ], +) def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 341a39a42b85..cc93e6f12e12 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -7,13 +7,13 @@ Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest tests/basic_correctness/test_preemption.py`. 
""" + import pytest from prometheus_client import REGISTRY import vllm.envs as envs from vllm import SamplingParams -from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, - ENABLE_ARTIFICIAL_PREEMPT) +from vllm.core.scheduler import ARTIFICIAL_PREEMPTION_MAX_CNT, ENABLE_ARTIFICIAL_PREEMPT from ..models.utils import check_outputs_equal @@ -28,7 +28,7 @@ def use_v0_only(monkeypatch): We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, so use VLLM_USE_V1=0 for all tests in the file. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") @pytest.fixture(scope="module", autouse=True) @@ -36,7 +36,8 @@ def check_settings(): assert ENABLE_ARTIFICIAL_PREEMPT is True, ( "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "pytest tests/basic_correctness/test_preemption.py`") + "pytest tests/basic_correctness/test_preemption.py`" + ) @pytest.fixture @@ -72,25 +73,29 @@ def test_chunked_prefill_recompute( hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - distributed_executor_backend=distributed_executor_backend, - disable_log_stats=False, + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=enable_chunked_prefill, + max_num_seqs=max_num_seqs, + distributed_executor_backend=distributed_executor_backend, + disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) + assert ( + vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + < ARTIFICIAL_PREEMPTION_MAX_CNT + ) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] vllm_output_ids, vllm_output_str = vllm_outputs[i] assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" + ) assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" + ) @pytest.mark.parametrize("model", MODELS) @@ -112,16 +117,19 @@ def test_preemption( hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - distributed_executor_backend=distributed_executor_backend, + model, + dtype=dtype, + disable_log_stats=False, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption) + assert ( + vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + < ARTIFICIAL_PREEMPTION_MAX_CNT + ) + total_preemption = vllm_model.model.llm_engine.scheduler[ + 0 + ].num_cumulative_preemption check_outputs_equal( outputs_0_lst=hf_outputs, @@ -130,8 +138,10 @@ def test_preemption( name_1="vllm", ) - assert ("is preempted by PreemptionMode.RECOMPUTE mode because there " - "is not enough KV cache space." 
in caplog_vllm.text) + assert ( + "is preempted by PreemptionMode.RECOMPUTE mode because there " + "is not enough KV cache space." in caplog_vllm.text + ) # Ensure the count bucket of request-level histogram metrics matches # the number of requests as a simple sanity check to ensure metrics are # generated @@ -162,25 +172,26 @@ def test_preemption_infeasible( prefill_blocks = 2 decode_blocks = max_tokens // BLOCK_SIZE with vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - distributed_executor_backend=distributed_executor_backend, + model, + dtype=dtype, + block_size=BLOCK_SIZE, + # Not enough gpu blocks to complete a single sequence. + # preemption should happen, and the sequence should be + # ignored instead of hanging forever. + num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, + max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: - sampling_params = SamplingParams(max_tokens=max_tokens, - ignore_eos=True) + sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) req_outputs = vllm_model.model.generate( example_prompts, sampling_params=sampling_params, ) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) + assert ( + vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + < ARTIFICIAL_PREEMPTION_MAX_CNT + ) # Verify the request is ignored and not hang. for req_output in req_outputs: diff --git a/tests/benchmarks/test_latency_cli.py b/tests/benchmarks/test_latency_cli.py index 2279c846e01c..54075a3a15e6 100644 --- a/tests/benchmarks/test_latency_cli.py +++ b/tests/benchmarks/test_latency_cli.py @@ -10,8 +10,18 @@ @pytest.mark.benchmark def test_bench_latency(): command = [ - "vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32", - "--output-len", "1", "--enforce-eager", "--load-format", "dummy" + "vllm", + "bench", + "latency", + "--model", + MODEL_NAME, + "--input-len", + "32", + "--output-len", + "1", + "--enforce-eager", + "--load-format", + "dummy", ] result = subprocess.run(command, capture_output=True, text=True) print(result.stdout) diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index bfcf274727e2..48d524ceebd0 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -11,9 +11,7 @@ @pytest.fixture(scope="module") def server(): - args = [ - "--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy" - ] + args = ["--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy"] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/benchmarks/test_throughput_cli.py b/tests/benchmarks/test_throughput_cli.py index b61e51db4fbe..a579b59e8af4 100644 --- a/tests/benchmarks/test_throughput_cli.py +++ b/tests/benchmarks/test_throughput_cli.py @@ -10,8 +10,18 @@ @pytest.mark.benchmark def test_bench_throughput(): command = [ - "vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len", - "32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy" + "vllm", + "bench", + "throughput", + "--model", + MODEL_NAME, + "--input-len", + "32", + 
"--output-len", + "1", + "--enforce-eager", + "--load-format", + "dummy", ] result = subprocess.run(command, capture_output=True, text=True) print(result.stdout) diff --git a/tests/build_cython.py b/tests/build_cython.py index 444434e8f0a7..5a968651d8c2 100644 --- a/tests/build_cython.py +++ b/tests/build_cython.py @@ -28,12 +28,13 @@ "vllm/utils/__init__.py", ] -setup(ext_modules=cythonize(infiles, - annotate=False, - force=True, - compiler_directives={ - 'language_level': "3", - 'infer_types': True - })) +setup( + ext_modules=cythonize( + infiles, + annotate=False, + force=True, + compiler_directives={"language_level": "3", "infer_types": True}, + ) +) # example usage: python3 build_cython.py build_ext --inplace diff --git a/tests/compile/backend.py b/tests/compile/backend.py index ace4d25534cd..0a362de08df3 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -25,20 +25,18 @@ class TestBackend: Inductor config is default-initialized from VllmConfig.CompilationConfig. """ - def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], - None]]): + def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]): self.custom_passes = list(passes) compile_config = get_current_vllm_config().compilation_config self.inductor_config = compile_config.inductor_compile_config - self.inductor_config['force_disable_caches'] = True - self.inductor_config['post_grad_custom_post_pass'] = self.post_pass + self.inductor_config["force_disable_caches"] = True + self.inductor_config["post_grad_custom_post_pass"] = self.post_pass def __call__(self, graph: fx.GraphModule, example_inputs): self.graph_pre_compile = deepcopy(graph) from torch._inductor.compile_fx import compile_fx - return compile_fx(graph, - example_inputs, - config_patches=self.inductor_config) + + return compile_fx(graph, example_inputs, config_patches=self.inductor_config) def post_pass(self, graph: fx.Graph): self.graph_pre_pass = deepcopy(graph) @@ -56,12 +54,11 @@ def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced=True): assert num_pre > 0, f"Op {op.name()} not found in pre-pass graph" assert num_pre > num_post, f"All nodes remain for op {op.name()}" if fully_replaced: - assert num_post == 0, \ - f"Unexpected op {op.name()} in post-pass graph" + assert num_post == 0, f"Unexpected op {op.name()} in post-pass graph" def check_after_ops(self, ops: Sequence[OpOverload]): for op in ops: num_pre = len(list(find_op_nodes(op, self.graph_pre_pass))) num_post = len(list(find_op_nodes(op, self.graph_post_pass))) assert num_pre == 0, f"Unexpected op {op.name()} in pre-pass graph" - assert num_post > 0, f"Op {op.name()} not found in post-pass graph" \ No newline at end of file + assert num_post > 0, f"Op {op.name()} not found in post-pass graph" diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index efe9c843f144..99dd61254a75 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -36,10 +36,7 @@ def temporary_environ(env_vars): def llm_pair(request): model = request.param - with temporary_environ({ - "VLLM_USE_V1": "1", - "VLLM_FLASH_ATTN_VERSION": "3" - }): + with temporary_environ({"VLLM_USE_V1": "1", "VLLM_FLASH_ATTN_VERSION": "3"}): full = LLM( model=model, gpu_memory_utilization=0.45, @@ -71,11 +68,14 @@ def llm_pair(request): [ # Model names for the llm_pair fixture "deepseek-ai/DeepSeek-V2-Lite", - "Qwen/Qwen2-1.5B-Instruct" + "Qwen/Qwen2-1.5B-Instruct", ], - 
indirect=True) -@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0), - reason="Only Hopper GPUs support FA3 and FlashMLA") + indirect=True, +) +@pytest.mark.skipif( + current_platform.get_device_capability() != (9, 0), + reason="Only Hopper GPUs support FA3 and FlashMLA", +) class TestFullCUDAGraph: """ Use a class such that an llm pair is constructed once for all @@ -85,20 +85,22 @@ class TestFullCUDAGraph: meaning there would be multiple LLM instances hogging memory simultaneously. """ - @pytest.mark.parametrize(("batch_size", "max_tokens"), [ - (1, 10), - (7, 10), - (16, 10), - (25, 10), - (32, 10), - (45, 10), - (64, 10), - (123, 10), - (8, 5), - (8, 30), - ]) - def test_full_cudagraph(self, batch_size, max_tokens, - llm_pair: tuple[LLM, LLM]): + @pytest.mark.parametrize( + ("batch_size", "max_tokens"), + [ + (1, 10), + (7, 10), + (16, 10), + (25, 10), + (32, 10), + (45, 10), + (64, 10), + (123, 10), + (8, 5), + (8, 30), + ], + ) + def test_full_cudagraph(self, batch_size, max_tokens, llm_pair: tuple[LLM, LLM]): """ Test various batch sizes and max_tokens to ensure that the full cudagraph compilation works for padded cases too. @@ -107,16 +109,15 @@ def test_full_cudagraph(self, batch_size, max_tokens, piecewise_llm, full_cudagraph_llm = llm_pair prompts = ["Hello, my name is"] * batch_size - sampling_params = SamplingParams(temperature=0.0, - max_tokens=max_tokens, - top_p=0.95) + sampling_params = SamplingParams( + temperature=0.0, max_tokens=max_tokens, top_p=0.95 + ) piecewise_responses = piecewise_llm.generate(prompts, sampling_params) full_responses = full_cudagraph_llm.generate(prompts, sampling_params) # Check that all responses are the same - for piecewise_res, full_res in zip(piecewise_responses, - full_responses): + for piecewise_res, full_res in zip(piecewise_responses, full_responses): assert piecewise_res.outputs[0].text == full_res.outputs[0].text @@ -126,33 +127,44 @@ def test_full_cudagraph(self, batch_size, max_tokens, ("Qwen/Qwen2-1.5B-Instruct", True), # MLA does not support capturing CUDA Graphs with size > max_num_seqs ("deepseek-ai/DeepSeek-V2-Lite", False), - ]) -@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0), - reason="Only Hopper GPUs support FA3 and FlashMLA") + ], +) +@pytest.mark.skipif( + current_platform.get_device_capability() != (9, 0), + reason="Only Hopper GPUs support FA3 and FlashMLA", +) def test_lower_max_num_seqs(model, supported): - with temporary_environ({ - "VLLM_USE_V1": "1", - "VLLM_FLASH_ATTN_VERSION": "3" - }), ExitStack() as stack: + with ( + temporary_environ({"VLLM_USE_V1": "1", "VLLM_FLASH_ATTN_VERSION": "3"}), + ExitStack() as stack, + ): if not supported: stack.enter_context(pytest.raises(RuntimeError)) - llm = LLM(model=model, - max_num_seqs=256, - trust_remote_code=True, - max_model_len=1024, - compilation_config=CompilationConfig( - full_cuda_graph=True, - cudagraph_capture_sizes=[64, 256, 512])) + llm = LLM( + model=model, + max_num_seqs=256, + trust_remote_code=True, + max_model_len=1024, + compilation_config=CompilationConfig( + full_cuda_graph=True, cudagraph_capture_sizes=[64, 256, 512] + ), + ) llm.generate(["Hello, my name is"] * 10) @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") def test_full_cudagraph_with_invalid_backend(): - with temporary_environ({ - "VLLM_USE_V1": "1", - "VLLM_FLASH_ATTN_VERSION": - "2" #FA2 not supported with full_cuda_graph - }), pytest.raises(RuntimeError): - LLM(model="Qwen/Qwen2-1.5B-Instruct", - 
compilation_config=CompilationConfig(full_cuda_graph=True)) + with ( + temporary_environ( + { + "VLLM_USE_V1": "1", + "VLLM_FLASH_ATTN_VERSION": "2", # FA2 not supported with full_cuda_graph + } + ), + pytest.raises(RuntimeError), + ): + LLM( + model="Qwen/Qwen2-1.5B-Instruct", + compilation_config=CompilationConfig(full_cuda_graph=True), + ) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 06ac3527e1fb..ee67d6696f70 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -4,6 +4,7 @@ Test the piecewise compilation with a simple model so that we can exactly calculate the expected output and side effects. """ + import pytest import torch from torch import nn @@ -11,8 +12,12 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, - set_current_vllm_config) +from vllm.config import ( + CompilationConfig, + CompilationLevel, + VllmConfig, + set_current_vllm_config, +) from vllm.envs import VLLM_USE_V1 from vllm.forward_context import set_forward_context from vllm.utils import direct_register_custom_op @@ -23,8 +28,9 @@ silly_lib = Library("silly", "FRAGMENT") # noqa -def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: +def silly_attention( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor +) -> None: global global_counter global_counter += 1 print(f"{global_counter=}") @@ -32,8 +38,9 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out[0] += 1 -def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: +def silly_attention_fake( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor +) -> None: return @@ -48,12 +55,7 @@ def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, @support_torch_compile class SillyModel(nn.Module): - - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = '', - **kwargs) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None: super().__init__() def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -81,28 +83,31 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def test_simple_piecewise_compile(use_inductor): assert VLLM_USE_V1 - vllm_config = VllmConfig(compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - use_cudagraph=True, - use_inductor=use_inductor, - splitting_ops=["silly.attention"], - cudagraph_copy_inputs=True, - cudagraph_capture_sizes=[1, 2], - )) + vllm_config = VllmConfig( + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + use_inductor=use_inductor, + splitting_ops=["silly.attention"], + cudagraph_copy_inputs=True, + cudagraph_capture_sizes=[1, 2], + ) + ) with set_current_vllm_config(vllm_config): - model = SillyModel(vllm_config=vllm_config, prefix='') + model = SillyModel(vllm_config=vllm_config, prefix="") inputs = torch.randn(100).cuda() - with compilation_counter.expect( + with ( + compilation_counter.expect( num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=5, # 2 * num_layers + 1 num_piecewise_capturable_graphs_seen=3, # 1 + num_layers num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured= - 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - 
), set_forward_context({}, vllm_config=vllm_config): - + num_cudagraph_captured=6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ), + set_forward_context({}, vllm_config=vllm_config), + ): model(inputs) model(torch.randn(2).cuda()) @@ -113,4 +118,4 @@ def test_simple_piecewise_compile(use_inductor): global_counter = 0 output = model(input) assert global_counter == 2 - assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) + assert torch.allclose(output.cpu(), torch.tensor([3.0, 1.0])) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index b7ed8353b3ce..738cc45da6e0 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -8,6 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are initialized randomly with a fixed seed. """ + from dataclasses import dataclass from typing import Any, Optional @@ -18,8 +19,12 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, - set_current_vllm_config) +from vllm.config import ( + CompilationConfig, + CompilationLevel, + VllmConfig, + set_current_vllm_config, +) from vllm.forward_context import set_forward_context from vllm.utils import direct_register_custom_op @@ -27,15 +32,17 @@ silly_lib = Library("silly", "FRAGMENT") # noqa -def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: +def silly_attention( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor +) -> None: out.copy_(q) out += k out += v -def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: +def silly_attention_fake( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor +) -> None: return @@ -66,15 +73,14 @@ def compute_hash(self) -> str: factors.append((k, v)) factors.sort() import hashlib - return hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() + + return hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() def __post_init__(self): assert self.mlp_size >= self.hidden_size class LlamaMLP(nn.Module): - def __init__(self, config: LlamaConfig) -> None: super().__init__() self.gate_up_projection = nn.Linear( @@ -89,31 +95,31 @@ def __init__(self, config: LlamaConfig) -> None: ) if config.tractable_init: - nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size]) - nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:]) + nn.init.eye_(self.gate_up_projection.weight.data[: config.mlp_size]) + nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size :]) nn.init.eye_(self.down_projection.weight.data) else: - nn.init.xavier_normal_(self.gate_up_projection.weight.data, - generator=torch.Generator().manual_seed( - config.random_seed), - gain=0.001) - nn.init.xavier_normal_(self.down_projection.weight.data, - generator=torch.Generator().manual_seed( - config.random_seed), - gain=0.001) + nn.init.xavier_normal_( + self.gate_up_projection.weight.data, + generator=torch.Generator().manual_seed(config.random_seed), + gain=0.001, + ) + nn.init.xavier_normal_( + self.down_projection.weight.data, + generator=torch.Generator().manual_seed(config.random_seed), + gain=0.001, + ) def forward(self, x): # for tractable_init and positive input, this is # essentially an elementwise-square x = self.gate_up_projection(x) - x = x[:, 
:x.size(1) // 2] * torch.nn.functional.relu( - x[:, x.size(1) // 2:]) + x = x[:, : x.size(1) // 2] * torch.nn.functional.relu(x[:, x.size(1) // 2 :]) x = self.down_projection(x) return x class LlamaAttention(nn.Module): - def __init__(self, config: LlamaConfig) -> None: super().__init__() self.qkv_projection = nn.Linear( @@ -129,21 +135,25 @@ def __init__(self, config: LlamaConfig) -> None: ) if config.tractable_init: - nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size]) - nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 * - config.hidden_size]) - nn.init.eye_(self.qkv_projection.weight.data[2 * - config.hidden_size:]) + nn.init.eye_(self.qkv_projection.weight.data[: config.hidden_size]) + nn.init.eye_( + self.qkv_projection.weight.data[ + config.hidden_size : 2 * config.hidden_size + ] + ) + nn.init.eye_(self.qkv_projection.weight.data[2 * config.hidden_size :]) nn.init.eye_(self.output_projection.weight.data) else: - nn.init.xavier_normal_(self.qkv_projection.weight.data, - generator=torch.Generator().manual_seed( - config.random_seed), - gain=0.001) - nn.init.xavier_normal_(self.output_projection.weight.data, - generator=torch.Generator().manual_seed( - config.random_seed), - gain=0.001) + nn.init.xavier_normal_( + self.qkv_projection.weight.data, + generator=torch.Generator().manual_seed(config.random_seed), + gain=0.001, + ) + nn.init.xavier_normal_( + self.output_projection.weight.data, + generator=torch.Generator().manual_seed(config.random_seed), + gain=0.001, + ) def forward( self, @@ -167,7 +177,6 @@ def forward( class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig) -> None: super().__init__() self.self_attention = LlamaAttention(config) @@ -187,7 +196,7 @@ def forward( - if residual is not None, the outputs are: - residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3 - hidden_states = (residual + 1) ** 2 - """ # noqa + """ # noqa if residual is None: residual = hidden_states hidden_states = hidden_states + 1 @@ -196,8 +205,9 @@ def forward( residual = hidden_states hidden_states = hidden_states + 1 - hidden_states = self.self_attention(positions=positions, - hidden_states=hidden_states) + hidden_states = self.self_attention( + positions=positions, hidden_states=hidden_states + ) hidden_states = hidden_states + residual residual = hidden_states @@ -209,20 +219,22 @@ def forward( @support_torch_compile class LlamaModel(nn.Module): - - def __init__(self, - *, - vllm_config: VllmConfig, - config: LlamaConfig, - prefix: str = '', - **kwargs) -> None: + def __init__( + self, + *, + vllm_config: VllmConfig, + config: LlamaConfig, + prefix: str = "", + **kwargs, + ) -> None: super().__init__() self.embedding_tokens = nn.Embedding( num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, ) self.layers = nn.ModuleList( - [LlamaDecoderLayer(config) for _ in range(config.num_layers)]) + [LlamaDecoderLayer(config) for _ in range(config.num_layers)] + ) # this is the initial value of the hidden states self.embedding_tokens.weight.data.fill_(config.init_value) @@ -239,34 +251,39 @@ def forward( return hidden_states -def tractable_computation(input_ids: torch.Tensor, - positions: torch.Tensor, - config: LlamaConfig, - init_value: float = 1.0) -> torch.Tensor: - hidden_states = torch.ones(input_ids.size(0), - config.hidden_size, - device=input_ids.device, - dtype=input_ids.dtype) * init_value +def tractable_computation( + input_ids: 
torch.Tensor, + positions: torch.Tensor, + config: LlamaConfig, + init_value: float = 1.0, +) -> torch.Tensor: + hidden_states = ( + torch.ones( + input_ids.size(0), + config.hidden_size, + device=input_ids.device, + dtype=input_ids.dtype, + ) + * init_value + ) # first layer residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 - hidden_states = (residual + 1)**2 + hidden_states = (residual + 1) ** 2 # following layers for _ in range(config.num_layers - 1): hidden_states = hidden_states + residual residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 - hidden_states = (residual + 1)**2 + hidden_states = (residual + 1) ** 2 return hidden_states @torch.inference_mode -def run_model(llama_config, - use_compile: bool, - use_inductor: bool, - split_attn: bool = False) -> torch.Tensor: - +def run_model( + llama_config, use_compile: bool, use_inductor: bool, split_attn: bool = False +) -> torch.Tensor: if use_compile: compilation_config = CompilationConfig( level=CompilationLevel.PIECEWISE, @@ -278,18 +295,22 @@ def run_model(llama_config, compilation_config.splitting_ops = ["silly.attention"] else: compilation_config = CompilationConfig( - level=CompilationLevel.NO_COMPILATION, ) + level=CompilationLevel.NO_COMPILATION, + ) - vllm_config = VllmConfig(compilation_config=compilation_config, - additional_config=llama_config) + vllm_config = VllmConfig( + compilation_config=compilation_config, additional_config=llama_config + ) with set_current_vllm_config(vllm_config): - model = LlamaModel(config=llama_config, - vllm_config=vllm_config, - prefix="").eval().cuda() + model = ( + LlamaModel(config=llama_config, vllm_config=vllm_config, prefix="") + .eval() + .cuda() + ) with set_forward_context({}, vllm_config=vllm_config): B = 16 # max batch size - input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() + input_ids = torch.randint(0, llama_config.vocab_size, (B,)).cuda() positions = torch.arange(B).cuda() model(input_ids, positions) @@ -302,9 +323,9 @@ def run_model(llama_config, output = output.cpu() if llama_config.tractable_init: - expected_output = tractable_computation(input_ids[:2], - positions[:2], - llama_config).cpu() + expected_output = tractable_computation( + input_ids[:2], positions[:2], llama_config + ).cpu() assert torch.allclose(output, expected_output) else: @@ -315,27 +336,23 @@ def run_model(llama_config, def test_toy_llama(use_inductor: bool): # compare output with and without piecewise compilation - llama_config = LlamaConfig(hidden_size=128, - mlp_size=256, - vocab_size=128, - num_layers=12) + llama_config = LlamaConfig( + hidden_size=128, mlp_size=256, vocab_size=128, num_layers=12 + ) - tractable_config = LlamaConfig(hidden_size=128, - mlp_size=256, - vocab_size=128, - num_layers=2, - tractable_init=True) + tractable_config = LlamaConfig( + hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True + ) outputs = [] with compilation_counter.expect( - num_graphs_seen=0, - num_piecewise_graphs_seen=0, - num_piecewise_capturable_graphs_seen=0, - num_backend_compilations=0, - num_cudagraph_captured=0, + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_backend_compilations=0, + num_cudagraph_captured=0, ): - outputs.append( - run_model(llama_config, use_inductor=False, use_compile=False)) + outputs.append(run_model(llama_config, use_inductor=False, use_compile=False)) run_model(tractable_config, use_inductor=False, use_compile=False) if use_inductor: @@ -344,41 +361,41 @@ def 
test_toy_llama(use_inductor: bool): kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} with compilation_counter.expect( - num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=1, - num_piecewise_capturable_graphs_seen=1, - num_backend_compilations=1, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured= - 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - **kwargs, + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=1, + num_piecewise_capturable_graphs_seen=1, + num_backend_compilations=1, # num_piecewise_capturable_graphs_seen + num_cudagraph_captured=2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + **kwargs, ): outputs.append( - run_model(llama_config, - use_inductor=use_inductor, - use_compile=True)) + run_model(llama_config, use_inductor=use_inductor, use_compile=True) + ) run_model(tractable_config, use_inductor=use_inductor, use_compile=True) with compilation_counter.expect( - num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=2 * llama_config.num_layers + - 1, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=1 + - llama_config.num_layers, # 1 + num_layers - num_backend_compilations=1 + - llama_config.num_layers, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured=2 * - (1 + llama_config.num_layers - ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=2 * llama_config.num_layers + 1, # 2 * num_layers + 1 + num_piecewise_capturable_graphs_seen=1 + + llama_config.num_layers, # 1 + num_layers + num_backend_compilations=1 + + llama_config.num_layers, # num_piecewise_capturable_graphs_seen + num_cudagraph_captured=2 + * ( + 1 + llama_config.num_layers + ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): outputs.append( - run_model(llama_config, - use_inductor=use_inductor, - use_compile=True, - split_attn=True)) - run_model(tractable_config, - use_inductor=use_inductor, - use_compile=True, - split_attn=True) + run_model( + llama_config, + use_inductor=use_inductor, + use_compile=True, + split_attn=True, + ) + ) + run_model( + tractable_config, use_inductor=use_inductor, use_compile=True, split_attn=True + ) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) @@ -389,17 +406,15 @@ def benchmark(): from triton.testing import do_bench # similar to llama 3.1-8B - llama_config = LlamaConfig(hidden_size=4096, - mlp_size=14336, - vocab_size=128 * 1024, - num_layers=32) + llama_config = LlamaConfig( + hidden_size=4096, mlp_size=14336, vocab_size=128 * 1024, num_layers=32 + ) # a tiny model to measure the overhead # of piecewise cudagraph - llama_config = LlamaConfig(hidden_size=40, - mlp_size=80, - vocab_size=128, - num_layers=2) + llama_config = LlamaConfig( + hidden_size=40, mlp_size=80, vocab_size=128, num_layers=2 + ) cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)] @@ -425,12 +440,15 @@ def benchmark(): vllm_config = VllmConfig(compilation_config=compilation_config) with set_current_vllm_config(vllm_config): - model = LlamaModel(config=llama_config, - vllm_config=vllm_config, - prefix="").eval().cuda().to(torch.bfloat16) + model = ( + LlamaModel(config=llama_config, vllm_config=vllm_config, prefix="") + .eval() + .cuda() + .to(torch.bfloat16) + ) B = 256 # max batch size - input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() + input_ids = torch.randint(0, llama_config.vocab_size, (B,)).cuda() 
positions = torch.arange(B).cuda().to(torch.bfloat16) graphs = {} @@ -452,21 +470,25 @@ def benchmark(): # and use it later, because it will look up the name `b` in the # enclosing scope, and the value of `b` will always be 256. # it is fine here, because we only use the lambda function once. - runtime = do_bench(lambda: graphs[b][0] # noqa - (input_ids[:b], positions[:b])) # noqa + runtime = do_bench( + lambda: graphs[b][0]( # noqa + input_ids[:b], positions[:b] + ) + ) # noqa piecewise_cudagraph_time[b] = runtime else: runtime = do_bench(lambda: graphs[b][0].replay()) # noqa - eager_runtime = do_bench( - lambda: model(input_ids[:b], positions[:b])) # noqa + eager_runtime = do_bench(lambda: model(input_ids[:b], positions[:b])) # noqa full_cudagraph_time[b] = runtime eager_time[b] = eager_runtime # print in tabular format print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph") for b in cudagraph_sizes: - print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" - f"\t{piecewise_cudagraph_time[b]:.3f}") + print( + f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" + f"\t{piecewise_cudagraph_time[b]:.3f}" + ) if __name__ == "__main__": diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 62804e721e3d..78ac35a28ba6 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -8,18 +8,30 @@ import vllm.envs as envs from vllm.compilation.collective_fusion import AsyncTPPass -from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig, - PassConfig, VllmConfig) -from vllm.distributed import (tensor_model_parallel_all_gather, - tensor_model_parallel_reduce_scatter) -from vllm.distributed.parallel_state import (init_distributed_environment, - initialize_model_parallel) +from vllm.config import ( + CompilationConfig, + DeviceConfig, + ModelConfig, + PassConfig, + VllmConfig, +) +from vllm.distributed import ( + tensor_model_parallel_all_gather, + tensor_model_parallel_reduce_scatter, +) +from vllm.distributed.parallel_state import ( + init_distributed_environment, + initialize_model_parallel, +) from vllm.platforms import current_platform from vllm.utils import update_environment_variables from ..models.registry import HF_EXAMPLE_MODELS -from ..utils import (compare_two_settings, create_new_process_for_each_test, - multi_gpu_test) +from ..utils import ( + compare_two_settings, + create_new_process_for_each_test, + multi_gpu_test, +) from .backend import TestBackend prompts = [ @@ -31,20 +43,19 @@ class TestMMRSModel(torch.nn.Module): - def __init__(self, hidden_size=16): super().__init__() self.hidden_size = hidden_size - self.gate_proj = torch.nn.Parameter(torch.empty( - (self.hidden_size * 2, hidden_size)), - requires_grad=False) + self.gate_proj = torch.nn.Parameter( + torch.empty((self.hidden_size * 2, hidden_size)), requires_grad=False + ) # Initialize weights torch.nn.init.normal_(self.gate_proj, std=0.02) def forward(self, hidden_states): """ Forward pass implementing the mm + reduce scatter in the FX graph - + """ # Reshape input view = hidden_states.reshape(-1, self.hidden_size) @@ -63,13 +74,12 @@ def ops_in_model_after(self): class TestAGMMModel(torch.nn.Module): - def __init__(self, hidden_size=16): super().__init__() self.hidden_size = hidden_size - self.weight = torch.nn.Parameter(torch.empty( - (hidden_size, hidden_size)), - requires_grad=False) + self.weight = torch.nn.Parameter( + torch.empty((hidden_size, hidden_size)), requires_grad=False + ) # Initialize weights 
torch.nn.init.normal_(self.weight, std=0.02) @@ -97,28 +107,33 @@ def ops_in_model_after(self): @pytest.mark.parametrize("seq_len", [16]) @pytest.mark.parametrize("hidden_size", [16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], - reason="Only test on CUDA") -def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int, - hidden_size: int, dtype: torch.dtype): +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") +def test_async_tp_pass_replace( + test_model: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype +): num_processes = 2 def run_torch_spawn(fn, nprocs): # need to use torch.mp.spawn otherwise will have problems with # torch.distributed and cuda - torch.multiprocessing.spawn(fn, - args=(num_processes, test_model, - batch_size, seq_len, hidden_size, - dtype), - nprocs=nprocs) + torch.multiprocessing.spawn( + fn, + args=(num_processes, test_model, batch_size, seq_len, hidden_size, dtype), + nprocs=nprocs, + ) run_torch_spawn(async_tp_pass_on_test_model, num_processes) -def async_tp_pass_on_test_model(local_rank: int, world_size: int, - test_model_cls: torch.nn.Module, - batch_size: int, seq_len: int, - hidden_size: int, dtype: torch.dtype): +def async_tp_pass_on_test_model( + local_rank: int, + world_size: int, + test_model_cls: torch.nn.Module, + batch_size: int, + seq_len: int, + hidden_size: int, + dtype: torch.dtype, +): current_platform.seed_everything(0) device = torch.device(f"cuda:{local_rank}") @@ -126,13 +141,15 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int, torch.set_default_device(device) torch.set_default_dtype(dtype) - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': '12345', - }) + update_environment_variables( + { + "RANK": str(local_rank), + "LOCAL_RANK": str(local_rank), + "WORLD_SIZE": str(world_size), + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12345", + } + ) # initialize distributed init_distributed_environment() @@ -140,29 +157,34 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int, # configure vllm config for SequenceParallelismPass vllm_config = VllmConfig() - vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig( - enable_async_tp=True, ), ) + vllm_config.compilation_config = CompilationConfig( + pass_config=PassConfig( + enable_async_tp=True, + ), + ) vllm_config.device_config = DeviceConfig(device=torch.device("cuda")) # this is a fake model name to construct the model config # in the vllm_config, it's not really used. 
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" - vllm_config.model_config = ModelConfig(model=model_name, - task="auto", - tokenizer=model_name, - tokenizer_mode="auto", - trust_remote_code=True, - dtype=dtype, - seed=42) + vllm_config.model_config = ModelConfig( + model=model_name, + task="auto", + tokenizer=model_name, + tokenizer_mode="auto", + trust_remote_code=True, + dtype=dtype, + seed=42, + ) async_tp_pass = AsyncTPPass(vllm_config) backend = TestBackend(async_tp_pass) model = test_model_cls(hidden_size) - hidden_states = torch.randn((batch_size * seq_len, hidden_size), - dtype=dtype, - requires_grad=False) + hidden_states = torch.randn( + (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False + ) compiled_model = torch.compile(model, backend=backend) compiled_model(hidden_states) @@ -210,12 +232,10 @@ def test_async_tp_pass_correctness( common_args.append("--enforce-eager") compilation_config = { - 'level': 3, - 'compile_sizes': [2, 4, 8], - 'splitting_ops': [], - 'pass_config': { - 'enable_async_tp': async_tp_enabled - }, + "level": 3, + "compile_sizes": [2, 4, 8], + "splitting_ops": [], + "pass_config": {"enable_async_tp": async_tp_enabled}, } async_tp_env = tp_env = { @@ -240,9 +260,6 @@ def test_async_tp_pass_correctness( "mp", ] - compare_two_settings(model_id, - async_tp_args, - tp_args, - async_tp_env, - tp_env, - method="generate") + compare_two_settings( + model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate" + ) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 1ee9b234d9f4..35d7bccb8b63 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -62,8 +62,12 @@ class TestSetting: TestSetting( model="BAAI/bge-multilingual-gemma2", model_args=[ - "--task", "embed", "--dtype", "bfloat16", "--max-model-len", - "2048" + "--task", + "embed", + "--dtype", + "bfloat16", + "--max-model-len", + "2048", ], pp_size=1, tp_size=1, @@ -92,7 +96,8 @@ class TestSetting: method="generate_with_image", fullgraph=False, ), - ]) + ], +) def test_compile_correctness( monkeypatch: pytest.MonkeyPatch, test_setting: TestSetting, @@ -108,23 +113,28 @@ def test_compile_correctness( method = test_setting.method fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: - pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got " - f"{cuda_device_count_stateless()}") + pytest.skip( + f"Need exactly {pp_size}*{tp_size} CUDA gpus but got " + f"{cuda_device_count_stateless()}" + ) with monkeypatch.context() as m: m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) final_args = [ - "--enforce-eager", *model_args, "-pp", - str(pp_size), "-tp", - str(tp_size) + "--enforce-eager", + *model_args, + "-pp", + str(pp_size), + "-tp", + str(tp_size), ] all_args: list[list[str]] = [] all_envs: list[dict[str, str] | None] = [] for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.PIECEWISE, + CompilationLevel.NO_COMPILATION, + CompilationLevel.PIECEWISE, ]: all_args.append(final_args + [f"-O{level}"]) all_envs.append({}) @@ -135,20 +145,20 @@ def test_compile_correctness( model, all_args, all_envs, - method=method if method != "generate" else "generate_close") + method=method if method != "generate" else "generate_close", + ) all_envs.clear() all_args.clear() for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, + CompilationLevel.NO_COMPILATION, + 
CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, ]: all_args.append(final_args + [f"-O{level}"]) all_envs.append({}) if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: # "DYNAMO_ONCE" will always use fullgraph - all_envs[-1][ - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + all_envs[-1]["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 0ba59f4b5a05..4266cff7a85f 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -9,11 +9,11 @@ def test_version(): - assert _is_torch_equal_or_newer('2.8.0.dev20250624+cu128', '2.8.0.dev') - assert _is_torch_equal_or_newer('2.8.0a0+gitc82a174', '2.8.0.dev') - assert _is_torch_equal_or_newer('2.8.0', '2.8.0.dev') - assert _is_torch_equal_or_newer('2.8.1', '2.8.0.dev') - assert not _is_torch_equal_or_newer('2.7.1', '2.8.0.dev') + assert _is_torch_equal_or_newer("2.8.0.dev20250624+cu128", "2.8.0.dev") + assert _is_torch_equal_or_newer("2.8.0a0+gitc82a174", "2.8.0.dev") + assert _is_torch_equal_or_newer("2.8.0", "2.8.0.dev") + assert _is_torch_equal_or_newer("2.8.1", "2.8.0.dev") + assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev") def test_use_cudagraphs_dynamic(monkeypatch): @@ -21,7 +21,7 @@ def test_use_cudagraphs_dynamic(monkeypatch): vllm_config = VllmConfig() assert vllm_config.compilation_config.use_cudagraph - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") vllm_config = VllmConfig() assert not vllm_config.compilation_config.use_cudagraph @@ -34,19 +34,23 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val): assert vllm.envs.VLLM_USE_V1 # spawn means that the counters are in the same process. 
- monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn") - monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val) + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val) compilation_config = { "use_cudagraph": False, # speed things up a bit } with ( - compilation_counter.expect(num_cache_entries_updated=0, - num_compiled_artifacts_saved=0), - # loading the model causes compilation (if enabled) to happen - vllm_runner('facebook/opt-125m', - compilation_config=compilation_config, - gpu_memory_utilization=0.4) as _): + compilation_counter.expect( + num_cache_entries_updated=0, num_compiled_artifacts_saved=0 + ), + # loading the model causes compilation (if enabled) to happen + vllm_runner( + "facebook/opt-125m", + compilation_config=compilation_config, + gpu_memory_utilization=0.4, + ) as _, + ): pass @@ -55,20 +59,23 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): assert vllm.envs.VLLM_USE_V1 # Disable multiprocessing so that the counter is in the same process - monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") compilation_config = { "cudagraph_capture_sizes": [100], "use_cudagraph": enabled, } with ( - compilation_counter.expect( - num_graphs_seen=1, - num_gpu_runner_capture_triggers=1 if enabled else 0, - num_cudagraph_captured=13 if enabled else 0, - ), - # loading the model causes compilation (if enabled) to happen - vllm_runner('facebook/opt-125m', - compilation_config=compilation_config, - gpu_memory_utilization=0.4) as _): + compilation_counter.expect( + num_graphs_seen=1, + num_gpu_runner_capture_triggers=1 if enabled else 0, + num_cudagraph_captured=13 if enabled else 0, + ), + # loading the model causes compilation (if enabled) to happen + vllm_runner( + "facebook/opt-125m", + compilation_config=compilation_config, + gpu_memory_utilization=0.4, + ) as _, + ): pass diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 72f962ed7484..3707cb196eeb 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -20,53 +20,67 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): TEST_MODELS: list[tuple[str, dict[str, Any]]] = [ ("facebook/opt-125m", {}), - ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { - "dtype": torch.float16, - }), - ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { - "dtype": torch.float16, - }), + ( + "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", + { + "dtype": torch.float16, + }, + ), + ( + "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", + { + "dtype": torch.float16, + }, + ), ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}), ("meta-llama/Llama-3.2-1B-Instruct", {}), ] if all: if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) + TEST_MODELS.append( + ("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {"quantization": "aqlm"}) + ) # TODO: figure out why this fails. 
if False and is_quant_method_supported("gguf"): # noqa: SIM223 - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { - "quantization": "gguf" - })) + TEST_MODELS.append( + ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {"quantization": "gguf"}) + ) if is_quant_method_supported("gptq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { - "quantization": "gptq" - })) + TEST_MODELS.append( + ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {"quantization": "gptq"}) + ) if is_quant_method_supported("gptq_marlin"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { - "quantization": "gptq_marlin" - })) + TEST_MODELS.append( + ( + "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", + {"quantization": "gptq_marlin"}, + ) + ) if is_quant_method_supported("gptq_marlin_24"): - TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { - "quantization": "gptq_marlin_24" - })) + TEST_MODELS.append( + ( + "alexm-nm/tinyllama-24-marlin24-4bit-g128", + {"quantization": "gptq_marlin_24"}, + ) + ) if is_quant_method_supported("marlin"): TEST_MODELS.append( - ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" - })) + ( + "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", + {"quantization": "marlin"}, + ) + ) if not current_platform.is_rocm() and is_quant_method_supported("awq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { - "quantization": "AWQ" - })) + TEST_MODELS.append( + ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {"quantization": "AWQ"}) + ) if keywords is None: return TEST_MODELS @@ -102,22 +116,34 @@ def test_full_graph( "compilation_config, model_info", [ # additional compile sizes, only some of the models - (CompilationConfig(level=CompilationLevel.PIECEWISE, - compile_sizes=[1, 2]), model) + ( + CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]), + model, + ) for model in models_list(all=False) - ] + [ + ] + + [ # RMSNorm + quant fusion, only 8-bit quant models - (CompilationConfig(level=CompilationLevel.PIECEWISE, - custom_ops=["+rms_norm"], - pass_config=PassConfig(enable_fusion=True, - enable_noop=True)), model) + ( + CompilationConfig( + level=CompilationLevel.PIECEWISE, + custom_ops=["+rms_norm"], + pass_config=PassConfig(enable_fusion=True, enable_noop=True), + ), + model, + ) for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"]) - ] + [ + ] + + [ # Test depyf integration works - (CompilationConfig(level=CompilationLevel.PIECEWISE, - debug_dump_path=tempfile.gettempdir()), - ("facebook/opt-125m", {})), - ]) + ( + CompilationConfig( + level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir() + ), + ("facebook/opt-125m", {}), + ), + ], +) # only test some of the models @create_new_process_for_each_test() def test_custom_compile_config( @@ -129,8 +155,11 @@ def test_custom_compile_config( run_model(compilation_config, model, model_kwargs) -def run_model(compile_config: Union[int, CompilationConfig], model: str, - model_kwargs: dict[str, Any]): +def run_model( + compile_config: Union[int, CompilationConfig], + model: str, + model_kwargs: dict[str, Any], +): prompts = [ "Hello, my name is", "The president of the United States is", diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index aade29b99de7..1096d5744dbc 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -8,8 +8,13 @@ from vllm import LLM, SamplingParams from vllm.compilation.activation_quant_fusion import 
ActivationQuantFusionPass from vllm.compilation.fix_functionalization import FixFunctionalizationPass -from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey, - kFp8DynamicTokenSym, kFp8StaticTensorSym) +from vllm.compilation.fusion import ( + FUSED_OPS, + FusionPass, + QuantKey, + kFp8DynamicTokenSym, + kFp8StaticTensorSym, +) from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.config import CompilationConfig, PassConfig, VllmConfig @@ -26,7 +31,7 @@ RMS_QUANT_OPS = { "static_fp8": [ torch.ops._C.rms_norm_static_fp8_quant.default, - torch.ops._C.fused_add_rms_norm_static_fp8_quant.default + torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, ], } @@ -43,25 +48,27 @@ @pytest.mark.parametrize( "model, quant_key", - [("nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e", kFp8StaticTensorSym), - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8_DYNAMIC-e2e", - kFp8DynamicTokenSym)]) + [ + ("nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e", kFp8StaticTensorSym), + ("nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8_DYNAMIC-e2e", kFp8DynamicTokenSym), + ], +) @pytest.mark.parametrize("do_fusion", [True, False]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", - reason="Only test on CUDA") -def test_fix_functionalization(model: str, quant_key: QuantKey, - do_fusion: bool): +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", reason="Only test on CUDA") +def test_fix_functionalization(model: str, quant_key: QuantKey, do_fusion: bool): torch.set_default_device("cuda") vllm_config = VllmConfig() vllm_config.compilation_config = CompilationConfig( - pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True)) + pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True) + ) noop_pass = NoOpEliminationPass(vllm_config) fusion_pass = FusionPass.instance(vllm_config) act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config) - passes = [noop_pass, fusion_pass, act_quant_fusion_pass - ] if do_fusion else [noop_pass] + passes = ( + [noop_pass, fusion_pass, act_quant_fusion_pass] if do_fusion else [noop_pass] + ) func_pass = FixFunctionalizationPass(vllm_config) backend_func = TestBackend(*passes, func_pass) backend_no_func = TestBackend(*passes) @@ -76,14 +83,12 @@ def test_fix_functionalization(model: str, quant_key: QuantKey, # 2 LLM instances. sampling_params = SamplingParams(temperature=0.0, top_p=1.0) - model_runner.model = torch.compile(orig_model, - fullgraph=True, - backend=backend_func) + model_runner.model = torch.compile(orig_model, fullgraph=True, backend=backend_func) gen_func = llm.generate(prompts, sampling_params) - model_runner.model = torch.compile(orig_model, - fullgraph=True, - backend=backend_no_func) + model_runner.model = torch.compile( + orig_model, fullgraph=True, backend=backend_no_func + ) gen_no_func = llm.generate(prompts, sampling_params) @@ -92,19 +97,22 @@ def test_fix_functionalization(model: str, quant_key: QuantKey, # OPS_IN_MODEL always appear. RMS_OP is fused away if we run fusion, # and replaced by fused quantized ops in RMS_QUANT_OPS. 
- rms_ops = [FUSED_OPS[(quant_key, True)], FUSED_OPS[(quant_key, False)] - ] if do_fusion else [RMS_OP] - silu_mul_ops = [SILU_MUL_QUANT_OP] if do_fusion and \ - quant_key == kFp8StaticTensorSym else [ - SILU_MUL_OP - ] + rms_ops = ( + [FUSED_OPS[(quant_key, True)], FUSED_OPS[(quant_key, False)]] + if do_fusion + else [RMS_OP] + ) + silu_mul_ops = ( + [SILU_MUL_QUANT_OP] + if do_fusion and quant_key == kFp8StaticTensorSym + else [SILU_MUL_OP] + ) ops = OPS_IN_MODEL + rms_ops + silu_mul_ops for op in ops: find_auto_fn(backend_no_func.graph_post_pass.nodes, op) - assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, - op) is None # noqa: E501 + assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501 # make sure the ops were all de-functionalized found = dict() diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 4a3820e20fd8..399d3045cd87 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -6,14 +6,22 @@ import vllm.envs as envs import vllm.plugins -from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey, - FusionPass, GroupShape, QuantKey) +from vllm.compilation.fusion import ( + FUSED_OPS, + QUANT_OPS, + FusedRMSQuantKey, + FusionPass, + GroupShape, + QuantKey, +) from vllm.compilation.noop_elimination import NoOpEliminationPass -from vllm.config import (CompilationConfig, CompilationLevel, PassConfig, - VllmConfig) +from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity) + CUTLASS_FP8_SUPPORTED, + Fp8LinearOp, + maybe_create_device_identity, +) from vllm.platforms import current_platform from .backend import TestBackend @@ -22,18 +30,23 @@ class TestModel(torch.nn.Module): - - def __init__(self, hidden_size: int, eps: float, static: bool, - cutlass_fp8_enabled: bool, *args, **kwargs): + def __init__( + self, + hidden_size: int, + eps: float, + static: bool, + cutlass_fp8_enabled: bool, + *args, + **kwargs, + ): super().__init__(*args, **kwargs) self.cutlass_fp8_enabled = cutlass_fp8_enabled self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN - self.key = QuantKey(dtype=FP8_DTYPE, - static=static, - group_shape=group_shape, - symmetric=True) + self.key = QuantKey( + dtype=FP8_DTYPE, static=static, group_shape=group_shape, symmetric=True + ) if static: self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] else: @@ -52,17 +65,15 @@ def forward(self, x): resid = torch.sqrt(x) y = self.norm[0](x) - x2 = self.fp8_linear.apply(y, - self.w[0], - self.wscale[0], - input_scale=self.scale[0]) + x2 = self.fp8_linear.apply( + y, self.w[0], self.wscale[0], input_scale=self.scale[0] + ) # make sure resid is used for replacement to work y2, resid = self.norm[1](x2, resid) - x3 = self.fp8_linear.apply(y2, - self.w[1], - self.wscale[1], - input_scale=self.scale[1]) + x3 = self.fp8_linear.apply( + y2, self.w[1], self.wscale[1], input_scale=self.scale[1] + ) y3, resid = self.norm[2](x3, resid) # use resid here return y3 @@ -72,7 +83,7 @@ def ops_in_model_before(self): def ops_in_model_after(self): return [ FUSED_OPS[FusedRMSQuantKey(self.key, False)], - FUSED_OPS[FusedRMSQuantKey(self.key, True)] + 
FUSED_OPS[FusedRMSQuantKey(self.key, True)], ] @@ -81,22 +92,27 @@ def ops_in_model_after(self): @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("static", [True, False]) -@pytest.mark.parametrize("cutlass_fp8_enabled", - [True, False] if CUTLASS_FP8_SUPPORTED else [False]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], - reason="Only test on CUDA and ROCm") -def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, - cutlass_fp8_enabled): +@pytest.mark.parametrize( + "cutlass_fp8_enabled", [True, False] if CUTLASS_FP8_SUPPORTED else [False] +) +@pytest.mark.skipif( + envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm" +) +def test_fusion_rmsnorm_quant( + dtype, hidden_size, num_tokens, eps, static, cutlass_fp8_enabled +): torch.set_default_device("cuda") torch.set_default_dtype(dtype) torch.manual_seed(1) maybe_create_device_identity() # needed for certain non-cutlass fp8 paths - vllm_config = VllmConfig(compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - custom_ops=["+rms_norm", "+quant_fp8"], - pass_config=PassConfig(enable_fusion=True, enable_noop=True), - )) + vllm_config = VllmConfig( + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + custom_ops=["+rms_norm", "+quant_fp8"], + pass_config=PassConfig(enable_fusion=True, enable_noop=True), + ) + ) with vllm.config.set_current_vllm_config(vllm_config): # Reshape pass is needed for the fusion pass to work noop_pass = NoOpEliminationPass(vllm_config) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 492e90f2a75f..26c8f4c70177 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -7,11 +7,19 @@ import vllm.envs as envs from vllm.compilation.collective_fusion import AllReduceFusionPass -from vllm.config import (CompilationConfig, CompilationLevel, DeviceConfig, - ModelConfig, PassConfig, VllmConfig) +from vllm.config import ( + CompilationConfig, + CompilationLevel, + DeviceConfig, + ModelConfig, + PassConfig, + VllmConfig, +) from vllm.distributed import tensor_model_parallel_all_reduce -from vllm.distributed.parallel_state import (init_distributed_environment, - initialize_model_parallel) +from vllm.distributed.parallel_state import ( + init_distributed_environment, + initialize_model_parallel, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.platforms import current_platform from vllm.utils import update_environment_variables @@ -21,7 +29,6 @@ class TestAllReduceRMSNormModel(torch.nn.Module): - def __init__(self, hidden_size=16, eps=1e-6): super().__init__() self.hidden_size = hidden_size @@ -42,7 +49,6 @@ def ops_in_model_after(self): class TestAllReduceFusedAddRMSNormModel(torch.nn.Module): - def __init__(self, hidden_size=16, eps=1e-6): super().__init__() self.hidden_size = hidden_size @@ -64,37 +70,45 @@ def ops_in_model_after(self): @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( - "test_model", - [TestAllReduceRMSNormModel, TestAllReduceFusedAddRMSNormModel]) + "test_model", [TestAllReduceRMSNormModel, TestAllReduceFusedAddRMSNormModel] +) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [8]) @pytest.mark.parametrize("hidden_size", [4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], - 
reason="Only test on CUDA") -@pytest.mark.skipif(not find_spec("flashinfer"), - reason="flashinfer is not installed") -@pytest.mark.skipif(not current_platform.is_device_capability(100), - reason="Only test on SM100") -def test_all_reduce_fusion_pass_replace(test_model: torch.nn.Module, - batch_size: int, seq_len: int, - hidden_size: int, dtype: torch.dtype): +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") +@pytest.mark.skipif(not find_spec("flashinfer"), reason="flashinfer is not installed") +@pytest.mark.skipif( + not current_platform.is_device_capability(100), reason="Only test on SM100" +) +def test_all_reduce_fusion_pass_replace( + test_model: torch.nn.Module, + batch_size: int, + seq_len: int, + hidden_size: int, + dtype: torch.dtype, +): num_processes = 2 def run_torch_spawn(fn, nprocs): - torch.multiprocessing.spawn(fn, - args=(num_processes, test_model, - batch_size, seq_len, hidden_size, - dtype), - nprocs=nprocs) + torch.multiprocessing.spawn( + fn, + args=(num_processes, test_model, batch_size, seq_len, hidden_size, dtype), + nprocs=nprocs, + ) run_torch_spawn(all_reduce_fusion_pass_on_test_model, num_processes) -def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int, - test_model_cls: torch.nn.Module, - batch_size: int, seq_len: int, - hidden_size: int, dtype: torch.dtype): +def all_reduce_fusion_pass_on_test_model( + local_rank: int, + world_size: int, + test_model_cls: torch.nn.Module, + batch_size: int, + seq_len: int, + hidden_size: int, + dtype: torch.dtype, +): current_platform.seed_everything(0) device = torch.device(f"cuda:{local_rank}") @@ -102,45 +116,53 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int, torch.set_default_device(device) torch.set_default_dtype(dtype) - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': '12345', - }) + update_environment_variables( + { + "RANK": str(local_rank), + "LOCAL_RANK": str(local_rank), + "WORLD_SIZE": str(world_size), + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12345", + } + ) init_distributed_environment() initialize_model_parallel(tensor_model_parallel_size=world_size) vllm_config = VllmConfig( - compilation_config=CompilationConfig(level=CompilationLevel.PIECEWISE, - custom_ops=["+rms_norm"], - compile_sizes=[2, 4, 8])) + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + custom_ops=["+rms_norm"], + compile_sizes=[2, 4, 8], + ) + ) vllm_config.compilation_config.pass_config = PassConfig( - enable_fi_allreduce_fusion=True) + enable_fi_allreduce_fusion=True + ) vllm_config.device_config = DeviceConfig(device=torch.device("cuda")) # this is a fake model name to construct the model config # in the vllm_config, it's not really used. 
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" - vllm_config.model_config = ModelConfig(model=model_name, - task="auto", - tokenizer=model_name, - tokenizer_mode="auto", - trust_remote_code=True, - dtype=dtype, - seed=42) + vllm_config.model_config = ModelConfig( + model=model_name, + task="auto", + tokenizer=model_name, + tokenizer_mode="auto", + trust_remote_code=True, + dtype=dtype, + seed=42, + ) all_reduce_fusion_pass = AllReduceFusionPass(vllm_config) backend = TestBackend(all_reduce_fusion_pass) model = test_model_cls(hidden_size) - hidden_states = torch.randn((batch_size * seq_len, hidden_size), - requires_grad=False) - residual = torch.randn((batch_size * seq_len, hidden_size), - requires_grad=False) + hidden_states = torch.randn( + (batch_size * seq_len, hidden_size), requires_grad=False + ) + residual = torch.randn((batch_size * seq_len, hidden_size), requires_grad=False) compiled_model = torch.compile(model, backend=backend) compiled_model(hidden_states, residual) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 70750eb9ac4e..a5a0edd85624 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -21,15 +21,18 @@ @pytest.mark.parametrize( - "model, quant_key", - [("amd/Llama-3.1-8B-Instruct-FP8-KV", kFp8StaticTensorSym)]) + "model, quant_key", [("amd/Llama-3.1-8B-Instruct-FP8-KV", kFp8StaticTensorSym)] +) @pytest.mark.parametrize( - "use_triton_fa", [True, False] if current_platform.is_rocm() else [False]) + "use_triton_fa", [True, False] if current_platform.is_rocm() else [False] +) @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8") -@pytest.mark.skipif(not current_platform.is_cuda_alike(), - reason="Only test CUDA and ROCm") -def test_attention_fusion(example_prompts, monkeypatch, model: str, - quant_key: QuantKey, use_triton_fa: bool): +@pytest.mark.skipif( + not current_platform.is_cuda_alike(), reason="Only test CUDA and ROCm" +) +def test_attention_fusion( + example_prompts, monkeypatch, model: str, quant_key: QuantKey, use_triton_fa: bool +): # Clean Dynamo cache to avoid reusing other test cases # (for some reason the reset at the end is not enough) torch._dynamo.reset() @@ -55,15 +58,15 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str, vllm_config = VllmConfig(compilation_config=compile_config) backend_unfused = TestBackend(NoOpEliminationPass(vllm_config)) - llm = LLM(model, - enforce_eager=True, - compilation_config=compile_config, - gpu_memory_utilization=0.9, - max_model_len=2048) + llm = LLM( + model, + enforce_eager=True, + compilation_config=compile_config, + gpu_memory_utilization=0.9, + max_model_len=2048, + ) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=10, - top_p=0.95) + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_p=0.95) unfused_output = llm.generate(prompts, sampling_params) backend_unfused = None # Reset backend to make sure llm gets released @@ -82,17 +85,19 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str, # so we initialize it during compilation. 
attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw) backend = TestBackend(NoOpEliminationPass(vllm_config), attn_pass) - llm2 = LLM(model, - enforce_eager=True, - compilation_config=compile_config, - gpu_memory_utilization=0.9, - max_model_len=2048) + llm2 = LLM( + model, + enforce_eager=True, + compilation_config=compile_config, + gpu_memory_utilization=0.9, + max_model_len=2048, + ) # check support attn_fusion_supported = [ - layer.impl.fused_output_quant_supported(quant_key.dtype, - quant_key.static, - quant_key.group_shape) + layer.impl.fused_output_quant_supported( + quant_key.dtype, quant_key.static, quant_key.group_shape + ) for key, layer in compile_config.static_forward_context.items() ] @@ -109,9 +114,9 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str, for i in range(len(attn_nodes_pre)): assert attn_nodes_pre[i].kwargs["output_scale"] is None fused = attn_nodes_post[i].kwargs["output_scale"] is not None - assert fused == attn_fusion_supported[i], \ - f"Node {i} {'' if fused else 'not '} expected " \ - f"to have fused output quant" + assert fused == attn_fusion_supported[i], ( + f"Node {i} {'' if fused else 'not '} expected to have fused output quant" + ) # check outputs fused_output = llm2.generate(prompts, sampling_params) diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 251cc46e9e98..ac561d2e8f84 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -28,7 +28,6 @@ def test_bad_callable(): # Pass that inherits from InductorPass class ProperPass(InductorPass): - def __call__(self, graph: torch.fx.graph.Graph) -> None: pass @@ -39,8 +38,7 @@ def __call__(self, graph: torch.fx.graph.Graph) -> None: ProperPass(), # Can also wrap callables in CallableInductorPass for compliance CallableInductorPass(simple_callable), - CallableInductorPass(simple_callable, - InductorPass.hash_source(__file__)) + CallableInductorPass(simple_callable, InductorPass.hash_source(__file__)), ], ) def test_pass_manager_uuid(callable): @@ -65,8 +63,9 @@ def test_pass_manager_uuid(callable): # UUID should be different due to config change config2 = copy.deepcopy(config) - config2.compilation_config.pass_config.enable_fusion = not \ - config2.compilation_config.pass_config.enable_fusion + config2.compilation_config.pass_config.enable_fusion = ( + not config2.compilation_config.pass_config.enable_fusion + ) pass_manager3 = PostGradPassManager() pass_manager3.configure(config2) pass_manager3.add(callable) diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index b56edfc90612..4251ae7a9a37 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -10,14 +10,20 @@ from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.sequence_parallelism import SequenceParallelismPass -from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig, - PassConfig, VllmConfig) +from vllm.config import ( + CompilationConfig, + DeviceConfig, + ModelConfig, + PassConfig, + VllmConfig, +) from vllm.distributed import tensor_model_parallel_all_reduce -from vllm.distributed.parallel_state import (init_distributed_environment, - initialize_model_parallel) +from vllm.distributed.parallel_state import ( + init_distributed_environment, + initialize_model_parallel, +) from vllm.model_executor.layers.layernorm 
import RMSNorm -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - Fp8LinearOp) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.platforms import current_platform from vllm.utils import update_environment_variables @@ -34,16 +40,15 @@ class TestModel(torch.nn.Module): - - def __init__(self, - hidden_size=16, - intermediate_size=32, - vllm_config: VllmConfig = None): + def __init__( + self, hidden_size=16, intermediate_size=32, vllm_config: VllmConfig = None + ): super().__init__() self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.gate_proj = torch.nn.Parameter( - torch.empty((intermediate_size, hidden_size))) + torch.empty((intermediate_size, hidden_size)) + ) self.norm = RMSNorm(intermediate_size, 1e-05) # Initialize weights torch.nn.init.normal_(self.gate_proj, std=0.02) @@ -51,18 +56,18 @@ def __init__(self, def forward(self, hidden_states, residual): """ Forward pass implementing the operations in the FX graph - + Args: hidden_states: Input tensor residual: Residual tensor from previous layer - + Returns: Tuple containing the output tensor """ # Reshape input view = hidden_states.reshape(-1, self.hidden_size) - #matrix multiplication + # matrix multiplication permute = self.gate_proj.permute(1, 0) mm = torch.mm(view, permute) @@ -80,7 +85,7 @@ def ops_in_model_before(self): def ops_in_model_after(self): return [ torch.ops.vllm.reduce_scatter.default, - torch.ops.vllm.all_gather.default + torch.ops.vllm.all_gather.default, ] def ops_in_model(self): @@ -88,47 +93,45 @@ def ops_in_model(self): class TestQuantModel(torch.nn.Module): - - def __init__(self, - hidden_size=16, - intermediate_size=32, - vllm_config: VllmConfig = None): + def __init__( + self, hidden_size=16, intermediate_size=32, vllm_config: VllmConfig = None + ): super().__init__() self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.vllm_config = vllm_config - self.gate_proj = torch.nn.Parameter(torch.empty( - (intermediate_size, hidden_size)), - requires_grad=False) + self.gate_proj = torch.nn.Parameter( + torch.empty((intermediate_size, hidden_size)), requires_grad=False + ) self.norm = RMSNorm(intermediate_size, 1e-05) # Initialize weights torch.nn.init.normal_(self.gate_proj, std=0.02) - self.fp8_linear = Fp8LinearOp(cutlass_fp8_supported=True, - use_per_token_if_dynamic=False) + self.fp8_linear = Fp8LinearOp( + cutlass_fp8_supported=True, use_per_token_if_dynamic=False + ) self.scale = torch.rand(1, dtype=torch.float32) # Create a weight that is compatible with torch._scaled_mm, # which expects a column-major layout. 
- self.w = torch.rand(hidden_size, - intermediate_size).to(dtype=FP8_DTYPE).t() + self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t() self.wscale = torch.rand(1, dtype=torch.float32) def forward(self, hidden_states, residual): """ Forward pass implementing the operations in the FX graph - + Args: hidden_states: Input tensor residual: Residual tensor from previous layer - + Returns: Tuple containing the output tensor """ # Reshape input view = hidden_states.reshape(-1, self.hidden_size) - #matrix multiplication + # matrix multiplication permute = self.gate_proj.permute(1, 0) mm = torch.mm(view, permute) @@ -140,45 +143,51 @@ def forward(self, hidden_states, residual): # for static input quantization # self.fp8_linear is initialized with use_per_token_if_dynamic=False - fp8_linear_result = self.fp8_linear.apply(norm_output, - self.w, - self.wscale, - input_scale=self.scale.to( - norm_output.device)) + fp8_linear_result = self.fp8_linear.apply( + norm_output, + self.w, + self.wscale, + input_scale=self.scale.to(norm_output.device), + ) return fp8_linear_result, residual_output def ops_in_model_before(self): - ops_to_remove = [torch.ops.vllm.all_reduce.default - ] # Always removed by SP + ops_to_remove = [torch.ops.vllm.all_reduce.default] # Always removed by SP # The following are only removed if fusion happens - if self.vllm_config and self.vllm_config.compilation_config \ - .pass_config.enable_fusion: - ops_to_remove.extend([ - torch.ops._C.fused_add_rms_norm.default, - torch.ops._C.static_scaled_fp8_quant.default, - ]) + if ( + self.vllm_config + and self.vllm_config.compilation_config.pass_config.enable_fusion + ): + ops_to_remove.extend( + [ + torch.ops._C.fused_add_rms_norm.default, + torch.ops._C.static_scaled_fp8_quant.default, + ] + ) return ops_to_remove def ops_in_model_after(self): ops_to_add = [ torch.ops.vllm.reduce_scatter.default, - torch.ops.vllm.all_gather.default + torch.ops.vllm.all_gather.default, ] # The following is only added if fusion happens - if self.vllm_config and self.vllm_config.compilation_config \ - .pass_config.enable_fusion: - ops_to_add.append( - torch.ops._C.fused_add_rms_norm_static_fp8_quant.default) + if ( + self.vllm_config + and self.vllm_config.compilation_config.pass_config.enable_fusion + ): + ops_to_add.append(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default) return ops_to_add def ops_in_model(self): - if self.vllm_config and self.vllm_config.compilation_config \ - .pass_config.enable_fusion: + if ( + self.vllm_config + and self.vllm_config.compilation_config.pass_config.enable_fusion + ): # If fusion happens, the fused op is the one # we check for (de)functionalization - return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default - ] # noqa: E501 + return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default] # noqa: E501 else: # If no fusion, the original ops are checked return [ @@ -195,30 +204,47 @@ def ops_in_model(self): @pytest.mark.parametrize("hidden_size", [16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("enable_fusion", [True, False]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], - reason="Only test on CUDA") -def test_sequence_parallelism_pass(test_model_cls: type[torch.nn.Module], - batch_size: int, seq_len: int, - hidden_size: int, dtype: torch.dtype, - enable_fusion: bool): +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") +def test_sequence_parallelism_pass( + test_model_cls: 
type[torch.nn.Module], + batch_size: int, + seq_len: int, + hidden_size: int, + dtype: torch.dtype, + enable_fusion: bool, +): num_processes = 2 def run_torch_spawn(fn, nprocs): # need to use torch.mp.spawn otherwise will have problems with # torch.distributed and cuda - torch.multiprocessing.spawn(fn, - args=(num_processes, test_model_cls, - batch_size, seq_len, hidden_size, - dtype, enable_fusion), - nprocs=nprocs) + torch.multiprocessing.spawn( + fn, + args=( + num_processes, + test_model_cls, + batch_size, + seq_len, + hidden_size, + dtype, + enable_fusion, + ), + nprocs=nprocs, + ) run_torch_spawn(sequence_parallelism_pass_on_test_model, num_processes) def sequence_parallelism_pass_on_test_model( - local_rank: int, world_size: int, - test_model_cls: type[torch.nn.Module], batch_size: int, seq_len: int, - hidden_size: int, dtype: torch.dtype, enable_fusion: bool): + local_rank: int, + world_size: int, + test_model_cls: type[torch.nn.Module], + batch_size: int, + seq_len: int, + hidden_size: int, + dtype: torch.dtype, + enable_fusion: bool, +): current_platform.seed_everything(0) device = torch.device(f"cuda:{local_rank}") @@ -226,13 +252,15 @@ def sequence_parallelism_pass_on_test_model( torch.set_default_device(device) torch.set_default_dtype(dtype) - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': '12345', - }) + update_environment_variables( + { + "RANK": str(local_rank), + "LOCAL_RANK": str(local_rank), + "WORLD_SIZE": str(world_size), + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12345", + } + ) # initialize distributed init_distributed_environment() @@ -240,22 +268,27 @@ def sequence_parallelism_pass_on_test_model( # configure vllm config for SequenceParallelismPass vllm_config = VllmConfig() - vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig( - enable_sequence_parallelism=True, - enable_fusion=enable_fusion, - enable_noop=True)) # NoOp needed for fusion + vllm_config.compilation_config = CompilationConfig( + pass_config=PassConfig( + enable_sequence_parallelism=True, + enable_fusion=enable_fusion, + enable_noop=True, + ) + ) # NoOp needed for fusion vllm_config.device_config = DeviceConfig(device=torch.device("cuda")) # this is a fake model name to construct the model config # in the vllm_config, it's not really used. 
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" - vllm_config.model_config = ModelConfig(model=model_name, - task="auto", - tokenizer=model_name, - tokenizer_mode="auto", - trust_remote_code=True, - dtype=dtype, - seed=42) + vllm_config.model_config = ModelConfig( + model=model_name, + task="auto", + tokenizer=model_name, + tokenizer_mode="auto", + trust_remote_code=True, + dtype=dtype, + seed=42, + ) sequence_parallelism_pass = SequenceParallelismPass(vllm_config) noop_pass = NoOpEliminationPass(vllm_config) @@ -270,12 +303,9 @@ def sequence_parallelism_pass_on_test_model( backend_no_func = TestBackend(*passes_for_backend) backend_func = TestBackend(*passes_for_backend, func_pass) - model = test_model_cls(hidden_size, - hidden_size * 2, - vllm_config=vllm_config) + model = test_model_cls(hidden_size, hidden_size * 2, vllm_config=vllm_config) - hidden_states = torch.randn((batch_size * seq_len, hidden_size), - dtype=dtype) + hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype) residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype) compiled_model_no_func = torch.compile(model, backend=backend_no_func) @@ -294,8 +324,7 @@ def sequence_parallelism_pass_on_test_model( # check if the functionalization pass is applied for op in model.ops_in_model(): find_auto_fn(backend_no_func.graph_post_pass.nodes, op) - assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, - op) is None # noqa: E501 + assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501 # make sure the ops were all de-functionalized found = dict() diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 5351a3cf35ba..fa2446beb327 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -9,27 +9,28 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.config import CompilationConfig, PassConfig, VllmConfig from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - GroupShape) +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - CUTLASS_FP8_SUPPORTED, Fp8LinearOp) + CUTLASS_FP8_SUPPORTED, + Fp8LinearOp, +) from vllm.platforms import current_platform from .backend import TestBackend class TestModel(torch.nn.Module): - - def __init__(self, hidden_size: int, cutlass_fp8_enabled: bool, *args, - **kwargs): + def __init__(self, hidden_size: int, cutlass_fp8_enabled: bool, *args, **kwargs): super().__init__(*args, **kwargs) self.silu_and_mul = SiluAndMul() self.wscale = torch.rand(1, dtype=torch.float32) self.scale = torch.rand(1, dtype=torch.float32) - self.w = (torch.rand( - hidden_size, - hidden_size).to(dtype=current_platform.fp8_dtype()).t()) + self.w = ( + torch.rand(hidden_size, hidden_size) + .to(dtype=current_platform.fp8_dtype()) + .t() + ) self.fp8_linear = Fp8LinearOp( cutlass_fp8_supported=cutlass_fp8_enabled, @@ -39,28 +40,27 @@ def __init__(self, hidden_size: int, cutlass_fp8_enabled: bool, *args, def forward(self, x): y = self.silu_and_mul(x) - x2 = self.fp8_linear.apply(y, - self.w, - self.wscale, - input_scale=self.wscale) + x2 = self.fp8_linear.apply(y, self.w, self.wscale, input_scale=self.wscale) return x2 @pytest.mark.parametrize("num_tokens", [256]) @pytest.mark.parametrize("hidden_size", [64]) 
-@pytest.mark.parametrize("cutlass_fp8_enabled", - [True, False] if CUTLASS_FP8_SUPPORTED else [False]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], - reason="Only test on CUDA and ROCm") -def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, - cutlass_fp8_enabled): +@pytest.mark.parametrize( + "cutlass_fp8_enabled", [True, False] if CUTLASS_FP8_SUPPORTED else [False] +) +@pytest.mark.skipif( + envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm" +) +def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, cutlass_fp8_enabled): torch.set_default_device("cuda") torch.set_default_dtype(torch.float16) # Reshape pass is needed for the fusion pass to work config = VllmConfig() config.compilation_config = CompilationConfig( - pass_config=PassConfig(enable_fusion=True, enable_noop=True)) + pass_config=PassConfig(enable_fusion=True, enable_noop=True) + ) fusion_pass = ActivationQuantFusionPass(config) backend = TestBackend(NoOpEliminationPass(config), fusion_pass) @@ -76,10 +76,12 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, result2 = model2(x) # Check that it gives the same answer - torch.testing.assert_close(result[0].to(dtype=torch.float16), - result2[0].to(dtype=torch.float16), - atol=1e-3, - rtol=1e-3) + torch.testing.assert_close( + result[0].to(dtype=torch.float16), + result2[0].to(dtype=torch.float16), + atol=1e-3, + rtol=1e-3, + ) # Check substitution worked pre_nodes = backend.graph_pre_pass.nodes diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 5e39f6821d16..34db5a999cbd 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -10,7 +10,6 @@ class MyMod(torch.nn.Module): - def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): if cache is not None: return x + cache @@ -18,12 +17,12 @@ def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): class MyWrapper(TorchCompileWrapperWithCustomDispatcher): - def __init__(self, model): self.model = model compiled_callable = torch.compile(self.forward, backend="eager") - super().__init__(compiled_callable, - compilation_level=CompilationLevel.DYNAMO_ONCE) + super().__init__( + compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE + ) def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): # this is the function to be compiled @@ -54,10 +53,8 @@ def test_torch_compile_wrapper(): # for new input, dispatch to the compiled code directly new_x = torch.tensor([3]) - assert wrapper(new_x, - None).item() == 6 # dispatch to the first compiled code - assert wrapper( - new_x, cache).item() == 5 # dispatch to the second compiled code + assert wrapper(new_x, None).item() == 6 # dispatch to the first compiled code + assert wrapper(new_x, cache).item() == 5 # dispatch to the second compiled code for wrapper in wrappers: # make sure they have independent compiled codes diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py index 024e81fccc5f..cff998d14817 100644 --- a/tests/config/test_config_generation.py +++ b/tests/config/test_config_generation.py @@ -14,8 +14,9 @@ def test_cuda_empty_vs_unset_configs(monkeypatch: pytest.MonkeyPatch): """ def create_config(): - engine_args = EngineArgs(model="deepseek-ai/DeepSeek-V2-Lite", - trust_remote_code=True) + engine_args = EngineArgs( + model="deepseek-ai/DeepSeek-V2-Lite", trust_remote_code=True + ) return engine_args.create_engine_config() # Create config with 
CUDA_VISIBLE_DEVICES set normally @@ -34,5 +35,6 @@ def create_config(): empty_config_dict.pop("instance_id", None) assert deep_compare(normal_config_dict, empty_config_dict), ( - "Configs with normal CUDA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES=\"\"" - " should be equivalent") + 'Configs with normal CUDA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES=""' + " should be equivalent" + ) diff --git a/tests/config/test_mp_reducer.py b/tests/config/test_mp_reducer.py index ee351cbfa7c1..5a2e784ac7f4 100644 --- a/tests/config/test_mp_reducer.py +++ b/tests/config/test_mp_reducer.py @@ -16,13 +16,13 @@ def test_mp_reducer(monkeypatch): """ # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value - monkeypatch.setenv('VLLM_USE_V1', '1') + monkeypatch.setenv("VLLM_USE_V1", "1") # Ensure transformers_modules is not in sys.modules - if 'transformers_modules' in sys.modules: - del sys.modules['transformers_modules'] + if "transformers_modules" in sys.modules: + del sys.modules["transformers_modules"] - with patch('multiprocessing.reducer.register') as mock_register: + with patch("multiprocessing.reducer.register") as mock_register: engine_args = AsyncEngineArgs( model="facebook/opt-125m", max_model_len=32, @@ -37,7 +37,8 @@ def test_mp_reducer(monkeypatch): ) assert mock_register.called, ( - "multiprocessing.reducer.register should have been called") + "multiprocessing.reducer.register should have been called" + ) vllm_config_registered = False for call_args in mock_register.call_args_list: @@ -46,8 +47,7 @@ def test_mp_reducer(monkeypatch): vllm_config_registered = True reducer_func = call_args[0][1] - assert callable( - reducer_func), "Reducer function should be callable" + assert callable(reducer_func), "Reducer function should be callable" break assert vllm_config_registered, ( diff --git a/tests/conftest.py b/tests/conftest.py index f3524d1fe2a6..2ebcae46f656 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,23 +13,33 @@ import torch.nn.functional as F from huggingface_hub import snapshot_download from PIL import Image -from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, - BatchEncoding, BatchFeature) +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + BatchEncoding, + BatchFeature, +) from transformers.models.auto.auto_factory import _BaseAutoModelClass -from tests.models.utils import (TokensTextLogprobs, - TokensTextLogprobsPromptLogprobs) +from tests.models.utils import TokensTextLogprobs, TokensTextLogprobsPromptLogprobs from vllm import LLM, SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.config import TaskOption, _get_and_verify_dtype from vllm.connections import global_http_connection -from vllm.distributed import (cleanup_dist_env_and_memory, - init_distributed_environment, - initialize_model_parallel) -from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - to_enc_dec_tuple_list, zip_enc_dec_prompts) +from vllm.distributed import ( + cleanup_dist_env_and_memory, + init_distributed_environment, + initialize_model_parallel, +) +from vllm.inputs import ( + ExplicitEncoderDecoderPrompt, + TextPrompt, + to_enc_dec_tuple_list, + zip_enc_dec_prompts, +) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams @@ -63,12 +73,13 @@ class ImageAssetPrompts(TypedDict): class ImageTestAssets(list[ImageAsset]): - def __init__(self) -> None: - 
super().__init__([ - ImageAsset("stop_sign"), - ImageAsset("cherry_blossom"), - ]) + super().__init__( + [ + ImageAsset("stop_sign"), + ImageAsset("cherry_blossom"), + ] + ) def prompts(self, prompts: ImageAssetPrompts) -> list[str]: """ @@ -85,11 +96,12 @@ class VideoAssetPrompts(TypedDict): class VideoTestAssets(list[VideoAsset]): - def __init__(self) -> None: - super().__init__([ - VideoAsset("baby_reading"), - ]) + super().__init__( + [ + VideoAsset("baby_reading"), + ] + ) def prompts(self, prompts: VideoAssetPrompts) -> list[str]: return [prompts["baby_reading"]] @@ -101,12 +113,13 @@ class AudioAssetPrompts(TypedDict): class AudioTestAssets(list[AudioAsset]): - def __init__(self) -> None: - super().__init__([ - AudioAsset("mary_had_lamb"), - AudioAsset("winning_call"), - ]) + super().__init__( + [ + AudioAsset("mary_had_lamb"), + AudioAsset("winning_call"), + ] + ) def prompts(self, prompts: AudioAssetPrompts) -> list[str]: return [prompts["mary_had_lamb"], prompts["winning_call"]] @@ -151,11 +164,11 @@ def run_with_both_engines(request, monkeypatch): if use_v1: if skip_v1: pytest.skip("Skipping test on vllm V1") - monkeypatch.setenv('VLLM_USE_V1', '1') + monkeypatch.setenv("VLLM_USE_V1", "1") else: if skip_v0: pytest.skip("Skipping test on vllm V0") - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") yield @@ -221,15 +234,17 @@ def example_system_message() -> str: class DecoderPromptType(Enum): """For encoder/decoder models only.""" + CUSTOM = 1 NONE = 2 EMPTY_STR = 3 @pytest.fixture -def example_encoder_decoder_prompts( -) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]: - ''' +def example_encoder_decoder_prompts() -> dict[ + DecoderPromptType, list[ExplicitEncoderDecoderPrompt] +]: + """ Returns an encoder prompt list and a decoder prompt list, wherein each pair of same-index entries in both lists corresponds to an (encoder prompt, decoder prompt) tuple. 
@@ -238,7 +253,7 @@ def example_encoder_decoder_prompts( * Encoder prompt list * Decoder prompt list (reverse of encoder prompt list) - ''' + """ encoder_prompts = [] for filename in _TEST_PROMPTS: @@ -250,12 +265,15 @@ def example_encoder_decoder_prompts( # NONE decoder prompt type return { - DecoderPromptType.NONE: - zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts), - DecoderPromptType.EMPTY_STR: - zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts), - DecoderPromptType.CUSTOM: - zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts), + DecoderPromptType.NONE: zip_enc_dec_prompts( + encoder_prompts, none_decoder_prompts + ), + DecoderPromptType.EMPTY_STR: zip_enc_dec_prompts( + encoder_prompts, empty_str_decoder_prompts + ), + DecoderPromptType.CUSTOM: zip_enc_dec_prompts( + encoder_prompts, custom_decoder_prompts + ), } @@ -287,15 +305,13 @@ def audio_assets() -> AudioTestAssets: class HfRunner: - def get_default_device(self): from vllm.platforms import current_platform - return ("cpu" - if current_platform.is_cpu() else current_platform.device_type) + return "cpu" if current_platform.is_cpu() else current_platform.device_type def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: - if x is None or isinstance(x, (bool, )): + if x is None or isinstance(x, (bool,)): return x if device is None: @@ -367,14 +383,15 @@ def __init__( ) # in case some unquantized custom models are not in same dtype - if (getattr(model, "quantization_method", None) is None - and any(p.dtype != self.dtype - for p in model.parameters())): + if getattr(model, "quantization_method", None) is None and any( + p.dtype != self.dtype for p in model.parameters() + ): model = model.to(dtype=self.dtype) - if (getattr(model, "quantization_method", None) != "bitsandbytes" - and len({p.device - for p in model.parameters()}) < 2): + if ( + getattr(model, "quantization_method", None) != "bitsandbytes" + and len({p.device for p in model.parameters()}) < 2 + ): model = model.to(device=self.device) self.model = model @@ -389,6 +406,7 @@ def __init__( # don't put this import at the top level # it will call torch.cuda.device_count() from transformers import AutoProcessor # noqa: F401 + self.processor = AutoProcessor.from_pretrained( model_name, torch_dtype=torch_dtype, @@ -469,10 +487,9 @@ def generate( audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> list[tuple[list[list[int]], list[str]]]: - all_inputs = self.get_inputs(prompts, - images=images, - videos=videos, - audios=audios) + all_inputs = self.get_inputs( + prompts, images=images, videos=videos, audios=audios + ) outputs: list[tuple[list[list[int]], list[str]]] = [] for inputs in all_inputs: @@ -499,16 +516,17 @@ def generate_greedy( audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> list[tuple[list[int], str]]: - outputs = self.generate(prompts, - do_sample=False, - max_new_tokens=max_tokens, - images=images, - videos=videos, - audios=audios, - **kwargs) + outputs = self.generate( + prompts, + do_sample=False, + max_new_tokens=max_tokens, + images=images, + videos=videos, + audios=audios, + **kwargs, + ) - return [(output_ids[0], output_str[0]) - for output_ids, output_str in outputs] + return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] def generate_beam_search( self, @@ -519,21 +537,22 @@ def generate_beam_search( videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, ) -> list[tuple[list[list[int]], list[str]]]: - outputs = self.generate(prompts, - 
do_sample=False, - max_new_tokens=max_tokens, - num_beams=beam_width, - num_return_sequences=beam_width, - images=images, - videos=videos, - audios=audios) + outputs = self.generate( + prompts, + do_sample=False, + max_new_tokens=max_tokens, + num_beams=beam_width, + num_return_sequences=beam_width, + images=images, + videos=videos, + audios=audios, + ) for i in range(len(outputs)): output_ids, output_str = outputs[i] for j in range(len(output_ids)): output_ids[j] = [ - x for x in output_ids[j] - if x != self.tokenizer.pad_token_id + x for x in output_ids[j] if x != self.tokenizer.pad_token_id ] outputs[i] = (output_ids, output_str) return outputs @@ -547,10 +566,9 @@ def generate_greedy_logprobs( audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> list[list[torch.Tensor]]: - all_inputs = self.get_inputs(prompts, - images=images, - videos=videos, - audios=audios) + all_inputs = self.get_inputs( + prompts, images=images, videos=videos, audios=audios + ) all_logprobs: list[list[torch.Tensor]] = [] for inputs in all_inputs: @@ -563,8 +581,7 @@ def generate_greedy_logprobs( return_dict_in_generate=True, **kwargs, ) - seq_logprobs = self._hidden_states_to_seq_logprobs( - output.hidden_states) + seq_logprobs = self._hidden_states_to_seq_logprobs(output.hidden_states) all_logprobs.append(seq_logprobs) return all_logprobs @@ -628,10 +645,9 @@ def generate_greedy_logprobs_limit( videos: Optional[PromptVideoInput] = None, **kwargs: Any, ) -> list[TokensTextLogprobs]: - all_inputs = self.get_inputs(prompts, - images=images, - videos=videos, - audios=audios) + all_inputs = self.get_inputs( + prompts, images=images, videos=videos, audios=audios + ) all_logprobs: list[list[dict[int, float]]] = [] all_output_ids: list[list[int]] = [] @@ -651,8 +667,7 @@ def generate_greedy_logprobs_limit( ( seq_logprobs_lst, output_len, - ) = self._hidden_states_to_logprobs(output.hidden_states, - num_logprobs) + ) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs) all_logprobs.append(seq_logprobs_lst) seq_ids = output.sequences[0] @@ -662,8 +677,10 @@ def generate_greedy_logprobs_limit( all_output_strs.append(self.tokenizer.decode(output_ids)) outputs = zip(all_output_ids, all_output_strs, all_logprobs) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] + return [ + (output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs + ] def generate_encoder_decoder_greedy_logprobs_limit( self, @@ -673,16 +690,17 @@ def generate_encoder_decoder_greedy_logprobs_limit( images: Optional[PromptImageInput] = None, **kwargs: Any, ) -> list[TokensTextLogprobs]: - ''' + """ Greedy logprobs generation for vLLM encoder/decoder models - ''' + """ all_logprobs: list[list[dict[int, float]]] = [] all_output_ids: list[list[int]] = [] all_output_strs: list[str] = [] for i, (encoder_prompt, decoder_prompt) in enumerate( - to_enc_dec_tuple_list(encoder_decoder_prompts)): + to_enc_dec_tuple_list(encoder_decoder_prompts) + ): processor_kwargs: dict[str, Any] = { "text": encoder_prompt, "return_tensors": "pt", @@ -696,8 +714,7 @@ def generate_encoder_decoder_greedy_logprobs_limit( if decoder_prompt is None: decoder_input_ids = None else: - decoder_inputs = self.tokenizer(decoder_prompt, - return_tensors="pt") + decoder_inputs = self.tokenizer(decoder_prompt, return_tensors="pt") decoder_input_ids = self.wrap_device(decoder_inputs.input_ids) output = self.model.generate( @@ -714,8 +731,9 @@ def 
generate_encoder_decoder_greedy_logprobs_limit( ( seq_logprobs_lst, output_len, - ) = self._hidden_states_to_logprobs(output.decoder_hidden_states, - num_logprobs) + ) = self._hidden_states_to_logprobs( + output.decoder_hidden_states, num_logprobs + ) all_logprobs.append(seq_logprobs_lst) seq_ids = output.sequences[0] @@ -724,19 +742,16 @@ def generate_encoder_decoder_greedy_logprobs_limit( all_output_strs.append(self.tokenizer.decode(output_ids)) outputs = zip(all_output_ids, all_output_strs, all_logprobs) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] + return [ + (output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs + ] - def encode(self, prompts: list[str], *args, - **kwargs) -> list[list[torch.Tensor]]: + def encode(self, prompts: list[str], *args, **kwargs) -> list[list[torch.Tensor]]: return self.model.encode(prompts, *args, **kwargs) - def predict(self, prompts: list[list[str]], *args, - **kwargs) -> torch.Tensor: - return self.model.predict(prompts, - *args, - convert_to_tensor=True, - **kwargs) + def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor: + return self.model.predict(prompts, *args, convert_to_tensor=True, **kwargs) def __enter__(self): return self @@ -809,12 +824,12 @@ def get_inputs( videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, ) -> list[TextPrompt]: - - if any(x is not None and len(x) != len(prompts) - for x in [images, videos, audios]): + if any( + x is not None and len(x) != len(prompts) for x in [images, videos, audios] + ): raise ValueError( - "All non-None multimodal inputs must have the same length as " - "prompts") + "All non-None multimodal inputs must have the same length as prompts" + ) inputs = [] for i, prompt in enumerate(prompts): @@ -849,14 +864,11 @@ def generate( audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> list[tuple[list[list[int]], list[str]]]: - inputs = self.get_inputs(prompts, - images=images, - videos=videos, - audios=audios) + inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) + req_outputs = self.model.generate( + inputs, sampling_params=sampling_params, **kwargs + ) outputs: list[tuple[list[list[int]], list[str]]] = [] for req_output in req_outputs: @@ -883,8 +895,9 @@ def _final_steps_generate_w_logprobs( output_str = sample.text output_ids = list(sample.token_ids) output_logprobs = sample.logprobs - outputs.append((output_ids, output_str, output_logprobs, - req_output.prompt_logprobs)) + outputs.append( + (output_ids, output_str, output_logprobs, req_output.prompt_logprobs) + ) return outputs def generate_w_logprobs( @@ -895,43 +908,45 @@ def generate_w_logprobs( audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, **kwargs: Any, - ) -> Union[list[TokensTextLogprobs], - list[TokensTextLogprobsPromptLogprobs]]: - inputs = self.get_inputs(prompts, - images=images, - videos=videos, - audios=audios) - - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) - - toks_str_logsprobs_prompt_logprobs = ( - self._final_steps_generate_w_logprobs(req_outputs)) + ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: + inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) + + req_outputs = self.model.generate( + inputs, 
sampling_params=sampling_params, **kwargs + ) + + toks_str_logsprobs_prompt_logprobs = self._final_steps_generate_w_logprobs( + req_outputs + ) # Omit prompt logprobs if not required by sampling params - return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] - if sampling_params.prompt_logprobs is None else - toks_str_logsprobs_prompt_logprobs) + return ( + [x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None + else toks_str_logsprobs_prompt_logprobs + ) def generate_encoder_decoder_w_logprobs( self, encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], sampling_params: SamplingParams, - ) -> Union[list[TokensTextLogprobs], - list[TokensTextLogprobsPromptLogprobs]]: - ''' + ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: + """ Logprobs generation for vLLM encoder/decoder models - ''' + """ assert sampling_params.logprobs is not None - req_outputs = self.model.generate(encoder_decoder_prompts, - sampling_params=sampling_params) - toks_str_logsprobs_prompt_logprobs = ( - self._final_steps_generate_w_logprobs(req_outputs)) + req_outputs = self.model.generate( + encoder_decoder_prompts, sampling_params=sampling_params + ) + toks_str_logsprobs_prompt_logprobs = self._final_steps_generate_w_logprobs( + req_outputs + ) # Omit prompt logprobs if not required by sampling params - return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] - if sampling_params.prompt_logprobs is None else - toks_str_logsprobs_prompt_logprobs) + return ( + [x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None + else toks_str_logsprobs_prompt_logprobs + ) def generate_greedy( self, @@ -943,14 +958,15 @@ def generate_greedy( **kwargs: Any, ) -> list[tuple[list[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs = self.generate(prompts, - greedy_params, - images=images, - videos=videos, - audios=audios, - **kwargs) - return [(output_ids[0], output_str[0]) - for output_ids, output_str in outputs] + outputs = self.generate( + prompts, + greedy_params, + images=images, + videos=videos, + audios=audios, + **kwargs, + ) + return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] def generate_greedy_logprobs( self, @@ -964,22 +980,24 @@ def generate_greedy_logprobs( stop_token_ids: Optional[list[int]] = None, stop: Optional[list[str]] = None, **kwargs: Any, - ) -> Union[list[TokensTextLogprobs], - list[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, logprobs=num_logprobs, prompt_logprobs=num_prompt_logprobs, stop_token_ids=stop_token_ids, - stop=stop) + stop=stop, + ) - return self.generate_w_logprobs(prompts, - greedy_logprobs_params, - images=images, - audios=audios, - videos=videos, - **kwargs) + return self.generate_w_logprobs( + prompts, + greedy_logprobs_params, + images=images, + audios=audios, + videos=videos, + **kwargs, + ) def generate_encoder_decoder_greedy_logprobs( self, @@ -988,8 +1006,7 @@ def generate_encoder_decoder_greedy_logprobs( num_logprobs: int, num_prompt_logprobs: Optional[int] = None, skip_special_tokens: bool = True, - ) -> Union[list[TokensTextLogprobs], - list[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( temperature=0.0, 
max_tokens=max_tokens, @@ -997,12 +1014,13 @@ def generate_encoder_decoder_greedy_logprobs( prompt_logprobs=(num_prompt_logprobs), skip_special_tokens=skip_special_tokens, ) - ''' + """ Greedy logprobs generation for vLLM encoder/decoder models - ''' + """ return self.generate_encoder_decoder_w_logprobs( - encoder_decoder_prompts, greedy_logprobs_params) + encoder_decoder_prompts, greedy_logprobs_params + ) def generate_beam_search( self, @@ -1013,14 +1031,11 @@ def generate_beam_search( videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, ) -> list[tuple[list[list[int]], list[str]]]: - inputs = self.get_inputs(prompts, - images=images, - videos=videos, - audios=audios) + inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) outputs = self.model.beam_search( - inputs, - BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) + inputs, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens) + ) returned_outputs = [] for output in outputs: token_ids = [x.tokens for x in output.sequences] @@ -1032,17 +1047,16 @@ def classify(self, prompts: list[str]) -> list[list[float]]: req_outputs = self.model.classify(prompts) return [req_output.outputs.probs for req_output in req_outputs] - def embed(self, - prompts: list[str], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, - *args, - **kwargs) -> list[list[float]]: - inputs = self.get_inputs(prompts, - images=images, - videos=videos, - audios=audios) + def embed( + self, + prompts: list[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + *args, + **kwargs, + ) -> list[list[float]]: + inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) req_outputs = self.model.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] @@ -1081,6 +1095,7 @@ def vllm_runner(): @pytest.fixture() def temporary_enable_log_propagate(): import logging + logger = logging.getLogger("vllm") logger.propagate = True yield @@ -1100,6 +1115,7 @@ def num_gpus_available(): in current process.""" from vllm.platforms import current_platform + return current_platform.device_count() @@ -1113,12 +1129,11 @@ def num_gpus_available(): def dummy_opt_path(): json_path = os.path.join(_dummy_opt_path, "config.json") if not os.path.exists(_dummy_opt_path): - snapshot_download(repo_id="facebook/opt-125m", - local_dir=_dummy_opt_path, - ignore_patterns=[ - "*.bin", "*.bin.index.json", "*.pt", "*.h5", - "*.msgpack" - ]) + snapshot_download( + repo_id="facebook/opt-125m", + local_dir=_dummy_opt_path, + ignore_patterns=["*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"], + ) assert os.path.exists(json_path) with open(json_path) as f: config = json.load(f) @@ -1132,12 +1147,11 @@ def dummy_opt_path(): def dummy_llava_path(): json_path = os.path.join(_dummy_llava_path, "config.json") if not os.path.exists(_dummy_llava_path): - snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf", - local_dir=_dummy_llava_path, - ignore_patterns=[ - "*.bin", "*.bin.index.json", "*.pt", "*.h5", - "*.msgpack" - ]) + snapshot_download( + repo_id="llava-hf/llava-1.5-7b-hf", + local_dir=_dummy_llava_path, + ignore_patterns=["*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"], + ) assert os.path.exists(json_path) with open(json_path) as f: config = json.load(f) @@ -1151,12 +1165,11 @@ def 
dummy_llava_path(): def dummy_gemma2_embedding_path(): json_path = os.path.join(_dummy_gemma2_embedding_path, "config.json") if not os.path.exists(_dummy_gemma2_embedding_path): - snapshot_download(repo_id="BAAI/bge-multilingual-gemma2", - local_dir=_dummy_gemma2_embedding_path, - ignore_patterns=[ - "*.bin", "*.bin.index.json", "*.pt", "*.h5", - "*.msgpack" - ]) + snapshot_download( + repo_id="BAAI/bge-multilingual-gemma2", + local_dir=_dummy_gemma2_embedding_path, + ignore_patterns=["*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"], + ) assert os.path.exists(json_path) with open(json_path) as f: config = json.load(f) @@ -1169,10 +1182,9 @@ def dummy_gemma2_embedding_path(): # Add the flag `--optional` to allow run tests # that are marked with @pytest.mark.optional def pytest_addoption(parser): - parser.addoption("--optional", - action="store_true", - default=False, - help="run optional test") + parser.addoption( + "--optional", action="store_true", default=False, help="run optional test" + ) def pytest_collection_modifyitems(config, items): diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index e2c6c66b259c..c6e9bf88e71e 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -12,21 +12,26 @@ @pytest.fixture -def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, seed): - return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, seed) +def baseline_llm_generator( + common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, seed +): + return create_llm_generator( + common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, seed + ) @pytest.fixture -def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - test_llm_kwargs, seed): - return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - test_llm_kwargs, seed) +def test_llm_generator( + common_llm_kwargs, per_test_common_llm_kwargs, test_llm_kwargs, seed +): + return create_llm_generator( + common_llm_kwargs, per_test_common_llm_kwargs, test_llm_kwargs, seed + ) -def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - distinct_llm_kwargs, seed): +def create_llm_generator( + common_llm_kwargs, per_test_common_llm_kwargs, distinct_llm_kwargs, seed +): kwargs = { **common_llm_kwargs, **per_test_common_llm_kwargs, @@ -47,11 +52,12 @@ def generator_inner(): del llm -def get_text_from_llm_generator(llm_generator: Iterable[LLM], - prompts, - sampling_params, - llm_cb: Optional[Callable[[LLM], - None]] = None): +def get_text_from_llm_generator( + llm_generator: Iterable[LLM], + prompts, + sampling_params, + llm_cb: Optional[Callable[[LLM], None]] = None, +): for llm in llm_generator: if llm_cb: llm_cb(llm) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 93222b564ebe..8c25c06e78da 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -12,28 +12,28 @@ @pytest.mark.parametrize( "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Allow only 5 sequences of ~1024 tokens in worst case. - "block_size": 16, - "num_gpu_blocks_override": 5 * (64 + 1), - }]) + [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + # skip cuda graph creation for fast test. 
+ "enforce_eager": True, + # Allow only 5 sequences of ~1024 tokens in worst case. + "block_size": 16, + "num_gpu_blocks_override": 5 * (64 + 1), + } + ], +) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "preemption_mode": "swap" -}, { - "preemption_mode": "recompute" -}]) +@pytest.mark.parametrize( + "test_llm_kwargs", [{"preemption_mode": "swap"}, {"preemption_mode": "recompute"}] +) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) -def test_block_manager_with_preemption(baseline_llm_generator, - test_llm_generator, batch_size): +def test_block_manager_with_preemption( + baseline_llm_generator, test_llm_generator, batch_size +): """Verify block manager produces same outputs even when there is preemption. This constructs two LLM, each with limited number of GPU blocks. The limit @@ -47,8 +47,8 @@ def test_block_manager_with_preemption(baseline_llm_generator, KV mapping has time to build up error. NOTE(Kuntai): Though we have removed block manager v1, this test is still - useful as it asserts the behavior of block manager v2 (now it is called - SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we + useful as it asserts the behavior of block manager v2 (now it is called + SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we keep this test. """ output_len = 1024 @@ -74,13 +74,14 @@ def test_block_manager_with_preemption(baseline_llm_generator, ) baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) + baseline_llm_generator, prompts, sampling_params + ) - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) + test_token_ids = get_token_ids_from_llm_generator( + test_llm_generator, prompts, sampling_params + ) - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, test_token_ids): assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids @@ -88,38 +89,43 @@ def test_block_manager_with_preemption(baseline_llm_generator, @pytest.mark.parametrize( "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # Our prompts will generate 128 tokens; since the prompts themselves are - # small, we don't need much KV space beyond 128. - "max_model_len": 160, - - # skip cuda graph creation for fast test. - "enforce_eager": True, - }]) + [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + # Our prompts will generate 128 tokens; since the prompts themselves are + # small, we don't need much KV space beyond 128. + "max_model_len": 160, + # skip cuda graph creation for fast test. + "enforce_eager": True, + } + ], +) @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ { "block_size": 16, - # Allow only 2 sequences of ~128 tokens in worst case. # Note 8 = 128/block_size "num_gpu_blocks_override": 2 * (8 + 1), }, { "block_size": 8, - # Allow only 2 sequences of ~128 tokens in worst case. 
# Note 16 = 128/block_size "num_gpu_blocks_override": 2 * (16 + 2), + }, + ], +) +@pytest.mark.parametrize( + "baseline_llm_kwargs", + [ + { + "num_lookahead_slots": 0, } - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "num_lookahead_slots": 0, -}]) + ], +) @pytest.mark.parametrize( "test_llm_kwargs", [ @@ -132,13 +138,14 @@ def test_block_manager_with_preemption(baseline_llm_generator, { "num_lookahead_slots": 10, "preemption_mode": "recompute", - } - ]) + }, + ], +) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) -def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, - test_llm_generator, - batch_size): +def test_lookahead_greedy_equality_with_preemption( + baseline_llm_generator, test_llm_generator, batch_size +): """Verify vLLM produces the same output with greedy sampling, when lookahead scheduling is used vs. not. @@ -167,16 +174,17 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, temperature=temperature, ) - print('Getting token ids without lookahead scheduling') + print("Getting token ids without lookahead scheduling") baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) + baseline_llm_generator, prompts, sampling_params + ) - print('Getting token ids with lookahead scheduling') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) + print("Getting token ids with lookahead scheduling") + test_token_ids = get_token_ids_from_llm_generator( + test_llm_generator, prompts, sampling_params + ) - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, test_token_ids): assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids @@ -188,42 +196,55 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, { # Use a small model for a fast test. "model": "facebook/opt-125m", - # skip cuda graph creation for fast test. 
"enforce_eager": True, "enable_chunked_prefill": True, }, - ]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", - [{ - "block_size": 16, - "max_num_batched_tokens": 2, - "max_num_seqs": 2, - }, { - "block_size": 16, - "max_num_batched_tokens": 3, - "max_num_seqs": 2, - }, { - "block_size": 16, - "max_num_batched_tokens": 256, - "max_num_seqs": 10, - }]) -@pytest.mark.parametrize("baseline_llm_kwargs", [ - {}, -]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "num_lookahead_slots": 0, - }, - { - "num_lookahead_slots": 5, - }, -]) + ], +) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + "block_size": 16, + "max_num_batched_tokens": 2, + "max_num_seqs": 2, + }, + { + "block_size": 16, + "max_num_batched_tokens": 3, + "max_num_seqs": 2, + }, + { + "block_size": 16, + "max_num_batched_tokens": 256, + "max_num_seqs": 10, + }, + ], +) +@pytest.mark.parametrize( + "baseline_llm_kwargs", + [ + {}, + ], +) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "num_lookahead_slots": 0, + }, + { + "num_lookahead_slots": 5, + }, + ], +) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) -def test_chunked_prefill_block_manager(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify that chunked prefill works with SelfAttnBlockSpaceManager, +def test_chunked_prefill_block_manager( + baseline_llm_generator, test_llm_generator, batch_size +): + """Verify that chunked prefill works with SelfAttnBlockSpaceManager, with and without lookahead scheduling. """ output_len = 32 @@ -245,16 +266,17 @@ def test_chunked_prefill_block_manager(baseline_llm_generator, temperature=temperature, ) - print('Getting token ids with BlockManager') + print("Getting token ids with BlockManager") baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) + baseline_llm_generator, prompts, sampling_params + ) - print('Getting token ids with BlockManager, with lookahead slots.') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) + print("Getting token ids with BlockManager, with lookahead slots.") + test_token_ids = get_token_ids_from_llm_generator( + test_llm_generator, prompts, sampling_params + ) - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, test_token_ids): assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids @@ -262,31 +284,30 @@ def test_chunked_prefill_block_manager(baseline_llm_generator, @pytest.mark.parametrize( "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Allow only 5 sequences of ~1024 tokens in worst case. - "block_size": 16, - "num_gpu_blocks_override": 5 * (64 + 1), - - # Enable prefill cache - "enable_prefix_caching": True, - }]) + [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + # skip cuda graph creation for fast test. + "enforce_eager": True, + # Allow only 5 sequences of ~1024 tokens in worst case. 
+ "block_size": 16, + "num_gpu_blocks_override": 5 * (64 + 1), + # Enable prefill cache + "enable_prefix_caching": True, + } + ], +) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "preemption_mode": "swap" -}, { - "preemption_mode": "recompute" -}]) +@pytest.mark.parametrize( + "test_llm_kwargs", [{"preemption_mode": "swap"}, {"preemption_mode": "recompute"}] +) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) def test_block_manager_prefix_caching_enabled_with_preemption( - baseline_llm_generator, test_llm_generator, batch_size): + baseline_llm_generator, test_llm_generator, batch_size +): """Verify block manager produces same outputs even when there is preemption. This constructs two LLM, each with limited number of GPU blocks. The limit @@ -300,8 +321,8 @@ def test_block_manager_prefix_caching_enabled_with_preemption( KV mapping has time to build up error. NOTE(Kuntai): Though we have removed block manager v1, this test is still - useful as it asserts the behavior of block manager v2 (now it is called - SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we + useful as it asserts the behavior of block manager v2 (now it is called + SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we keep this test. """ output_len = 1024 @@ -326,16 +347,17 @@ def test_block_manager_prefix_caching_enabled_with_preemption( temperature=temperature, ) - print('Getting token ids from block manager') + print("Getting token ids from block manager") baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) + baseline_llm_generator, prompts, sampling_params + ) - print('Getting token ids from block manager, with preemption') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) + print("Getting token ids from block manager, with preemption") + test_token_ids = get_token_ids_from_llm_generator( + test_llm_generator, prompts, sampling_params + ) - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, test_token_ids): assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids @@ -343,32 +365,32 @@ def test_block_manager_prefix_caching_enabled_with_preemption( @pytest.mark.parametrize( "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Allow only 5 sequences of ~1024 tokens in worst case. - "block_size": 16, - "num_gpu_blocks_override": 5 * (64 + 1), - }]) + [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + # skip cuda graph creation for fast test. + "enforce_eager": True, + # Allow only 5 sequences of ~1024 tokens in worst case. 
+ "block_size": 16, + "num_gpu_blocks_override": 5 * (64 + 1), + } + ], +) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "enable_prefix_caching": False -}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "enable_prefix_caching": True, - "preemption_mode": "swap" -}, { - "enable_prefix_caching": True, - "preemption_mode": "recompute" -}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{"enable_prefix_caching": False}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + {"enable_prefix_caching": True, "preemption_mode": "swap"}, + {"enable_prefix_caching": True, "preemption_mode": "recompute"}, + ], +) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) -def test_auto_prefix_caching_with_preemption(baseline_llm_generator, - test_llm_generator, batch_size): +def test_auto_prefix_caching_with_preemption( + baseline_llm_generator, test_llm_generator, batch_size +): """Verify block manager v2 with auto prefix caching enabled produces same outputs as auto prefix caching disabled, even when there is preemption. @@ -400,16 +422,17 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, temperature=temperature, ) - print('Getting token ids with APC disabled') + print("Getting token ids with APC disabled") baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) + baseline_llm_generator, prompts, sampling_params + ) - print('Getting token ids with APC enabled') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) + print("Getting token ids with APC enabled") + test_token_ids = get_token_ids_from_llm_generator( + test_llm_generator, prompts, sampling_params + ) - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, test_token_ids): assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids @@ -417,28 +440,33 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, @pytest.mark.parametrize( "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # we keep the blocks small, so that hit eviction quickly - "max_model_len": 48, - "block_size": 16, - "num_gpu_blocks_override": 3, - }]) + [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + # skip cuda graph creation for fast test. + "enforce_eager": True, + # we keep the blocks small, so that hit eviction quickly + "max_model_len": 48, + "block_size": 16, + "num_gpu_blocks_override": 3, + } + ], +) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "enable_prefix_caching": False -}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "enable_prefix_caching": True, -}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{"enable_prefix_caching": False}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "enable_prefix_caching": True, + } + ], +) @pytest.mark.parametrize("seed", [1]) -def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator, - test_llm_generator): +def test_auto_prefix_caching_after_eviction_start( + baseline_llm_generator, test_llm_generator +): """Verify block manager v2 with auto prefix caching could works normal even when eviction started. 
With APC enabled, all blocks are held by native block at the beginning. @@ -455,7 +483,7 @@ def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator, "You are a helpful assistant. Please answer truthfully and write out " "your thinking step by step to be sure you get the right answer. You " "are helpful and harmless and you follow ethical guidelines. " - "who are you?" + "who are you?", ] sampling_params = SamplingParams( @@ -464,16 +492,17 @@ def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator, temperature=temperature, ) - print('Getting token ids with APC disabled') + print("Getting token ids with APC disabled") baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) + baseline_llm_generator, prompts, sampling_params + ) - print('Getting token ids with APC enabled') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) + print("Getting token ids with APC enabled") + test_token_ids = get_token_ids_from_llm_generator( + test_llm_generator, prompts, sampling_params + ) - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, test_token_ids): assert expected_token_ids == actual_token_ids assert baseline_token_ids == test_token_ids diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 4d67eea2264b..eed7c3387e1f 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -18,23 +18,26 @@ @pytest.mark.parametrize( "common_llm_kwargs", - [{ - "model": MODEL, - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "block_size": BLOCK_SIZE, - # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008 - "num_gpu_blocks_override": 100000 // BLOCK_SIZE, - }]) + [ + { + "model": MODEL, + # skip cuda graph creation for fast test. + "enforce_eager": True, + "block_size": BLOCK_SIZE, + # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008 + "num_gpu_blocks_override": 100000 // BLOCK_SIZE, + } + ], +) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, - batch_size, seed, backend, monkeypatch): +def test_sliding_window_retrieval( + baseline_llm_generator, test_llm_generator, batch_size, seed, backend, monkeypatch +): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then asks for value of one of them (which is outside the sliding window). 
@@ -58,16 +61,16 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, prompts, answer, indices = prep_prompts(batch_size) - baseline_texts = get_text_from_llm_generator(baseline_llm_generator, - prompts, - sampling_params, - llm_cb=check_window(prompts)) + baseline_texts = get_text_from_llm_generator( + baseline_llm_generator, prompts, sampling_params, llm_cb=check_window(prompts) + ) check_answers(indices, answer, baseline_texts) - print('Getting token ids from block manager v2') - test_texts = get_text_from_llm_generator(test_llm_generator, prompts, - sampling_params) + print("Getting token ids from block manager v2") + test_texts = get_text_from_llm_generator( + test_llm_generator, prompts, sampling_params + ) check_answers(indices, answer, test_texts) cmp = [ @@ -84,21 +87,24 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, @pytest.mark.parametrize( "common_llm_kwargs", - [{ - "model": MODEL, - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "block_size": BLOCK_SIZE, - "num_gpu_blocks_override": 100000 // BLOCK_SIZE, - }]) + [ + { + "model": MODEL, + # skip cuda graph creation for fast test. + "enforce_eager": True, + "block_size": BLOCK_SIZE, + "num_gpu_blocks_override": 100000 // BLOCK_SIZE, + } + ], +) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, - backend, monkeypatch): +def test_sliding_window_chunked_prefill( + test_llm_generator, batch_size, seed, backend, monkeypatch +): """ This is similar to test_sliding_window_retrieval, however, it doesn't compare against the v1 block manager since v1 doesn't support @@ -123,10 +129,9 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, # We don't compare with the baseline model here, since the results # slightly different due to different tailing in attention. 
- test_texts = get_text_from_llm_generator(test_llm_generator, - prompts, - sampling_params, - llm_cb=check_window(prompts)) + test_texts = get_text_from_llm_generator( + test_llm_generator, prompts, sampling_params, llm_cb=check_window(prompts) + ) check_answers(indices, answer, test_texts) @@ -148,8 +153,10 @@ def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)): for _ in range(batch_size): idx = random.randint(30, 90) indices.append(idx) - prompt = "```python\n# We set a number of variables, " + \ - f"x{idx} will be important later\n" + prompt = ( + "```python\n# We set a number of variables, " + + f"x{idx} will be important later\n" + ) ln = random.randint(*ln_range) for k in range(30, ln): v = random.randint(10, 99) @@ -162,10 +169,9 @@ def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)): return prompts, answer, indices -def check_answers(indices: list[int], - answer: list[int], - outputs: list[str], - accept_rate: float = 0.7): +def check_answers( + indices: list[int], answer: list[int], outputs: list[str], accept_rate: float = 0.7 +): answer2 = [int(text[0:2].strip()) for text in outputs] print(list(zip(indices, zip(answer, answer2)))) numok = 0 @@ -178,12 +184,12 @@ def check_answers(indices: list[int], def check_window(prompts: list[str]): - def inner(llm: LLM): sliding_window = llm.llm_engine.model_config.get_sliding_window() assert sliding_window and sliding_window > 0 assert any( len(llm.get_tokenizer().tokenize(prompt)) > sliding_window - for prompt in prompts) + for prompt in prompts + ) return inner diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py index 9eed264fd7d4..b3344bdd65c8 100644 --- a/tests/core/block/test_block_manager.py +++ b/tests/core/block/test_block_manager.py @@ -3,23 +3,29 @@ import pytest -from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) +from vllm.core.block.utils import ( + STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + STR_NOT_IMPL_ENC_DEC_SWA, +) from vllm.core.block_manager import SelfAttnBlockSpaceManager from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from ..utils import (create_dummy_prompt, create_seq_group, - create_seq_group_encoder_decoder) +from ..utils import ( + create_dummy_prompt, + create_seq_group, + create_seq_group_encoder_decoder, +) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80]) @pytest.mark.parametrize("num_seqs_per_group", [1, 4]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, - num_gpu_blocks: int, watermark: float): +def test_can_allocate_seq_group( + block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, watermark: float +): block_manager = SelfAttnBlockSpaceManager( block_size=block_size, num_gpu_blocks=num_gpu_blocks, @@ -62,10 +68,9 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, @pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160]) @pytest.mark.parametrize("num_seqs_per_group", [1, 4]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_seq_group_encoder_decoder(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): +def test_can_allocate_seq_group_encoder_decoder( + block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, watermark: float +): block_manager = SelfAttnBlockSpaceManager( 
block_size=block_size, num_gpu_blocks=num_gpu_blocks, @@ -82,7 +87,8 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_output_blocks = num_output_blocks_per_seq for bdx, num_prompt_blocks in enumerate( - range(1, num_gpu_blocks - num_output_blocks)): + range(1, num_gpu_blocks - num_output_blocks) + ): num_cross_blocks_per_seq = num_prompt_blocks seq_group = create_seq_group_encoder_decoder( @@ -91,15 +97,16 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, block_size * num_output_blocks_per_seq for _ in range(num_seqs_per_group) ], - request_id=str(bdx)) + request_id=str(bdx), + ) assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks can_allocate_result = block_manager.can_allocate(seq_group) - num_required_blocks = num_prompt_blocks + \ - num_output_blocks + \ - num_cross_blocks_per_seq + num_required_blocks = ( + num_prompt_blocks + num_output_blocks + num_cross_blocks_per_seq + ) if num_gpu_blocks - num_required_blocks < num_watermark_blocks: assert can_allocate_result == AllocStatus.NEVER @@ -113,11 +120,10 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, @pytest.mark.parametrize("num_gpu_blocks", [16]) @pytest.mark.parametrize("num_seqs_per_group", [1]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): - ''' +def test_can_allocate_encoder_decoder_fails_with_swa( + block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, watermark: float +): + """ SWA short for Sliding Window Attention. At time of writing block manager does not support SWA. @@ -135,7 +141,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, The setup for this test is stripped down version of test_can_allocate_seq_group_encoder_decoder() - ''' + """ with pytest.raises((NotImplementedError, AssertionError)) as exc_info: block_manager = SelfAttnBlockSpaceManager( @@ -143,7 +149,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=1024, watermark=watermark, - sliding_window=5 # SWA + sliding_window=5, # SWA ) num_output_blocks_per_seq = 1 @@ -155,7 +161,8 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, block_size * num_output_blocks_per_seq for _ in range(num_seqs_per_group) ], - request_id="0") + request_id="0", + ) assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks block_manager.can_allocate(seq_group) @@ -177,15 +184,14 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, @pytest.mark.parametrize("num_seqs_per_group", [1]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) def test_can_allocate_encoder_decoder_fails_with_prefix_cache( - block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, - watermark: float): - + block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, watermark: float +): block_manager = SelfAttnBlockSpaceManager( block_size=block_size, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=1024, watermark=watermark, - enable_caching=True # Prefix cache + enable_caching=True, # Prefix cache ) num_output_blocks_per_seq = 1 @@ -194,10 +200,10 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache( seq_group = create_seq_group_encoder_decoder( seq_prompt_len=block_size * num_prompt_blocks, seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) + block_size * num_output_blocks_per_seq for _ in 
range(num_seqs_per_group) ], - request_id="0") + request_id="0", + ) assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks @@ -212,8 +218,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache( @pytest.mark.parametrize("prompt_len", [1, 7, 8]) @pytest.mark.parametrize("num_slots_to_append", [1, 8, 129]) @pytest.mark.parametrize("num_lookahead_slots", [0, 10]) -def test_append_slots(block_size, prompt_len, num_slots_to_append, - num_lookahead_slots): +def test_append_slots(block_size, prompt_len, num_slots_to_append, num_lookahead_slots): """Verify append_slots consumes the correct number of blocks from the block table. """ @@ -247,18 +252,19 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, # Append slots for new tokens and lookahead slots. free_blocks_before_append = block_manager.get_num_free_gpu_blocks() block_manager.append_slots(seq, num_lookahead_slots) - num_consumed_blocks = (free_blocks_before_append - - block_manager.get_num_free_gpu_blocks()) + num_consumed_blocks = ( + free_blocks_before_append - block_manager.get_num_free_gpu_blocks() + ) # Expect consumed blocks to be new blocks required to support the new slots. expected_consumed_blocks = len( list( chunk_list( - list( - range(prompt_len + num_slots_to_append + - num_lookahead_slots)), - block_size))) - len( - list(chunk_list(list(range(prompt_len)), block_size))) + list(range(prompt_len + num_slots_to_append + num_lookahead_slots)), + block_size, + ) + ) + ) - len(list(chunk_list(list(range(prompt_len)), block_size))) assert num_consumed_blocks == expected_consumed_blocks @@ -267,16 +273,19 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, @pytest.mark.parametrize("num_gpu_blocks", [4]) @pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10]) @pytest.mark.parametrize("enable_caching", [False, True]) -def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, - enable_caching): +def test_swap( + block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, enable_caching +): """Verify blocks number on src/desc device is correct after swapping in/out - sequence group (not missing or extra blocks). + sequence group (not missing or extra blocks). """ - block_manager = SelfAttnBlockSpaceManager(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) + block_manager = SelfAttnBlockSpaceManager( + block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + enable_caching=enable_caching, + ) prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) prompt.status = SequenceStatus.WAITING block_manager.allocate(seq_group) @@ -319,19 +328,21 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, @pytest.mark.parametrize("num_gpu_blocks", [4]) @pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10]) @pytest.mark.parametrize("enable_caching", [True, False]) -def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots, - enable_caching): - """ Verify the block manager can correctly determine if a sequence group - can be swapped in/out. +def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots, enable_caching): + """Verify the block manager can correctly determine if a sequence group + can be swapped in/out. 
""" num_cpu_blocks = num_gpu_blocks - block_manager = SelfAttnBlockSpaceManager(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) + block_manager = SelfAttnBlockSpaceManager( + block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + enable_caching=enable_caching, + ) prompt, seq_group = create_dummy_prompt( - "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1) + "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1 + ) prompt.status = SequenceStatus.WAITING block_manager.allocate(seq_group) prompt.status = SequenceStatus.RUNNING @@ -352,11 +363,14 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots, # At this moment, we still have enough free blocks to swap in the seq group. if num_lookahead_slots <= block_size: - assert block_manager.can_swap_in(seq_group, - num_lookahead_slots) == AllocStatus.OK + assert ( + block_manager.can_swap_in(seq_group, num_lookahead_slots) == AllocStatus.OK + ) else: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.NEVER + assert ( + block_manager.can_swap_in(seq_group, num_lookahead_slots) + == AllocStatus.NEVER + ) # During Swapped out, 2 cached blocks were evicted from the GPU, # so the prompt1 can't be swapped in @@ -364,17 +378,22 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots, prompt2, seq_group2 = create_dummy_prompt( "2", prompt_length=prompt2_len, - prompt_tokens=[10000 + i for i in range(prompt2_len)]) + prompt_tokens=[10000 + i for i in range(prompt2_len)], + ) prompt2.status = SequenceStatus.WAITING block_manager.allocate(seq_group2) # Swap seq group from CPU -> GPU. if num_lookahead_slots <= block_size: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.LATER + assert ( + block_manager.can_swap_in(seq_group, num_lookahead_slots) + == AllocStatus.LATER + ) else: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.NEVER + assert ( + block_manager.can_swap_in(seq_group, num_lookahead_slots) + == AllocStatus.NEVER + ) @pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10]) @@ -386,11 +405,13 @@ def test_swap_in_infeasible(num_lookahead_slots, enable_caching): block_size = 8 num_cpu_blocks = 1 num_gpu_blocks = 1 - block_manager = SelfAttnBlockSpaceManager(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) + block_manager = SelfAttnBlockSpaceManager( + block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + enable_caching=enable_caching, + ) prompt_length = block_size - 3 assert prompt_length > 0 prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length) @@ -414,13 +435,17 @@ def test_swap_in_infeasible(num_lookahead_slots, enable_caching): # the total number of available GPU blocks then the swap # should fail. num_unseen_tokens = 1 - if (num_lookahead_slots + num_unseen_tokens + - prompt_length) <= (block_size * num_gpu_blocks): - assert block_manager.can_swap_in(seq_group, - num_lookahead_slots) == AllocStatus.OK + if (num_lookahead_slots + num_unseen_tokens + prompt_length) <= ( + block_size * num_gpu_blocks + ): + assert ( + block_manager.can_swap_in(seq_group, num_lookahead_slots) == AllocStatus.OK + ) else: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.NEVER + assert ( + block_manager.can_swap_in(seq_group, num_lookahead_slots) + == AllocStatus.NEVER + ) # TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level. 
@@ -430,8 +455,7 @@ def test_swap_in_infeasible(num_lookahead_slots, enable_caching): @pytest.mark.parametrize("prompt_len", [10, 300, 1000]) @pytest.mark.parametrize("num_slots_to_append", [50]) @pytest.mark.parametrize("sliding_window", [20, 32, 200, 512]) -def test_sliding_window(block_size, prompt_len, num_slots_to_append, - sliding_window): +def test_sliding_window(block_size, prompt_len, num_slots_to_append, sliding_window): """Verify append_slots consumes the correct number of blocks from the block table. """ diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index ba085001136b..157bb58f5e0f 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -33,14 +33,17 @@ def test_allocate_naive(block_size: int, sequence_len: int): block_tables: list[BlockTable] = [] for i in range(5): - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc + assert ( + allocator.get_num_free_blocks(device=Device.GPU) + == num_gpu_blocks - i * num_blocks_per_alloc + ) block_tables.append( BlockTable( block_size=block_size, block_allocator=allocator, - )) + ) + ) block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU) @@ -71,35 +74,33 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int): token_ids = list(range(sequence_len)) chunked_tokens = list(chunk_list(token_ids, block_size)) - num_mutable_blocks_per_alloc = 0 if len( - chunked_tokens[-1]) == block_size else 1 - num_immutable_blocks_per_alloc = len( - chunked_tokens) - num_mutable_blocks_per_alloc + num_mutable_blocks_per_alloc = 0 if len(chunked_tokens[-1]) == block_size else 1 + num_immutable_blocks_per_alloc = len(chunked_tokens) - num_mutable_blocks_per_alloc block_tables: list[BlockTable] = [] for alloc_i in range(1, 6): - block_tables.append( BlockTable( block_size=block_size, block_allocator=allocator, - )) + ) + ) block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU) # Expect all sequences to share allocations, except for their last block # (which may be mutable). - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_gpu_blocks - ( - num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc * - (alloc_i)) + assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks - ( + num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc * (alloc_i) + ) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("sequence_len", [1, 16, 129]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) @pytest.mark.parametrize("device", ["cpu", "gpu"]) -def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str, - device: str): +def test_allocate_free( + block_size: int, sequence_len: int, allocator_type: str, device: str +): """Test the allocation and freeing of blocks using different allocators and devices. 
@@ -128,10 +129,11 @@ def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str, for i in range(5): block_table.allocate(token_ids=token_ids, device=device) - assert allocator.get_num_free_blocks( - device) == num_device_blocks - num_blocks_per_alloc - assert all(block_id is not None - for block_id in block_table.physical_block_ids) + assert ( + allocator.get_num_free_blocks(device) + == num_device_blocks - num_blocks_per_alloc + ) + assert all(block_id is not None for block_id in block_table.physical_block_ids) block_table.free() assert allocator.get_num_free_blocks(device) == num_device_blocks @@ -141,8 +143,9 @@ def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str, @pytest.mark.parametrize("sequence_len", [1, 16, 129]) @pytest.mark.parametrize("append_len", [1, 16, 129]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_append_token_ids_allocation(block_size: int, sequence_len: int, - append_len: int, allocator_type: str): +def test_append_token_ids_allocation( + block_size: int, sequence_len: int, append_len: int, allocator_type: str +): """Test the allocation behavior when appending token IDs to a BlockTable. This test creates a CpuGpuBlockAllocator with the specified block size, @@ -169,29 +172,29 @@ def test_append_token_ids_allocation(block_size: int, sequence_len: int, block_allocator=allocator, ) - num_expected_blocks_before_append = len( - list(chunk_list(token_ids, block_size))) - num_expected_appended_blocks = len( - list(chunk_list(token_ids + token_ids_to_append, - block_size))) - num_expected_blocks_before_append + num_expected_blocks_before_append = len(list(chunk_list(token_ids, block_size))) + num_expected_appended_blocks = ( + len(list(chunk_list(token_ids + token_ids_to_append, block_size))) + - num_expected_blocks_before_append + ) block_table.allocate(token_ids=token_ids, device=Device.GPU) - assert len( - block_table.physical_block_ids) == num_expected_blocks_before_append + assert len(block_table.physical_block_ids) == num_expected_blocks_before_append block_table.append_token_ids(token_ids_to_append) - assert len( - block_table.physical_block_ids - ) == num_expected_blocks_before_append + num_expected_appended_blocks + assert ( + len(block_table.physical_block_ids) + == num_expected_blocks_before_append + num_expected_appended_blocks + ) @pytest.mark.parametrize("block_size", [1, 8]) @pytest.mark.parametrize("sequence_len", [1, 16, 129]) @pytest.mark.parametrize("num_empty_slots", [1, 16, 129]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int, - num_empty_slots: int, - allocator_type: str): +def test_ensure_num_empty_slots_allocation( + block_size: int, sequence_len: int, num_empty_slots: int, allocator_type: str +): """Test the allocation behavior when ensuring a certain number of empty slots in a BlockTable. 
@@ -218,22 +221,22 @@ def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int, block_allocator=allocator, ) - num_expected_blocks_before_append = len( - list(chunk_list(token_ids, block_size))) - num_expected_appended_blocks = len( - list(chunk_list(token_ids + [-1] * num_empty_slots, - block_size))) - num_expected_blocks_before_append + num_expected_blocks_before_append = len(list(chunk_list(token_ids, block_size))) + num_expected_appended_blocks = ( + len(list(chunk_list(token_ids + [-1] * num_empty_slots, block_size))) + - num_expected_blocks_before_append + ) block_table.allocate(token_ids=token_ids, device=Device.GPU) # Assert that the empty slots consume the expected number of additional # blocks. - assert len( - block_table.physical_block_ids) == num_expected_blocks_before_append + assert len(block_table.physical_block_ids) == num_expected_blocks_before_append block_table.ensure_num_empty_slots(num_empty_slots) - assert len( - block_table.physical_block_ids - ) == num_expected_blocks_before_append + num_expected_appended_blocks + assert ( + len(block_table.physical_block_ids) + == num_expected_blocks_before_append + num_expected_appended_blocks + ) # Now, ensure no additional blocks consumed as we fill up the empty slots. num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU) @@ -246,9 +249,13 @@ def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int, @pytest.mark.parametrize("append_len", [1, 16, 129]) @pytest.mark.parametrize("append_size", [1, 4, 129]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_append_token_ids_correct_content(block_size: int, sequence_len: int, - append_len: int, allocator_type: str, - append_size: int): +def test_append_token_ids_correct_content( + block_size: int, + sequence_len: int, + append_len: int, + allocator_type: str, + append_size: int, +): """Verify token ids are correctly appended. Appends various amounts of token ids in various append sizes, and verifies the final sequence is correct. @@ -286,13 +293,13 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int, @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) def test_fork(seq_len: int, block_size: int, allocator_type: str): """Create a sequence using the specified allocator. - 1. Assert that after forking the sequence, the free block count is the - same. - 2. Assert that the forked sequence has the same physical mappings. - 3. Then free the original sequence; verify that the free block count is - the same. - 4. Finally, free the forked sequence and verify that the free block - count drops to zero. + 1. Assert that after forking the sequence, the free block count is the + same. + 2. Assert that the forked sequence has the same physical mappings. + 3. Then free the original sequence; verify that the free block count is + the same. + 4. Finally, free the forked sequence and verify that the free block + count drops to zero. """ num_gpu_blocks = 1024 @@ -312,30 +319,30 @@ def test_fork(seq_len: int, block_size: int, allocator_type: str): block_table.allocate(token_ids) - num_free_blocks_before_fork = allocator.get_num_free_blocks( - device=Device.GPU) + num_free_blocks_before_fork = allocator.get_num_free_blocks(device=Device.GPU) forked_block_table = block_table.fork() # Expect physical_block_ids and token_ids to match. 
- assert (block_table.physical_block_ids == - forked_block_table.physical_block_ids) - assert block_table._get_all_token_ids( - ) == forked_block_table._get_all_token_ids() + assert block_table.physical_block_ids == forked_block_table.physical_block_ids + assert block_table._get_all_token_ids() == forked_block_table._get_all_token_ids() # Do not expect any additional allocations. - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_free_blocks_before_fork + assert ( + allocator.get_num_free_blocks(device=Device.GPU) == num_free_blocks_before_fork + ) # Free the original blocks. Assert num free blocks does not change, since # refcount is nonzero. block_table.free() - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_free_blocks_before_fork + assert ( + allocator.get_num_free_blocks(device=Device.GPU) == num_free_blocks_before_fork + ) # Expect the forked block table to be unaffected by the free. - assert all(block_id is not None - for block_id in forked_block_table.physical_block_ids) + assert all( + block_id is not None for block_id in forked_block_table.physical_block_ids + ) # Free the forked blocks. Assert num free blocks does change, since # refcount is now zero. @@ -348,10 +355,14 @@ def test_fork(seq_len: int, block_size: int, allocator_type: str): @pytest.mark.parametrize("append_len", [1, 16, 129]) @pytest.mark.parametrize("appender", ["forked", "original"]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_cow(block_size: int, sequence_len: int, append_len: int, - allocator_type: str, appender: str): - """Fork a sequence; append to the forked sequence; verify there's a CoW. - """ +def test_cow( + block_size: int, + sequence_len: int, + append_len: int, + allocator_type: str, + appender: str, +): + """Fork a sequence; append to the forked sequence; verify there's a CoW.""" num_gpu_blocks = 1024 allocator = CpuGpuBlockAllocator.create( @@ -370,8 +381,9 @@ def test_cow(block_size: int, sequence_len: int, append_len: int, ) num_expected_non_cow_blocks = cdiv(sequence_len, block_size) - num_expected_cow_blocks = cdiv(sequence_len + append_len, - block_size) - (sequence_len // block_size) + num_expected_cow_blocks = cdiv(sequence_len + append_len, block_size) - ( + sequence_len // block_size + ) original_block_table.allocate(token_ids=token_ids, device=Device.GPU) original_block_ids = original_block_table.physical_block_ids[:] @@ -380,8 +392,9 @@ def test_cow(block_size: int, sequence_len: int, append_len: int, forked_block_table = original_block_table.fork() # Expect no additional allocation (copy on _write_). - assert allocator.get_num_free_blocks( - Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks) + assert allocator.get_num_free_blocks(Device.GPU) == ( + num_gpu_blocks - num_expected_non_cow_blocks + ) if appender == "forked": appender_block_table = forked_block_table @@ -400,9 +413,9 @@ def test_cow(block_size: int, sequence_len: int, append_len: int, assert appender_block_table.physical_block_ids != original_block_ids # Expect the blocks changed during append to have a CoW. 
- assert allocator.get_num_free_blocks( - Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks + - num_expected_cow_blocks) + assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - ( + num_expected_non_cow_blocks + num_expected_cow_blocks + ) cows = allocator.clear_copy_on_writes() if sequence_len % block_size > 0: @@ -432,9 +445,14 @@ def test_cow(block_size: int, sequence_len: int, append_len: int, @pytest.mark.parametrize("lookahead_slots", [1, 16, 129]) @pytest.mark.parametrize("appender", ["forked", "original"]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_cow_lookahead_simple(block_size: int, sequence_len: int, - append_len: int, lookahead_slots: int, - allocator_type: str, appender: str): +def test_cow_lookahead_simple( + block_size: int, + sequence_len: int, + append_len: int, + lookahead_slots: int, + allocator_type: str, + appender: str, +): """Similar to test_cow, except with lookahead allocation. The assertions are less rigorous due to the complexity of the property under test. """ @@ -507,10 +525,13 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int, @pytest.mark.parametrize("num_new_tokens", [1, 16, 129]) @pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int, - num_new_tokens: int, - num_lookahead_slots: int, - allocator_type: str): +def test_num_blocks_touched_by_append_slots( + block_size: int, + sequence_len: int, + num_new_tokens: int, + num_lookahead_slots: int, + allocator_type: str, +): """Verify correct calculation of get_num_blocks_touched_by_append_slots. This is done by using copy-on-write, which requires any modified block to @@ -547,10 +568,9 @@ def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int, _ = block_table.fork() # Determine how many blocks should be touched. - expected_num_touched_blocks = ( - block_table.get_num_blocks_touched_by_append_slots( - token_ids=token_ids_to_append, - num_lookahead_slots=num_lookahead_slots)) + expected_num_touched_blocks = block_table.get_num_blocks_touched_by_append_slots( + token_ids=token_ids_to_append, num_lookahead_slots=num_lookahead_slots + ) # Measure how many blocks are touched by measuring num_free_blocks before # and after the append. @@ -558,8 +578,9 @@ def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int, # We expect append_token_ids to CoW all mutated blocks that have refcount>1. num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU) block_table.append_token_ids(token_ids_to_append, num_lookahead_slots) - num_consumed_blocks = (num_free_blocks_before_append - - allocator.get_num_free_blocks(Device.GPU)) + num_consumed_blocks = num_free_blocks_before_append - allocator.get_num_free_blocks( + Device.GPU + ) # TODO(cade) ensure equality when num_lookahead_slots > 0. 
# The reason we have < is because lookahead blocks are not copied eagerly; diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py index 795eef6743fd..1b2151fcf2d2 100644 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ b/tests/core/block/test_cpu_gpu_block_allocator.py @@ -11,8 +11,9 @@ @pytest.mark.parametrize("num_gpu_blocks", [1024]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int, - block_size: int, allocator_type: str): +def test_allocate_mutable_block( + num_cpu_blocks: int, num_gpu_blocks: int, block_size: int, allocator_type: str +): allocator = CpuGpuBlockAllocator.create( allocator_type=allocator_type, num_gpu_blocks=num_gpu_blocks, @@ -50,8 +51,9 @@ def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int, @pytest.mark.parametrize("num_gpu_blocks", [1024]) @pytest.mark.parametrize("block_size", [2]) @pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int, - block_size: int, allocator_type: str): +def test_allocate_immutable_block( + num_cpu_blocks: int, num_gpu_blocks: int, block_size: int, allocator_type: str +): allocator = CpuGpuBlockAllocator.create( allocator_type=allocator_type, num_gpu_blocks=num_gpu_blocks, @@ -59,29 +61,30 @@ def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int, block_size=block_size, ) - unique_token_ids = list( - range((num_cpu_blocks + num_gpu_blocks) * block_size)) + unique_token_ids = list(range((num_cpu_blocks + num_gpu_blocks) * block_size)) gpu_token_ids = list( - chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size)) + chunk_list(unique_token_ids[: num_gpu_blocks * block_size], block_size) + ) cpu_token_ids = list( - chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size)) + chunk_list(unique_token_ids[num_gpu_blocks * block_size :], block_size) + ) assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks cpu_blocks = [ - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids, - device=Device.CPU) + allocator.allocate_immutable_block( + prev_block=None, token_ids=token_ids, device=Device.CPU + ) for token_ids in cpu_token_ids ] assert allocator.get_num_free_blocks(Device.CPU) == 0 assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks gpu_blocks = [ - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids, - device=Device.GPU) + allocator.allocate_immutable_block( + prev_block=None, token_ids=token_ids, device=Device.GPU + ) for token_ids in gpu_token_ids ] assert allocator.get_num_free_blocks(Device.CPU) == 0 diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index a31d1c46b37f..1e2e104c6113 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -10,18 +10,21 @@ class TestNaiveBlockAllocator: - @staticmethod - def create_allocate_lambda(allocate_type: str, - allocator: NaiveBlockAllocator, - prev_block: Optional[Block], - token_ids: list[int]): + def create_allocate_lambda( + allocate_type: str, + allocator: NaiveBlockAllocator, + prev_block: Optional[Block], + token_ids: list[int], + ): if allocate_type == "immutable": allocate_block = lambda: 
allocator.allocate_immutable_block( - prev_block=prev_block, token_ids=token_ids) + prev_block=prev_block, token_ids=token_ids + ) elif allocate_type == "mutable": allocate_block = lambda: allocator.allocate_mutable_block( - prev_block=prev_block) + prev_block=prev_block + ) else: raise ValueError() @@ -31,16 +34,13 @@ def create_allocate_lambda(allocate_type: str, @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"]) @pytest.mark.parametrize("num_blocks", [1, 1024]) @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_ooms(allocate_type: str, num_blocks: int, - block_size: int): - allocator = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) + def test_allocate_ooms(allocate_type: str, num_blocks: int, block_size: int): + allocator = NaiveBlockAllocator( + create_block=NaiveBlock, num_blocks=num_blocks, block_size=block_size + ) allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - allocate_type, - allocator, - prev_block=None, - token_ids=list(range(block_size))) + allocate_type, allocator, prev_block=None, token_ids=list(range(block_size)) + ) [allocate_block() for _ in range(num_blocks)] with pytest.raises(BlockAllocator.NoFreeBlocksError): @@ -50,16 +50,13 @@ def test_allocate_ooms(allocate_type: str, num_blocks: int, @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"]) @pytest.mark.parametrize("num_blocks", [1, 1024]) @pytest.mark.parametrize("block_size", [1, 16]) - def test_free_prevents_oom(allocate_type: str, num_blocks: int, - block_size: int): - allocator = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) + def test_free_prevents_oom(allocate_type: str, num_blocks: int, block_size: int): + allocator = NaiveBlockAllocator( + create_block=NaiveBlock, num_blocks=num_blocks, block_size=block_size + ) allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - allocate_type, - allocator, - prev_block=None, - token_ids=list(range(block_size))) + allocate_type, allocator, prev_block=None, token_ids=list(range(block_size)) + ) blocks = [allocate_block() for _ in range(num_blocks)] @@ -85,16 +82,13 @@ def test_free_prevents_oom(allocate_type: str, num_blocks: int, @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"]) @pytest.mark.parametrize("num_blocks", [1024]) @pytest.mark.parametrize("block_size", [16]) - def test_get_num_free_blocks(allocate_type: str, num_blocks: int, - block_size: int): - allocator = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) + def test_get_num_free_blocks(allocate_type: str, num_blocks: int, block_size: int): + allocator = NaiveBlockAllocator( + create_block=NaiveBlock, num_blocks=num_blocks, block_size=block_size + ) allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - allocate_type, - allocator, - prev_block=None, - token_ids=list(range(block_size))) + allocate_type, allocator, prev_block=None, token_ids=list(range(block_size)) + ) assert allocator.get_num_free_blocks() == num_blocks @@ -108,41 +102,37 @@ def test_get_num_free_blocks(allocate_type: str, num_blocks: int, @pytest.mark.parametrize("num_blocks", [4]) @pytest.mark.parametrize("block_size", [8]) def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size): - """ Verify the allocator can correctly return the number of + """Verify the allocator can correctly return the number of full blocks touched. 
""" - allocator_src = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) + allocator_src = NaiveBlockAllocator( + create_block=NaiveBlock, num_blocks=num_blocks, block_size=block_size + ) + allocator_dst = NaiveBlockAllocator( + create_block=NaiveBlock, num_blocks=num_blocks, block_size=block_size + ) # Create a chain of cacheable blocks in the dst allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( "immutable", allocator_src, prev_block=None, - token_ids=list(range(block_size))) + token_ids=list(range(block_size)), + ) src_blocks = [allocate_block() for _ in range(num_blocks - 1)] # All blocks are cached - assert allocator_dst.get_num_full_blocks_touched( - src_blocks) == num_blocks - 1 + assert allocator_dst.get_num_full_blocks_touched(src_blocks) == num_blocks - 1 # Insert one non-full block in the src - allocate_non_full_block = \ - TestNaiveBlockAllocator.create_allocate_lambda( - "mutable", allocator_src, - prev_block=src_blocks[-1],token_ids=[] - ) + allocate_non_full_block = TestNaiveBlockAllocator.create_allocate_lambda( + "mutable", allocator_src, prev_block=src_blocks[-1], token_ids=[] + ) src_blocks.append(allocate_non_full_block()) src_blocks[-1].append_token_ids([0]) - assert allocator_dst.get_num_full_blocks_touched( - src_blocks) == num_blocks - 1 + assert allocator_dst.get_num_full_blocks_touched(src_blocks) == num_blocks - 1 # Fill up the last source block and then invoke # get_num_blocks_touched src_blocks[-1].append_token_ids([0] * (block_size - 1)) - assert allocator_dst.get_num_full_blocks_touched( - src_blocks) == num_blocks + assert allocator_dst.get_num_full_blocks_touched(src_blocks) == num_blocks diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 46e224c6f53b..6236eddf33b4 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -11,33 +11,37 @@ from tests.core.utils import create_dummy_lora_sequence, create_dummy_sequence from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.core.block.interfaces import Block, BlockAllocator -from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker, - PrefixCachingBlock, - PrefixCachingBlockAllocator) +from vllm.core.block.prefix_caching_block import ( + ComputedBlocksTracker, + PrefixCachingBlock, + PrefixCachingBlockAllocator, +) from vllm.sequence import Logprob from vllm.utils import Device class TestPrefixCachingBlock: - @staticmethod @pytest.mark.parametrize("seed", list(range(10))) @pytest.mark.parametrize("block_size", [1, 16]) @pytest.mark.parametrize("is_curr_block_full", [True, False]) - def test_first_block_has_correct_content_hash(seed: int, block_size: int, - is_curr_block_full: bool): - """Verify a block which is first in the sequence has the correct hash. 
- """ + def test_first_block_has_correct_content_hash( + seed: int, block_size: int, is_curr_block_full: bool + ): + """Verify a block which is first in the sequence has the correct hash.""" random.seed(seed) - num_to_fill = block_size if is_curr_block_full else random.randint( - 0, block_size - 1) + num_to_fill = ( + block_size if is_curr_block_full else random.randint(0, block_size - 1) + ) token_ids = list(range(num_to_fill)) mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator) - block_with_prev = PrefixCachingBlock(prev_block=None, - token_ids=token_ids, - block_size=block_size, - allocator=mock_allocator) + block_with_prev = PrefixCachingBlock( + prev_block=None, + token_ids=token_ids, + block_size=block_size, + allocator=mock_allocator, + ) if is_curr_block_full: # Expect hash since block is full. @@ -45,7 +49,9 @@ def test_first_block_has_correct_content_hash(seed: int, block_size: int, PrefixCachingBlock.hash_block_tokens( is_first_block=True, prev_block_hash=None, - cur_block_token_ids=token_ids)) + cur_block_token_ids=token_ids, + ) + ) else: # Do not expect hash since block is not full. assert block_with_prev.content_hash is None @@ -55,9 +61,9 @@ def test_first_block_has_correct_content_hash(seed: int, block_size: int, @pytest.mark.parametrize("block_size", [1, 16]) @pytest.mark.parametrize("is_curr_block_full", [True, False]) @pytest.mark.parametrize("prev_block_has_hash", [True, False]) - def test_nth_block_has_correct_content_hash(seed: int, block_size: int, - is_curr_block_full: bool, - prev_block_has_hash: bool): + def test_nth_block_has_correct_content_hash( + seed: int, block_size: int, is_curr_block_full: bool, prev_block_has_hash: bool + ): """Verify a block which is not first in the sequence has the correct hash. """ @@ -66,11 +72,13 @@ def test_nth_block_has_correct_content_hash(seed: int, block_size: int, previous_block = MagicMock(spec=PrefixCachingBlock) prev_block_hash = random.randint(0, 1000) - previous_block.content_hash = (prev_block_hash if prev_block_has_hash - else hash('None')) + previous_block.content_hash = ( + prev_block_hash if prev_block_has_hash else hash("None") + ) - num_to_fill = block_size if is_curr_block_full else random.randint( - 0, block_size - 1) + num_to_fill = ( + block_size if is_curr_block_full else random.randint(0, block_size - 1) + ) token_ids = list(range(num_to_fill)) mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator) @@ -83,11 +91,11 @@ def test_nth_block_has_correct_content_hash(seed: int, block_size: int, if is_curr_block_full and prev_block_has_hash: # Expect hash since block is full and previous block has hash. - assert (block_with_prev.content_hash == - PrefixCachingBlock.hash_block_tokens( - is_first_block=False, - prev_block_hash=prev_block_hash, - cur_block_token_ids=token_ids)) + assert block_with_prev.content_hash == PrefixCachingBlock.hash_block_tokens( + is_first_block=False, + prev_block_hash=prev_block_hash, + cur_block_token_ids=token_ids, + ) else: # Do not expect hash since block is not full or the previous block # does not have a hash. 
@@ -97,9 +105,9 @@ def test_nth_block_has_correct_content_hash(seed: int, block_size: int, @pytest.mark.parametrize("block_size", [1, 2, 16]) @pytest.mark.parametrize("num_tokens", list(range(3))) @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10]) - def test_blocks_have_correct_hash_in_chain(block_size: int, - num_tokens: int, - num_empty_trailing_blocks: int): + def test_blocks_have_correct_hash_in_chain( + block_size: int, num_tokens: int, num_empty_trailing_blocks: int + ): """Create two chains of logical blocks with the same contents. Assert the hashes are equal. """ @@ -107,30 +115,29 @@ def test_blocks_have_correct_hash_in_chain(block_size: int, token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)] - first_chain, second_chain = (TestPrefixCachingBlock.create_chain( - block_size=block_size, - token_ids=token_ids, - num_empty_trailing_blocks=num_empty_trailing_blocks) - for _ in range(2)) + first_chain, second_chain = ( + TestPrefixCachingBlock.create_chain( + block_size=block_size, + token_ids=token_ids, + num_empty_trailing_blocks=num_empty_trailing_blocks, + ) + for _ in range(2) + ) - for first_chain_block, second_chain_block in zip( - first_chain, second_chain): - assert (first_chain_block.content_hash == - second_chain_block.content_hash) + for first_chain_block, second_chain_block in zip(first_chain, second_chain): + assert first_chain_block.content_hash == second_chain_block.content_hash if not first_chain or not second_chain: assert first_chain == second_chain assert num_tokens == 0 @staticmethod - def create_chain(block_size: int, - token_ids: list[int], - num_empty_trailing_blocks=0) -> list[PrefixCachingBlock]: - """Helper method which creates a chain of blocks. - """ + def create_chain( + block_size: int, token_ids: list[int], num_empty_trailing_blocks=0 + ) -> list[PrefixCachingBlock]: + """Helper method which creates a chain of blocks.""" blocks: list[PrefixCachingBlock] = [] - num_blocks = math.ceil( - len(token_ids) / block_size) + num_empty_trailing_blocks + num_blocks = math.ceil(len(token_ids) / block_size) + num_empty_trailing_blocks if num_blocks == 0: return [] @@ -146,9 +153,9 @@ def create_chain(block_size: int, allocator=allocator, ) - tokens_to_append = token_ids[block_number * - block_size:(block_number + 1) * - block_size] + tokens_to_append = token_ids[ + block_number * block_size : (block_number + 1) * block_size + ] if tokens_to_append: prev_block.append_token_ids(tokens_to_append) @@ -158,17 +165,21 @@ def create_chain(block_size: int, class TestPrefixCachingBlockAllocator: - @staticmethod - def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator, - prev_block: Optional[Block], - token_ids: list[int]): + def create_allocate_lambda( + allocate_type: str, + allocator: BlockAllocator, + prev_block: Optional[Block], + token_ids: list[int], + ): if allocate_type == "immutable": allocate_block = lambda: allocator.allocate_immutable_block( - prev_block=prev_block, token_ids=token_ids) + prev_block=prev_block, token_ids=token_ids + ) elif allocate_type == "mutable": allocate_block = lambda: allocator.allocate_mutable_block( - prev_block=prev_block) + prev_block=prev_block + ) else: raise ValueError() @@ -178,8 +189,9 @@ def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator, @pytest.mark.parametrize("num_blocks", [1, 1024]) @pytest.mark.parametrize("block_size", [1, 16]) def test_allocate_mutable_ooms(num_blocks: int, block_size: int): - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, 
- block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda( allocate_type="mutable", allocator=allocator, @@ -195,9 +207,11 @@ def test_allocate_mutable_ooms(num_blocks: int, block_size: int): @pytest.mark.parametrize("num_blocks", [1, 1024]) @pytest.mark.parametrize("block_size", [1, 16]) def test_allocate_immutable_does_not_oom_single_hash( - num_blocks: int, block_size: int): - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + num_blocks: int, block_size: int + ): + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda( allocate_type="immutable", allocator=allocator, @@ -212,20 +226,20 @@ def test_allocate_immutable_does_not_oom_single_hash( # Expect all blocks to have same physical block index. for block in blocks: - assert (block.block_id == non_oom_block.block_id) + assert block.block_id == non_oom_block.block_id @staticmethod @pytest.mark.parametrize("num_blocks", [1, 1024]) @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_immutable_ooms_many_hash(num_blocks: int, - block_size: int): + def test_allocate_immutable_ooms_many_hash(num_blocks: int, block_size: int): """Consume all blocks using many different hashes/block content. Do this by creating a sequence that is very long. Expect next block to OOM. """ - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) # Create token ids that will exhaust all blocks. token_ids = list(range(num_blocks * block_size)) @@ -238,9 +252,9 @@ def test_allocate_immutable_ooms_many_hash(num_blocks: int, # Expect allocation with unseen hash to fail. with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocator.allocate_immutable_block(prev_block=chain[-1], - token_ids=list( - range(block_size))) + allocator.allocate_immutable_block( + prev_block=chain[-1], token_ids=list(range(block_size)) + ) # Expect mutable allocation to fail. with pytest.raises(BlockAllocator.NoFreeBlocksError): @@ -256,14 +270,15 @@ def test_allocate_immutable_ooms_many_hash(num_blocks: int, # Expect physical block indices to be the same in both chains. assert chain and second_chain for first_chain_block, second_chain_block in zip(chain, second_chain): - assert (first_chain_block.block_id == second_chain_block.block_id) + assert first_chain_block.block_id == second_chain_block.block_id @staticmethod @pytest.mark.parametrize("num_blocks", [1, 1024]) @pytest.mark.parametrize("block_size", [1, 16]) def test_free_prevents_oom(num_blocks: int, block_size: int): - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) # Create token ids that will exhaust all blocks. 
token_ids = list(range(num_blocks * block_size)) @@ -300,8 +315,9 @@ def test_free_prevents_oom(num_blocks: int, block_size: int): @pytest.mark.parametrize("seed", list(range(20))) def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int): random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) num_blocks_to_consume = random.randint(1, num_blocks - 1) # Create token ids that will exhaust all blocks. @@ -316,23 +332,24 @@ def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int): # Free each block in chain, assert num free blocks includes new free # block. for i, block in enumerate(chain): - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_to_consume + - i) + assert allocator.get_num_free_blocks() == ( + num_blocks - num_blocks_to_consume + i + ) allocator.free(block) @staticmethod @pytest.mark.parametrize("num_blocks", [4]) @pytest.mark.parametrize("block_size", [8]) - def test_prefix_caching_block_get_num_full_blocks_touched( - num_blocks, block_size): - """ Verify the allocator can correctly return the number of + def test_prefix_caching_block_get_num_full_blocks_touched(num_blocks, block_size): + """Verify the allocator can correctly return the number of blocks touched, when there are cached prefixes. """ - allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator_src = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) + allocator_dst = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) # Create token ids that will exhaust all blocks except the last token_ids = list(range((num_blocks - 1) * block_size)) @@ -345,49 +362,43 @@ def test_prefix_caching_block_get_num_full_blocks_touched( ) # Create a chain of the same blocks in the src - blocks_to_swap_in = \ - TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator_src, - ) + blocks_to_swap_in = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=token_ids, + allocator=allocator_src, + ) # All blocks are cached - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 0 + assert allocator_dst.get_num_full_blocks_touched(blocks_to_swap_in) == 0 # Free the first block in the dst allocator_dst.free(cached_blocks[0]) # Now the first block becomes dangling, the swapped blocks need # to reclaim the first block in the dst - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 1 + assert allocator_dst.get_num_full_blocks_touched(blocks_to_swap_in) == 1 # Insert one non-full block in the src - non_full_block = allocator_src.allocate_mutable_block( - blocks_to_swap_in[-1]) + non_full_block = allocator_src.allocate_mutable_block(blocks_to_swap_in[-1]) non_full_block.append_token_ids([0]) blocks_to_swap_in.append(non_full_block) - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 1 + assert allocator_dst.get_num_full_blocks_touched(blocks_to_swap_in) == 1 # Fill up the last mutable block and invoke get_num_blocks_touched. # Note: The last block is not cached so it will be touched. 
non_full_block.append_token_ids([0] * (block_size - 1)) - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 2 + assert allocator_dst.get_num_full_blocks_touched(blocks_to_swap_in) == 2 @staticmethod @pytest.mark.parametrize("num_blocks", [1024]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("seed", list(range(20))) - def test_get_num_free_blocks_shared(num_blocks: int, block_size: int, - seed: int): + def test_get_num_free_blocks_shared(num_blocks: int, block_size: int, seed: int): """Verify sharing occurs by allocating two sequences that share prefixes and incrementally freeing blocks. """ random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) num_blocks_to_consume = random.randint(1, num_blocks - 1) # Create token ids that will exhaust all blocks. @@ -407,32 +418,33 @@ def test_get_num_free_blocks_shared(num_blocks: int, block_size: int, # Free each block in the first chain. Since all blocks are shared, the # free count should stay constant. for i, block in enumerate(first_chain): - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_to_consume) + assert allocator.get_num_free_blocks() == ( + num_blocks - num_blocks_to_consume + ) allocator.free(block) # Free each block in the second chain. Since the refcount is now zero, # the free count should increment with each free. for i, block in enumerate(second_chain): - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_to_consume + - i) + assert allocator.get_num_free_blocks() == ( + num_blocks - num_blocks_to_consume + i + ) allocator.free(block) @staticmethod @pytest.mark.parametrize("num_blocks", [1024]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("seed", list(range(20))) - def test_get_common_computed_block_ids(num_blocks: int, block_size: int, - seed: int): + def test_get_common_computed_block_ids(num_blocks: int, block_size: int, seed: int): """Verify get_common_computed_block_ids could get correct result by create two immutable chain sharing prefix at specified pos, and compare whether we also could get right result from get_common_computed_block_ids. """ random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks * 2, block_size=block_size + ) num_blocks_to_consume = random.randint(1, num_blocks - 1) # Create token ids that will exhaust all blocks. 
@@ -463,9 +475,10 @@ def test_get_common_computed_block_ids(num_blocks: int, block_size: int, second_chain[i].block_id for i in range(num_blocks_to_consume) ] res = allocator.get_common_computed_block_ids( - [first_computed_ids, second_computed_ids]) + [first_computed_ids, second_computed_ids] + ) - assert (len(res) == zero_point_blocks) + assert len(res) == zero_point_blocks # Test case that assume those prompted block after first immutable would # be freed into hashless allocator, while first immutable block get ref @@ -477,12 +490,12 @@ def test_get_common_computed_block_ids(num_blocks: int, block_size: int, def test_alloc_promotion(num_blocks: int, block_size: int, seed: int): random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) token_ids = list(range(block_size)) - block = allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) + block = allocator.allocate_immutable_block(prev_block=None, token_ids=token_ids) assert allocator._refcounter.get(block.block_id) == 1 m = allocator.allocate_mutable_block(prev_block=None) @@ -511,15 +524,17 @@ def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int): all_blocks_list = [i for i in range(num_blocks)] zero_ref = {i: 0 for i in range(num_blocks)} one_ref = {i: 1 for i in range(num_blocks)} - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) token_ids = list(range(num_blocks * block_size)) # Verify initial/pre-alloc state # Ensure all blocks are free inside hashless allocator - assert list(allocator._hashless_allocator._free_block_indices - ) == all_blocks_list + assert ( + list(allocator._hashless_allocator._free_block_indices) == all_blocks_list + ) # Ensure no tracked blocks assert len(allocator._block_tracker.keys()) == num_blocks for block_id in range(num_blocks): @@ -536,13 +551,14 @@ def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int): for i in range(num_blocks): block = allocator.allocate_immutable_block( prev_block=None, - token_ids=token_ids[block_size * i:block_size * (i + 1)]) + token_ids=token_ids[block_size * i : block_size * (i + 1)], + ) new_block.append(block) # Verify post-alloc state # Ensure no blocks are free inside hashless allocator - assert (len(allocator._hashless_allocator._free_block_indices) == 0) + assert len(allocator._hashless_allocator._free_block_indices) == 0 # Ensure all blocks are tracked assert len(allocator._block_tracker.keys()) == num_blocks for block_id in range(num_blocks): @@ -601,7 +617,8 @@ def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int): # shall get free block from hashless allocator, thus no block left # in hashless block = allocator.allocate_immutable_block( - prev_block=None, token_ids=token_ids[:block_size]) + prev_block=None, token_ids=token_ids[:block_size] + ) assert block.block_id == 0 assert len(allocator._hashless_allocator._free_block_indices) == 0 @@ -632,8 +649,9 @@ def test_eviction_order(num_blocks: int, block_size: int, seed: int): """ random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) num_blocks_to_consume = num_blocks + 1 token_ids = list(range(num_blocks_to_consume * 
block_size)) @@ -647,8 +665,9 @@ def test_eviction_order(num_blocks: int, block_size: int, seed: int): allocator=allocator, ) # There should only be one block allocated at this point - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_in_first_chain) + assert allocator.get_num_free_blocks() == ( + num_blocks - num_blocks_in_first_chain + ) # Set the last accessed time of the first block to 1 blocks_ids = [block.block_id for block in first_chain] @@ -693,26 +712,22 @@ def test_eviction_order(num_blocks: int, block_size: int, seed: int): @staticmethod def test_metric(): block_size = 16 - allocator = PrefixCachingBlockAllocator(num_blocks=4, - block_size=block_size) + allocator = PrefixCachingBlockAllocator(num_blocks=4, block_size=block_size) # Test when no query (0/0) assert allocator.get_prefix_cache_hit_rate() == 0.0 token_ids = list(range(block_size)) - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) + allocator.allocate_immutable_block(prev_block=None, token_ids=token_ids) # Test 0/1 hit rate assert allocator.get_prefix_cache_hit_rate() == 0.0 - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) + allocator.allocate_immutable_block(prev_block=None, token_ids=token_ids) # Test 1/2 hit rate assert allocator.get_prefix_cache_hit_rate() == 0.5 # Test more than one block for _ in range(2, 1005): - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) + allocator.allocate_immutable_block(prev_block=None, token_ids=token_ids) assert allocator.get_prefix_cache_hit_rate() > 0.99 # Test case for marking cache hit blocks as computed right after @@ -721,8 +736,7 @@ def test_metric(): def test_touch_block(): block_size = 16 common_blocks = 4 - allocator = PrefixCachingBlockAllocator(num_blocks=8, - block_size=block_size) + allocator = PrefixCachingBlockAllocator(num_blocks=8, block_size=block_size) common_token_ids = list(range(block_size * common_blocks)) @@ -737,13 +751,13 @@ def test_touch_block(): block_hashes = [block.content_hash for block in blocks] # The allocated blocks should be marked as touched # but not computed. - computed_block_ids = allocator.find_cached_blocks_prefix( - block_hashes) + computed_block_ids = allocator.find_cached_blocks_prefix(block_hashes) assert len(computed_block_ids) == 0 allocator.mark_blocks_as_computed([]) computed_block_ids = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes) + block_hashes=block_hashes + ) assert len(computed_block_ids) == common_blocks @staticmethod @@ -754,11 +768,12 @@ def test_find_cached_blocks_prefix(): block_size = 4 num_blocks = 8 total_test_blocks = 12 - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) token_ids = list(range(total_test_blocks * block_size)) - block_tokens_seq1 = token_ids[:num_blocks * block_size] + block_tokens_seq1 = token_ids[: num_blocks * block_size] blocks_seq1 = TestPrefixCachingBlockAllocator.create_immutable_chain( block_size=block_size, token_ids=block_tokens_seq1, @@ -769,7 +784,8 @@ def test_find_cached_blocks_prefix(): # All blocks should be cached. cached_blocks_seq1 = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq1) + block_hashes=block_hashes_seq1 + ) assert len(cached_blocks_seq1) == num_blocks # Free the first sequence. 
@@ -778,10 +794,11 @@ def test_find_cached_blocks_prefix(): # All blocks should be still be cached if not required to be allocated. cached_blocks = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq1) + block_hashes=block_hashes_seq1 + ) assert len(cached_blocks) == num_blocks - block_tokens_seq2 = token_ids[num_blocks * block_size:] + block_tokens_seq2 = token_ids[num_blocks * block_size :] blocks_seq2 = TestPrefixCachingBlockAllocator.create_immutable_chain( block_size=block_size, token_ids=block_tokens_seq2, @@ -790,13 +807,15 @@ def test_find_cached_blocks_prefix(): block_hashes_seq2 = [block.content_hash for block in blocks_seq2] allocator.mark_blocks_as_computed([]) cached_blocks = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq2) + block_hashes=block_hashes_seq2 + ) assert len(cached_blocks) == len(blocks_seq2) # Half of the blocks from seq1 should still be cached. num_evicted_blocks = len(blocks_seq2) cached_blocks = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq1) + block_hashes=block_hashes_seq1 + ) assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks # Test reset prefix cache @@ -806,8 +825,9 @@ def test_find_cached_blocks_prefix(): def test_reset_prefix_cache(num_blocks: int, block_size: int): """This test case simulates the case of resetting the prefix cache.""" - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) + allocator = PrefixCachingBlockAllocator( + num_blocks=num_blocks, block_size=block_size + ) token_ids = list(range(3 * block_size)) first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( @@ -844,8 +864,7 @@ def create_immutable_chain( allocator: PrefixCachingBlockAllocator, extra_hash: Optional[int] = None, ) -> list[PrefixCachingBlock]: - """Helper method which creates a chain of blocks. - """ + """Helper method which creates a chain of blocks.""" blocks: list[Block] = [] num_blocks = math.ceil(len(token_ids) / block_size) @@ -854,20 +873,18 @@ def create_immutable_chain( prev_block = None for block_number in range(0, num_blocks): - block_token_ids = token_ids[block_number * - block_size:(block_number + 1) * - block_size] + block_token_ids = token_ids[ + block_number * block_size : (block_number + 1) * block_size + ] prev_block = allocator.allocate_immutable_block( - prev_block=prev_block, - token_ids=block_token_ids, - extra_hash=extra_hash) + prev_block=prev_block, token_ids=block_token_ids, extra_hash=extra_hash + ) blocks.append(prev_block) return blocks class TestComputedBlocksTracker: - @staticmethod def _get_mock_allocator(): return MagicMock(spec=PrefixCachingBlockAllocator) @@ -898,9 +915,9 @@ def test_get_num_cached_tokens(): # Not yet allocated. tokens = [0, 1, 2, 3, 4, 5] - seq1 = create_dummy_sequence(request_id=0, - token_ids=tokens, - block_size=block_size) + seq1 = create_dummy_sequence( + request_id=0, token_ids=tokens, block_size=block_size + ) mock_allocator.find_cached_blocks_prefix.return_value = [] assert tracker.get_num_cached_tokens(seq1) == 0 @@ -934,11 +951,10 @@ def test_get_num_cached_tokens(): tracker.remove_seq(seq1.seq_id) # Re-create the sequence with the same request id to simulate recompute. 
- seq1 = create_dummy_sequence(request_id=0, - token_ids=tokens, - block_size=block_size) - mock_allocator.find_cached_blocks_prefix.return_value = [ - ] # no cached block + seq1 = create_dummy_sequence( + request_id=0, token_ids=tokens, block_size=block_size + ) + mock_allocator.find_cached_blocks_prefix.return_value = [] # no cached block assert tracker.get_num_cached_tokens(seq1) == 0 @staticmethod @@ -964,9 +980,9 @@ def test_correct_block_hash(): ) tokens = list(range(block_size * 4)) # 4 blocks. - seq = create_dummy_sequence(request_id=0, - token_ids=tokens, - block_size=block_size) + seq = create_dummy_sequence( + request_id=0, token_ids=tokens, block_size=block_size + ) _ = TestPrefixCachingBlockAllocator.create_immutable_chain( block_size=block_size, token_ids=tokens, @@ -1001,10 +1017,9 @@ def test_correct_extra_hash(): tokens = list(range(block_size * 4)) # Create a dummy LoRA sequence with a specific LoRA ID. - lora_seq = create_dummy_lora_sequence(request_id=0, - token_ids=tokens, - block_size=block_size, - lora_int_id=1) + lora_seq = create_dummy_lora_sequence( + request_id=0, token_ids=tokens, block_size=block_size, lora_int_id=1 + ) _ = TestPrefixCachingBlockAllocator.create_immutable_chain( block_size=block_size, @@ -1017,14 +1032,13 @@ def test_correct_extra_hash(): # Create different dummy sequences that have the same token IDs # but different LoRA IDs. - seq = create_dummy_sequence(request_id=1, - token_ids=tokens, - block_size=block_size) - - different_lora_seq = create_dummy_lora_sequence(request_id=2, - token_ids=tokens, - block_size=block_size, - lora_int_id=2) + seq = create_dummy_sequence( + request_id=1, token_ids=tokens, block_size=block_size + ) + + different_lora_seq = create_dummy_lora_sequence( + request_id=2, token_ids=tokens, block_size=block_size, lora_int_id=2 + ) # Due to the different LoRA IDs, corresponding blocks are not cached. assert tracker.get_num_cached_tokens(seq) == 0 diff --git a/tests/core/conftest.py b/tests/core/conftest.py index 375b248ebeda..a6a8b33e19d3 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -9,4 +9,4 @@ def use_v0_only(monkeypatch): Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the module. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index d4dacc4f1296..a5fb2f966248 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -37,11 +37,13 @@ def test_simple(): num_seq_group = 4 max_model_len = 16 max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig("generate", - max_num_batched_tokens, - num_seq_group, - max_model_len, - enable_chunked_prefill=True) + scheduler_config = SchedulerConfig( + "generate", + max_num_batched_tokens, + num_seq_group, + max_model_len, + enable_chunked_prefill=True, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 @@ -50,9 +52,9 @@ def test_simple(): # Add seq groups to scheduler. 
for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=block_size, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=block_size, block_size=block_size + ) scheduler.add_seq_group(seq_group) running.append(seq_group) @@ -61,8 +63,11 @@ def test_simple(): seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert set(get_sequence_groups(out)) == set(running) assert out.num_batched_tokens == num_tokens - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) + assert ( + not out.blocks_to_copy + and not out.blocks_to_swap_in + and not out.blocks_to_swap_out + ) assert len(seq_group_meta) == num_seq_group for s in running: append_new_token(s, 1) @@ -71,8 +76,11 @@ def test_simple(): seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert set(get_sequence_groups(out)) == set(running) assert out.num_batched_tokens == num_seq_group - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) + assert ( + not out.blocks_to_copy + and not out.blocks_to_swap_in + and not out.blocks_to_swap_out + ) assert len(seq_group_meta) == num_seq_group @@ -97,9 +105,9 @@ def test_chunk(): # Add seq groups to scheduler. for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler.add_seq_group(seq_group) running.append(seq_group) @@ -127,7 +135,7 @@ def test_chunk(): def test_concurrent_chunking(): - """Verify prefills are chunked properly when + """Verify prefills are chunked properly when --max-num-partial-prefills is > 1""" block_size = 4 max_seqs = 60 @@ -149,9 +157,9 @@ def test_concurrent_chunking(): # Add seq groups to scheduler. for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler.add_seq_group(seq_group) running.append(seq_group) @@ -196,7 +204,8 @@ def test_concurrent_chunking_large_requests(): _, seq_group = create_dummy_prompt( str(i), prompt_length=1200, # Very large prompt - block_size=block_size) + block_size=block_size, + ) scheduler.add_seq_group(seq_group) # Verify only a single request is chunked, and it gets all 64 tokens @@ -208,7 +217,7 @@ def test_concurrent_chunking_large_requests(): def test_short_prompts_jump_long_prompts_in_queue(): - """Verify large prefill requests are punted behind smaller ones if + """Verify large prefill requests are punted behind smaller ones if another large prefill request is already running""" block_size = 4 max_seqs = 60 @@ -234,7 +243,8 @@ def test_short_prompts_jump_long_prompts_in_queue(): _, seq_group = create_dummy_prompt( str(i), prompt_length=1200, # Very large prompt - block_size=block_size) + block_size=block_size, + ) scheduler.add_seq_group(seq_group) long_seqs.append(seq_group) assert seq_group.is_prefill() @@ -244,7 +254,8 @@ def test_short_prompts_jump_long_prompts_in_queue(): _, seq_group = create_dummy_prompt( str(i + 2), prompt_length=40, # Very small prompt - block_size=block_size) + block_size=block_size, + ) scheduler.add_seq_group(seq_group) short_seqs.append(seq_group) assert seq_group.is_prefill() @@ -372,9 +383,9 @@ def test_complex(): # Add seq groups to scheduler. 
for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler.add_seq_group(seq_group) running.append(seq_group) assert seq_group.is_prefill() @@ -395,9 +406,9 @@ def test_complex(): # Add 2 more requests. for i in range(2, 4): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler.add_seq_group(seq_group) running.append(seq_group) @@ -443,9 +454,9 @@ def test_maximal_decoding(): # Add seq groups to scheduler. for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=2, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=2, block_size=block_size + ) scheduler.add_seq_group(seq_group) running.append(seq_group) assert seq_group.is_prefill() @@ -462,9 +473,7 @@ def test_maximal_decoding(): append_new_token(running[0], 1) # Create one more seq_group. - _, seq_group = create_dummy_prompt("3", - prompt_length=2, - block_size=block_size) + _, seq_group = create_dummy_prompt("3", prompt_length=2, block_size=block_size) scheduler.add_seq_group(seq_group) running.append(seq_group) assert seq_group.is_prefill() @@ -535,9 +544,7 @@ def test_prompt_limit(): scheduler = Scheduler(scheduler_config, cache_config, None) running: list[SequenceGroup] = [] - _, seq_group = create_dummy_prompt("1", - prompt_length=48, - block_size=block_size) + _, seq_group = create_dummy_prompt("1", prompt_length=48, block_size=block_size) scheduler.add_seq_group(seq_group) running.append(seq_group) assert seq_group.is_prefill() @@ -556,19 +563,19 @@ def test_prompt_limit_exceed(): max_seqs = 64 max_model_len = 32 max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig("generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True) + scheduler_config = SchedulerConfig( + "generate", + max_num_batched_tokens, + max_seqs, + max_model_len, + enable_chunked_prefill=True, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 cache_config.num_gpu_blocks = 16 scheduler = Scheduler(scheduler_config, cache_config, None) running: list[SequenceGroup] = [] - _, seq_group = create_dummy_prompt("2", - prompt_length=48, - block_size=block_size) + _, seq_group = create_dummy_prompt("2", prompt_length=48, block_size=block_size) scheduler.add_seq_group(seq_group) running.append(seq_group) assert seq_group.is_prefill() @@ -595,9 +602,7 @@ def test_chunked_prefill_preempt(): cache_config.num_gpu_blocks = 16 scheduler = Scheduler(scheduler_config, cache_config, None) - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt("1", prompt_length=60, block_size=block_size) scheduler.add_seq_group(seq_group) _, out = schedule_and_update_computed_tokens(scheduler) # The request is chunked. @@ -613,8 +618,7 @@ def test_chunked_prefill_preempt(): def cannot_append_second_group1(seq_group, num_lookahead_slots): return seq_group.request_id != "1" - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group1) + scheduler.block_manager.can_append_slots.side_effect = cannot_append_second_group1 # The running prefill is now preempted. 
_, out = schedule_and_update_computed_tokens(scheduler) @@ -635,8 +639,7 @@ def cannot_append_second_group1(seq_group, num_lookahead_slots): def cannot_append_second_group2(seq_group, num_lookahead_slots): return True - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group2) + scheduler.block_manager.can_append_slots.side_effect = cannot_append_second_group2 _, out = schedule_and_update_computed_tokens(scheduler) assert len(out.scheduled_seq_groups) == 1 assert out.num_prefill_groups == 1 @@ -668,9 +671,7 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps): cache_config.num_gpu_blocks = 16 scheduler = Scheduler(scheduler_config, cache_config, None) - _, seq_group = create_dummy_prompt("1", - prompt_length=30, - block_size=block_size) + _, seq_group = create_dummy_prompt("1", prompt_length=30, block_size=block_size) scheduler.add_seq_group(seq_group) _, out = schedule_and_update_computed_tokens(scheduler) # The request is chunked. @@ -679,8 +680,9 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps): assert out.num_prefill_groups == 1 assert out.num_batched_tokens == max_num_batched_tokens print(out.num_lookahead_slots) - assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else - num_lookahead_slots) + assert out.num_lookahead_slots == ( + 0 if (num_scheduler_steps == 1) else num_lookahead_slots + ) def test_chunked_prefill_max_seqs(): @@ -701,9 +703,7 @@ def test_chunked_prefill_max_seqs(): scheduler = Scheduler(scheduler_config, cache_config, None) running: list[SequenceGroup] = [] - _, seq_group = create_dummy_prompt("1", - prompt_length=65, - block_size=block_size) + _, seq_group = create_dummy_prompt("1", prompt_length=65, block_size=block_size) scheduler.add_seq_group(seq_group) running.append(seq_group) # The first prefill is chunked. @@ -713,9 +713,9 @@ def test_chunked_prefill_max_seqs(): # Add new requests. for i in range(4): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=65, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=65, block_size=block_size + ) scheduler.add_seq_group(seq_group) running.append(seq_group) @@ -750,11 +750,7 @@ def test_prefix_caching(): max_model_len, enable_chunked_prefill=True, ) - cache_config = CacheConfig(block_size, - 1.0, - 1, - "auto", - enable_prefix_caching=True) + cache_config = CacheConfig(block_size, 1.0, 1, "auto", enable_prefix_caching=True) cache_config.num_cpu_blocks = 0 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) @@ -762,9 +758,9 @@ def test_prefix_caching(): # Add seq groups to scheduler. 
for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - block_size=block_size, - prompt_length=50) + _, seq_group = create_dummy_prompt( + str(i), block_size=block_size, prompt_length=50 + ) scheduler.add_seq_group(seq_group) running.append(seq_group) @@ -780,23 +776,21 @@ def test_prefix_caching(): def test_prefix_caching_with_concurrent_partial_prefills(): - """Verify allocating full blocks when prefix caching is enabled with + """Verify allocating full blocks when prefix caching is enabled with --max-num-partial-prefills > 1.""" block_size = 4 max_seqs = 10 max_model_len = 8000 max_num_batched_tokens = 60 # With two slots, each slot will get 30 tokens - scheduler_config = SchedulerConfig("generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - max_num_partial_prefills=2) - cache_config = CacheConfig(block_size, - 1.0, - 1, - "auto", - enable_prefix_caching=True) + scheduler_config = SchedulerConfig( + "generate", + max_num_batched_tokens, + max_seqs, + max_model_len, + enable_chunked_prefill=True, + max_num_partial_prefills=2, + ) + cache_config = CacheConfig(block_size, 1.0, 1, "auto", enable_prefix_caching=True) cache_config.num_cpu_blocks = 0 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) @@ -804,9 +798,9 @@ def test_prefix_caching_with_concurrent_partial_prefills(): # Add seq groups to scheduler. for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - block_size=block_size, - prompt_length=50) + _, seq_group = create_dummy_prompt( + str(i), block_size=block_size, prompt_length=50 + ) scheduler.add_seq_group(seq_group) running.append(seq_group) @@ -833,9 +827,8 @@ def test_prefix_caching_with_concurrent_partial_prefills(): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8]) -def test_chunked_prefill_with_actual_engine(model: str, - max_num_partial_prefills: int): - """Make sure the model can actually sample with concurrent +def test_chunked_prefill_with_actual_engine(model: str, max_num_partial_prefills: int): + """Make sure the model can actually sample with concurrent partial prefills """ diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index 1b958e34df87..8616432905f7 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -20,29 +20,30 @@ def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup): @pytest.mark.parametrize("num_scheduler_steps", [1, 8]) @pytest.mark.parametrize("enable_chunked_prefill", [False, True]) @pytest.mark.parametrize("enforce_eager", [False, True]) -def test_num_computed_tokens_update(num_scheduler_steps: int, - enable_chunked_prefill: bool, - enforce_eager: bool): - +def test_num_computed_tokens_update( + num_scheduler_steps: int, enable_chunked_prefill: bool, enforce_eager: bool +): is_multi_step = num_scheduler_steps > 1 is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill if is_multi_step_chunked_prefill and current_platform.is_rocm(): - pytest.skip("Multi-step with Chunked-Prefill does not support " - "rocm_flash_attn backend") + pytest.skip( + "Multi-step with Chunked-Prefill does not support rocm_flash_attn backend" + ) # Make a vllm engine - runner = VllmRunner(model_name=MODEL, - gpu_memory_utilization=0.7, - num_scheduler_steps=num_scheduler_steps, - enable_chunked_prefill=enable_chunked_prefill, - 
enforce_eager=enforce_eager) + runner = VllmRunner( + model_name=MODEL, + gpu_memory_utilization=0.7, + num_scheduler_steps=num_scheduler_steps, + enable_chunked_prefill=enable_chunked_prefill, + enforce_eager=enforce_eager, + ) engine: LLMEngine = runner.model.llm_engine # In multi-step + chunked-prefill there is no separate single prompt step. # What is scheduled will run for num_scheduler_steps always. - num_prompt_steps = num_scheduler_steps \ - if is_multi_step_chunked_prefill else 1 + num_prompt_steps = num_scheduler_steps if is_multi_step_chunked_prefill else 1 num_output_tokens_list = [4, 8, 12, 15, 16, 17] @@ -50,10 +51,12 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, prompt_len = 10 for req_idx, num_output_tokens in enumerate(num_output_tokens_list): - seq, seq_group = create_dummy_prompt(request_id=str(req_idx), - prompt_length=prompt_len, - min_tokens=num_output_tokens, - max_tokens=num_output_tokens) + seq, seq_group = create_dummy_prompt( + request_id=str(req_idx), + prompt_length=prompt_len, + min_tokens=num_output_tokens, + max_tokens=num_output_tokens, + ) add_seq_group_to_engine(engine, seq_group) assert seq.data.get_num_computed_tokens() == 0 @@ -65,19 +68,19 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, if not seq.is_finished(): prompt_num_computed_tokens = seq.data.get_num_computed_tokens() # Test correctness of num_computed_tokens after the prompt steps - assert prompt_num_computed_tokens == \ - prompt_len + num_prompt_steps - 1 + assert prompt_num_computed_tokens == prompt_len + num_prompt_steps - 1 decode_step_counter = 0 while not seq.is_finished(): # Test correctness of num_computed_tokens after the decode steps - assert seq.data.get_num_computed_tokens( - ) == prompt_num_computed_tokens + decode_step_counter + assert ( + seq.data.get_num_computed_tokens() + == prompt_num_computed_tokens + decode_step_counter + ) for _ in range(num_scheduler_steps): # decode step engine.step() decode_step_counter += 1 # Test correctness of num_computed_tokens after the sequence finish. - assert seq.data.get_num_computed_tokens( - ) == prompt_len + num_output_tokens - 1 + assert seq.data.get_num_computed_tokens() == prompt_len + num_output_tokens - 1 diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 591e1780c11c..731d1be00dfd 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -16,9 +16,14 @@ from vllm.lora.request import LoRARequest from vllm.sequence import SequenceGroup, SequenceStatus -from .utils import (append_new_token, append_new_token_seq, - append_new_token_seq_group, create_dummy_prompt, - get_sequence_groups, schedule_and_update_computed_tokens) +from .utils import ( + append_new_token, + append_new_token_seq, + append_new_token_seq_group, + create_dummy_prompt, + get_sequence_groups, + schedule_and_update_computed_tokens, +) def test_scheduler_add_seq_group(): @@ -37,9 +42,7 @@ def test_scheduler_add_seq_group(): # Add seq group to scheduler. num_seq_group = 4 for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), - block_size, - block_size=block_size) + _, seq_group = create_dummy_prompt(str(i), block_size, block_size=block_size) scheduler.add_seq_group(seq_group) assert scheduler.get_num_unfinished_seq_groups() == i + 1 @@ -89,9 +92,9 @@ def test_scheduler_schedule_simple(): # Add seq groups to scheduler. 
for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=block_size, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=block_size, block_size=block_size + ) scheduler.add_seq_group(seq_group) running.append(seq_group) @@ -100,8 +103,11 @@ def test_scheduler_schedule_simple(): seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert set(get_sequence_groups(out)) == set(running) assert out.num_batched_tokens == num_tokens - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) + assert ( + not out.blocks_to_copy + and not out.blocks_to_swap_in + and not out.blocks_to_swap_out + ) assert len(seq_group_meta) == num_seq_group append_new_token(out, 1) @@ -109,8 +115,11 @@ def test_scheduler_schedule_simple(): seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert set(get_sequence_groups(out)) == set(running) assert out.num_batched_tokens == num_seq_group - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) + assert ( + not out.blocks_to_copy + and not out.blocks_to_swap_in + and not out.blocks_to_swap_out + ) assert len(seq_group_meta) == num_seq_group append_new_token(out, 1) @@ -164,12 +173,8 @@ def test_scheduler_schedule_preempt_abort(): scheduler = Scheduler(scheduler_config, cache_config, None) # Add seq groups to scheduler. - seq_a, seq_group_a = create_dummy_prompt("1", - block_size, - block_size=block_size) - seq_b, seq_group_b = create_dummy_prompt("2", - block_size, - block_size=block_size) + seq_a, seq_group_a = create_dummy_prompt("1", block_size, block_size=block_size) + seq_b, seq_group_b = create_dummy_prompt("2", block_size, block_size=block_size) scheduler.add_seq_group(seq_group_a) scheduler.add_seq_group(seq_group_b) @@ -177,8 +182,11 @@ def test_scheduler_schedule_preempt_abort(): seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert get_sequence_groups(out) == [seq_group_a, seq_group_b] assert out.num_batched_tokens == block_size * 2 # seq_a and seq_b - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) + assert ( + not out.blocks_to_copy + and not out.blocks_to_swap_in + and not out.blocks_to_swap_out + ) assert len(seq_group_meta) == 2 assert scheduler.get_num_unfinished_seq_groups() == 2 @@ -190,8 +198,11 @@ def test_scheduler_schedule_preempt_abort(): seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert get_sequence_groups(out) == [seq_group_a] assert out.num_batched_tokens == 1 - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) + assert ( + not out.blocks_to_copy + and not out.blocks_to_swap_in + and not out.blocks_to_swap_out + ) assert len(seq_group_meta) == 1 assert scheduler.get_num_unfinished_seq_groups() == 2 assert out.preempted == 1 @@ -201,8 +212,11 @@ def test_scheduler_schedule_preempt_abort(): seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert get_sequence_groups(out) == [seq_group_b] assert out.num_batched_tokens == 5 # 4 prompt + 1 generation. 
- assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) + assert ( + not out.blocks_to_copy + and not out.blocks_to_swap_in + and not out.blocks_to_swap_out + ) assert len(seq_group_meta) == 1 assert scheduler.get_num_unfinished_seq_groups() == 1 @@ -226,9 +240,9 @@ def test_scheduler_max_seqs(): all_seq_groups: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=block_size, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=block_size, block_size=block_size + ) all_seq_groups.append(seq_group) # Append 1 seq group @@ -270,33 +284,33 @@ def test_scheduler_delay_factor(): scheduler = Scheduler(scheduler_config, cache_config, None) # schedule first prompt - seq_group_meta, seq_group = create_dummy_prompt("0", - prompt_length=block_size, - block_size=block_size) + seq_group_meta, seq_group = create_dummy_prompt( + "0", prompt_length=block_size, block_size=block_size + ) scheduler.add_seq_group(seq_group) seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert out.num_prefill_groups > 0 - assert seq_group_meta[0].request_id == '0' + assert seq_group_meta[0].request_id == "0" append_new_token(out, 1) # wait for a second before scheduling next prompt time.sleep(1) - seq_group_meta, seq_group = create_dummy_prompt("1", - prompt_length=block_size, - block_size=block_size) + seq_group_meta, seq_group = create_dummy_prompt( + "1", prompt_length=block_size, block_size=block_size + ) scheduler.add_seq_group(seq_group) # second prompt should *not* be scheduled seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert out.num_prefill_groups == 0 - assert seq_group_meta[0].request_id == '0' + assert seq_group_meta[0].request_id == "0" append_new_token(out, 1) # wait for more than 0.5 second and try again time.sleep(0.6) seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) assert out.num_prefill_groups > 0 - assert seq_group_meta[0].request_id == '1' + assert seq_group_meta[0].request_id == "1" append_new_token(out, 1) @@ -333,20 +347,20 @@ def initialize_scheduler( return scheduler -def create_token_budget(token_budget: int = 10000, - max_num_seqs: int = 10000) -> SchedulingBudget: +def create_token_budget( + token_budget: int = 10000, max_num_seqs: int = 10000 +) -> SchedulingBudget: return SchedulingBudget( token_budget=token_budget, max_num_seqs=max_num_seqs, ) -def add_token_budget(budget: SchedulingBudget, - num_batched_tokens: int = 0, - num_curr_seqs: int = 0): - mock_seq_group = create_dummy_prompt('10', prompt_length=60)[1] - budget.add_num_batched_tokens(mock_seq_group.request_id, - num_batched_tokens) +def add_token_budget( + budget: SchedulingBudget, num_batched_tokens: int = 0, num_curr_seqs: int = 0 +): + mock_seq_group = create_dummy_prompt("10", prompt_length=60)[1] + budget.add_num_batched_tokens(mock_seq_group.request_id, num_batched_tokens) budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs) @@ -356,9 +370,7 @@ def test_prefill_schedule_max_prompt_len(): """ block_size = 4 scheduler = initialize_scheduler(max_model_len=30, block_size=block_size) - _, seq_group = create_dummy_prompt("0", - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt("0", prompt_length=60, block_size=block_size) scheduler.add_seq_group(seq_group) budget = create_token_budget() output = scheduler._schedule_prefills(budget, None) @@ -375,14 +387,14 @@ def 
test_prefill_schedule_token_budget(): Test token budget respected. """ block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) + scheduler = initialize_scheduler( + block_size=block_size, num_cpu_blocks=64, num_gpu_blocks=64 + ) budget = create_token_budget(token_budget=0) for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler.add_seq_group(seq_group) # 0 token budget == nothing is scheduled. @@ -405,14 +417,12 @@ def test_prefill_schedule_token_budget(): assert len(remaining_waiting) == 1 # Test when current_batched_tokens respected. - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16) + scheduler = initialize_scheduler( + block_size=block_size, num_cpu_blocks=16, num_gpu_blocks=16 + ) budget = create_token_budget(token_budget=60) add_token_budget(budget, 30, 0) - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt(str(i), prompt_length=60, block_size=block_size) # Cannot schedule a prompt that doesn't fit the budget. scheduler.add_seq_group(seq_group) output = scheduler._schedule_prefills(budget, None) @@ -437,14 +447,14 @@ def test_prefill_schedule_max_seqs(): Test max seq respected. """ block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) + scheduler = initialize_scheduler( + block_size=block_size, num_cpu_blocks=64, num_gpu_blocks=64 + ) budget = create_token_budget(max_num_seqs=2) for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler.add_seq_group(seq_group) output = scheduler._schedule_prefills(budget, None) remaining_waiting = scheduler.waiting @@ -458,9 +468,7 @@ def test_prefill_schedule_max_seqs(): scheduler.waiting = deque() budget = create_token_budget(max_num_seqs=2) add_token_budget(budget, 0, 2) - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt(str(i), prompt_length=60, block_size=block_size) scheduler.add_seq_group(seq_group) output = scheduler._schedule_prefills(budget, None) remaining_waiting = scheduler.waiting @@ -477,20 +485,23 @@ def test_prefill_schedule_max_lora(): """ block_size = 4 lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) - scheduler = initialize_scheduler(lora_config=lora_config, - block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) + scheduler = initialize_scheduler( + lora_config=lora_config, + block_size=block_size, + num_cpu_blocks=64, + num_gpu_blocks=64, + ) budget = create_token_budget(token_budget=120) curr_loras: set[int] = set() for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size, - lora_request=LoRARequest( - lora_name=str(i), - lora_int_id=i + 1, - lora_path="abc")) + _, seq_group = create_dummy_prompt( + str(i), + prompt_length=60, + block_size=block_size, + lora_request=LoRARequest( + lora_name=str(i), lora_int_id=i + 1, lora_path="abc" + ), + ) scheduler.add_seq_group(seq_group) # Add two more requests to verify lora is prioritized. 
# 0: LoRA, 1: LoRA, 2: regular, 3: regular @@ -498,9 +509,9 @@ def test_prefill_schedule_max_lora(): # If a request is not scheduled because it hits max lora, it is # prioritized. Verify that. for i in range(2, 4): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler.add_seq_group(seq_group) # Schedule 2 requests (0 and 2) output = scheduler._schedule_prefills(budget, curr_loras) @@ -529,14 +540,14 @@ def test_prefill_schedule_no_block_manager_capacity(): Test sequence cannot be scheduled due to block manager has no capacity. """ block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_gpu_blocks=128, - num_cpu_blocks=128) + scheduler = initialize_scheduler( + block_size=block_size, num_gpu_blocks=128, num_cpu_blocks=128 + ) budget = create_token_budget() for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler.add_seq_group(seq_group) scheduler.block_manager.can_allocate = MagicMock() scheduler.block_manager.can_allocate.return_value = AllocStatus.LATER @@ -551,9 +562,9 @@ def test_prefill_schedule_no_block_manager_capacity(): scheduler = initialize_scheduler() budget = create_token_budget() for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler.add_seq_group(seq_group) scheduler.block_manager.can_allocate = MagicMock() scheduler.block_manager.can_allocate.return_value = AllocStatus.NEVER @@ -571,14 +582,14 @@ def test_decode_schedule_preempted(): Test decodes cannot be scheduled and preempted. """ block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) + scheduler = initialize_scheduler( + block_size=block_size, num_cpu_blocks=64, num_gpu_blocks=64 + ) curr_loras = None for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) scheduler._add_seq_group_to_running(seq_group) @@ -587,8 +598,7 @@ def test_decode_schedule_preempted(): def cannot_append_second_group(seq_group, num_lookahead_slots): return seq_group.request_id != "1" - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) + scheduler.block_manager.can_append_slots.side_effect = cannot_append_second_group # 1 cannot be scheduled, and the lowest priority (request 2) # should be preempted. 1 will also be preempted. @@ -615,12 +625,8 @@ def test_schedule_decode_blocks_to_copy_update(): Verify blocks_to_copy is updated. 
""" block_size = 4 - scheduler = initialize_scheduler(block_size=4, - num_cpu_blocks=16, - num_gpu_blocks=16) - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - block_size=block_size) + scheduler = initialize_scheduler(block_size=4, num_cpu_blocks=16, num_gpu_blocks=16) + _, seq_group = create_dummy_prompt("1", prompt_length=60, block_size=block_size) curr_loras = None scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) @@ -648,20 +654,23 @@ def test_schedule_decode_blocks_to_copy_update(): def test_schedule_swapped_max_loras(): block_size = 4 lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) - scheduler = initialize_scheduler(lora_config=lora_config, - block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) + scheduler = initialize_scheduler( + lora_config=lora_config, + block_size=block_size, + num_cpu_blocks=32, + num_gpu_blocks=32, + ) curr_loras: set[int] = set() blocks_to_swap_out: list[tuple[int, int]] = [] for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size, - lora_request=LoRARequest( - lora_name=str(i), - lora_int_id=i + 1, - lora_path="abc")) + _, seq_group = create_dummy_prompt( + str(i), + prompt_length=60, + block_size=block_size, + lora_request=LoRARequest( + lora_name=str(i), lora_int_id=i + 1, lora_path="abc" + ), + ) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) scheduler._swap_out(seq_group, blocks_to_swap_out) @@ -680,15 +689,15 @@ def test_schedule_swapped_max_loras(): def test_schedule_swapped_cannot_swap_in(): block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) + scheduler = initialize_scheduler( + block_size=block_size, num_cpu_blocks=32, num_gpu_blocks=32 + ) curr_loras = None blocks_to_swap_out: list[tuple[int, int]] = [] for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) scheduler._swap_out(seq_group, blocks_to_swap_out) @@ -710,15 +719,15 @@ def test_schedule_swapped_cannot_swap_in(): def test_infeasible_swap(): block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) + scheduler = initialize_scheduler( + block_size=block_size, num_cpu_blocks=32, num_gpu_blocks=32 + ) curr_loras = None blocks_to_swap_out: list[tuple[int, int]] = [] for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt( + str(i), prompt_length=60, block_size=block_size + ) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) scheduler._swap_out(seq_group, blocks_to_swap_out) @@ -741,13 +750,11 @@ def test_infeasible_swap(): def test_schedule_swapped_blocks_to_copy(): block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) + scheduler = initialize_scheduler( + block_size=block_size, num_cpu_blocks=32, num_gpu_blocks=32 + ) curr_loras = None - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - block_size=block_size) + _, seq_group = create_dummy_prompt("1", prompt_length=60, block_size=block_size) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, 
seq_group, 1) blocks_to_swap_out: list[tuple[int, int]] = [] @@ -840,26 +847,30 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching): seqA_tokens = list(range(8)) num_shared_tokens = 4 - seqB_tokens = seqA_tokens[:num_shared_tokens] + list(range( - 12, 16)) # Shared prefix first 4. - seqC_tokens = seqA_tokens[:num_shared_tokens] + list(range( - 16, 20)) # Shared prefix first 4. - - seqA, seqA_group = create_dummy_prompt("0", - prompt_tokens=seqA_tokens, - block_size=block_size) - seqB, seqB_group = create_dummy_prompt("1", - prompt_tokens=seqB_tokens, - block_size=block_size) - seqC, seqC_group = create_dummy_prompt("2", - prompt_tokens=seqC_tokens, - block_size=block_size) + seqB_tokens = seqA_tokens[:num_shared_tokens] + list( + range(12, 16) + ) # Shared prefix first 4. + seqC_tokens = seqA_tokens[:num_shared_tokens] + list( + range(16, 20) + ) # Shared prefix first 4. + + seqA, seqA_group = create_dummy_prompt( + "0", prompt_tokens=seqA_tokens, block_size=block_size + ) + seqB, seqB_group = create_dummy_prompt( + "1", prompt_tokens=seqB_tokens, block_size=block_size + ) + seqC, seqC_group = create_dummy_prompt( + "2", prompt_tokens=seqC_tokens, block_size=block_size + ) # Schedule seqA prefill. scheduler.add_seq_group(seqA_group) metas, out, _ = scheduler.schedule() - assert (len(out.scheduled_seq_groups) == 1 - and out.scheduled_seq_groups[0].seq_group == seqA_group) + assert ( + len(out.scheduled_seq_groups) == 1 + and out.scheduled_seq_groups[0].seq_group == seqA_group + ) assert out.scheduled_seq_groups[0].token_chunk_size == len(seqA_tokens) # Schedule seqA decode. @@ -877,15 +888,18 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching): if enable_prefix_caching: assert len(out.scheduled_seq_groups) == 2 - assert set([ - out.scheduled_seq_groups[0].seq_group, - out.scheduled_seq_groups[1].seq_group, - ]) == set([seqB_group, seqC_group]) + assert set( + [ + out.scheduled_seq_groups[0].seq_group, + out.scheduled_seq_groups[1].seq_group, + ] + ) == set([seqB_group, seqC_group]) assert len(metas) == 2 for meta in metas: assert meta.token_chunk_size == 8 - assert (len(meta.computed_block_nums) == num_shared_tokens // - block_size) # 1 Block for the 8 tokens. + assert ( + len(meta.computed_block_nums) == num_shared_tokens // block_size + ) # 1 Block for the 8 tokens. else: assert len(out.scheduled_seq_groups) == 1 assert len(metas) == 1 @@ -893,8 +907,7 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching): assert len(metas[0].computed_block_nums) == 0 # No blocks computed. -def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( -): +def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching(): """ This test verifies that we don't schedule new prefills if there's already a continuous prefill in progress even though the new prefills with shared @@ -931,12 +944,12 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( seqC_shared_prefix_len = 4 seqC_tokens = seqA_tokens[:seqC_shared_prefix_len] + list(range(12, 20)) - seqA, seqA_group = create_dummy_prompt("0", - prompt_tokens=seqA_tokens, - block_size=block_size) - seqB, seqB_group = create_dummy_prompt("1", - prompt_tokens=seqB_tokens, - block_size=block_size) + seqA, seqA_group = create_dummy_prompt( + "0", prompt_tokens=seqA_tokens, block_size=block_size + ) + seqB, seqB_group = create_dummy_prompt( + "1", prompt_tokens=seqB_tokens, block_size=block_size + ) # Chunked prefill seqA. 
scheduler.add_seq_group(seqA_group) @@ -955,27 +968,26 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( # both seqB and seqC can now be scheduled with seqA is over. # seqA is in decoding phase. append_new_token_seq(seqA, 999) - seqC, seqC_group = create_dummy_prompt("2", - prompt_tokens=seqC_tokens, - block_size=block_size) + seqC, seqC_group = create_dummy_prompt( + "2", prompt_tokens=seqC_tokens, block_size=block_size + ) scheduler.add_seq_group(seqC_group) metas, out = schedule_and_update_computed_tokens(scheduler) assert len(out.scheduled_seq_groups) == 3 metas = {meta.request_id: meta for meta in metas} assert metas[seqA_group.request_id].token_chunk_size == 1 # Decode - assert (metas[seqB_group.request_id].token_chunk_size == 8 - ) # Fully cached prefill - assert ( - metas[seqC_group.request_id].token_chunk_size == 6 - ), "A partial prefix of C (4 tokens) should be prefilled, with the " + assert metas[seqB_group.request_id].token_chunk_size == 8 # Fully cached prefill + assert metas[seqC_group.request_id].token_chunk_size == 6, ( + "A partial prefix of C (4 tokens) should be prefilled, with the " + ) "remaining tokens fit into 3 token budget (4-1 from the seqA). It will " "then be rounded down to 2 tokens on block size, thus 6 tokens in total." def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds(): """ - Test that the scheduler does not schedule batches with prompt tokens and + Test that the scheduler does not schedule batches with prompt tokens and prompt embeddings co-mingled. """ block_size = 2 @@ -1005,10 +1017,12 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds(): seq_embeds.append(torch.rand(embedding_size)) seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens[i], - prompt_embeds=seq_embeds[i], - block_size=block_size) + create_dummy_prompt( + f"{i}", + prompt_tokens=seq_tokens[i], + prompt_embeds=seq_embeds[i], + block_size=block_size, + ) for i in range(len(seq_tokens)) ] @@ -1017,24 +1031,29 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds(): while not all(seq.is_finished() for seq, _ in seq_and_seq_groups): unfinished_seq_groups = [ - seq_group for _, seq_group in seq_and_seq_groups + seq_group + for _, seq_group in seq_and_seq_groups if not seq_group.is_finished() ] _, out = schedule_and_update_computed_tokens(scheduler) assert len(out.scheduled_seq_groups) > 0 batch_is_prompt_embeds = out.scheduled_seq_groups[ - 0].seq_group.uses_prompt_embeds() + 0 + ].seq_group.uses_prompt_embeds() expected_scheduled_seq_groups = [ - seq_group for seq_group in unfinished_seq_groups + seq_group + for seq_group in unfinished_seq_groups if seq_group.uses_prompt_embeds() == batch_is_prompt_embeds ] # We should have as many scheduled groups as possible, without mixing assert len(out.scheduled_seq_groups) == min( - max_seq_group, len(expected_scheduled_seq_groups)) - assert all(scheduled_seq_group.seq_group.uses_prompt_embeds() == - batch_is_prompt_embeds - for scheduled_seq_group in out.scheduled_seq_groups) + max_seq_group, len(expected_scheduled_seq_groups) + ) + assert all( + scheduled_seq_group.seq_group.uses_prompt_embeds() == batch_is_prompt_embeds + for scheduled_seq_group in out.scheduled_seq_groups + ) # Finish the scheduled groups for scheduled_seq_group in out.scheduled_seq_groups: @@ -1078,9 +1097,9 @@ def test_remove_seq_from_computed_blocks_tracker(): seq_tokens_with_swapped.append([i] * seq_length) seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - 
prompt_tokens=seq_tokens_with_swapped[i], - block_size=block_size) + create_dummy_prompt( + f"{i}", prompt_tokens=seq_tokens_with_swapped[i], block_size=block_size + ) for i in range(len(seq_tokens_with_swapped)) ] @@ -1090,43 +1109,46 @@ def test_remove_seq_from_computed_blocks_tracker(): scheduler._add_seq_group_to_swapped(seq_group) scheduler._schedule_swapped(budget, curr_loras) - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) + seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get( + 1 + ) assert seq_id_to_num_tokens_computed is None # Prefill schedule don't have a space for another LoRA, so # we ignore this request for now. block_size = 4 lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) - scheduler = initialize_scheduler(lora_config=lora_config, - block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64, - enable_prefix_caching=True) + scheduler = initialize_scheduler( + lora_config=lora_config, + block_size=block_size, + num_cpu_blocks=64, + num_gpu_blocks=64, + enable_prefix_caching=True, + ) budget = create_token_budget(token_budget=120) num_seqs = 2 for i in range(num_seqs): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=seq_length, - block_size=block_size, - lora_request=LoRARequest( - lora_name=str(i), - lora_int_id=i + 1, - lora_path="abc")) + _, seq_group = create_dummy_prompt( + str(i), + prompt_length=seq_length, + block_size=block_size, + lora_request=LoRARequest( + lora_name=str(i), lora_int_id=i + 1, lora_path="abc" + ), + ) scheduler.add_seq_group(seq_group) scheduler._schedule_prefills(budget, curr_loras) - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) + seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get( + 1 + ) assert seq_id_to_num_tokens_computed is None # Priority preemption schedule scheduler._schedule_priority_preemption(budget) - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) + seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get( + 1 + ) assert seq_id_to_num_tokens_computed is None # Prefill scheduler does not schedule batches with prompt tokens and @@ -1152,10 +1174,12 @@ def test_remove_seq_from_computed_blocks_tracker(): seq_embeds.append(torch.rand(embedding_size)) seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_with_embedding[i], - prompt_embeds=seq_embeds[i], - block_size=block_size) + create_dummy_prompt( + f"{i}", + prompt_tokens=seq_tokens_with_embedding[i], + prompt_embeds=seq_embeds[i], + block_size=block_size, + ) for i in range(len(seq_tokens_with_embedding)) ] @@ -1163,9 +1187,9 @@ def test_remove_seq_from_computed_blocks_tracker(): scheduler.add_seq_group(seq_group) scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. 
- _seq_id_to_num_tokens_computed.get(1)) + seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get( + 1 + ) assert seq_id_to_num_tokens_computed is None # Prefill scheduler budget num_batched_tokens @@ -1189,9 +1213,9 @@ def test_remove_seq_from_computed_blocks_tracker(): seq_tokens_prefill_budget.append([i] * seq_length) seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_prefill_budget[i], - block_size=block_size) + create_dummy_prompt( + f"{i}", prompt_tokens=seq_tokens_prefill_budget[i], block_size=block_size + ) for i in range(len(seq_tokens_prefill_budget)) ] @@ -1199,9 +1223,9 @@ def test_remove_seq_from_computed_blocks_tracker(): scheduler.add_seq_group(seq_group) scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(2)) + seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get( + 2 + ) assert seq_id_to_num_tokens_computed is None # Budget can not schedule in waiting @@ -1225,9 +1249,11 @@ def test_remove_seq_from_computed_blocks_tracker(): seq_tokens_prefill_budget_waiting.append(list(range(seq_length))) seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_prefill_budget_waiting[i], - block_size=block_size) + create_dummy_prompt( + f"{i}", + prompt_tokens=seq_tokens_prefill_budget_waiting[i], + block_size=block_size, + ) for i in range(len(seq_tokens_prefill_budget_waiting)) ] @@ -1235,9 +1261,9 @@ def test_remove_seq_from_computed_blocks_tracker(): scheduler.add_seq_group(seq_group) scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) + seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get( + 1 + ) assert seq_id_to_num_tokens_computed is None # Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED @@ -1256,16 +1282,16 @@ def test_remove_seq_from_computed_blocks_tracker(): seq_tokens_prompt_limit: list[list[int]] = [] seq_tokens_prompt_limit.append(list(range(seq_length))) seq_and_seq_groups = [ - create_dummy_prompt("0", - prompt_tokens=seq_tokens_prompt_limit[0], - block_size=block_size) + create_dummy_prompt( + "0", prompt_tokens=seq_tokens_prompt_limit[0], block_size=block_size + ) ] for _, seq_group in seq_and_seq_groups: scheduler.add_seq_group(seq_group) scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. 
- _seq_id_to_num_tokens_computed.get(0)) + seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get( + 0 + ) assert seq_id_to_num_tokens_computed is None # Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED @@ -1287,9 +1313,9 @@ def test_remove_seq_from_computed_blocks_tracker(): seq_tokens_never.append(list(range(seq_length))) seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_never[i], - block_size=block_size) + create_dummy_prompt( + f"{i}", prompt_tokens=seq_tokens_never[i], block_size=block_size + ) for i in range(len(seq_tokens_never)) ] @@ -1297,9 +1323,9 @@ def test_remove_seq_from_computed_blocks_tracker(): scheduler.add_seq_group(seq_group) scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(0)) + seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get( + 0 + ) assert seq_id_to_num_tokens_computed is None # Budget can not allocate, AllocStatus is LATER @@ -1321,9 +1347,9 @@ def test_remove_seq_from_computed_blocks_tracker(): seq_tokens_later.append(list(range(seq_length))) seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_later[i], - block_size=block_size) + create_dummy_prompt( + f"{i}", prompt_tokens=seq_tokens_later[i], block_size=block_size + ) for i in range(len(seq_tokens_later)) ] @@ -1331,7 +1357,7 @@ def test_remove_seq_from_computed_blocks_tracker(): scheduler.add_seq_group(seq_group) scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) + seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get( + 1 + ) assert seq_id_to_num_tokens_computed is None diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index 20cc083ec8db..af93fdf4e074 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -7,12 +7,16 @@ from vllm.core.scheduler import Scheduler from vllm.sequence import SequenceGroup -from .utils import (append_new_token, create_dummy_prompt_encoder_decoder, - get_sequence_groups, schedule_and_update_computed_tokens) +from .utils import ( + append_new_token, + create_dummy_prompt_encoder_decoder, + get_sequence_groups, + schedule_and_update_computed_tokens, +) def test_scheduler_schedule_simple_encoder_decoder(): - ''' + """ Test basic scheduler functionality in the context of an encoder/decoder model. 
Focus on testing enc/dec-specific functionality sense tests already @@ -32,7 +36,7 @@ def test_scheduler_schedule_simple_encoder_decoder(): * Abort scheduled seq groups * Assert that aborted seq groups no longer appear in cross-attention block table - ''' + """ block_size = 4 num_seq_group = 4 @@ -55,7 +59,8 @@ def test_scheduler_schedule_simple_encoder_decoder(): req_id = str(i) req_id_list.append(req_id) _, _, seq_group = create_dummy_prompt_encoder_decoder( - req_id, block_size, block_size, block_size) + req_id, block_size, block_size, block_size + ) scheduler.add_seq_group(seq_group) running.append(seq_group) @@ -64,15 +69,22 @@ def test_scheduler_schedule_simple_encoder_decoder(): seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler) # - Verify that sequence group cross-attention block tables are # registered with the block manager - assert all([(req_id in scheduler.block_manager.cross_block_tables) - for req_id in req_id_list]) + assert all( + [ + (req_id in scheduler.block_manager.cross_block_tables) + for req_id in req_id_list + ] + ) # - Validate sequence-group status assert set(get_sequence_groups(out)) == set(running) # - Validate number of batched tokens assert out.num_batched_tokens == num_tokens # - Validate there are no remaining blocks to swap - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) + assert ( + not out.blocks_to_copy + and not out.blocks_to_swap_in + and not out.blocks_to_swap_out + ) # - Validate all seq groups were scheduled assert len(seq_group_meta_list) == num_seq_group append_new_token(out, 1) @@ -81,18 +93,25 @@ def test_scheduler_schedule_simple_encoder_decoder(): seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler) # - Verify that sequence group metadata includes encoder attention # and cross-attention metadata - assert all([ - not ((seq_group_meta.encoder_seq_data is None) or - (seq_group_meta.cross_block_table is None)) - for seq_group_meta in seq_group_meta_list - ]) + assert all( + [ + not ( + (seq_group_meta.encoder_seq_data is None) + or (seq_group_meta.cross_block_table is None) + ) + for seq_group_meta in seq_group_meta_list + ] + ) # - Validate sequence-group status assert set(get_sequence_groups(out)) == set(running) # - Validate there is one batched token per seq group assert out.num_batched_tokens == num_seq_group # - Validate there are no remaining blocks to swap - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) + assert ( + not out.blocks_to_copy + and not out.blocks_to_swap_in + and not out.blocks_to_swap_out + ) # - Validate that all seq groups were scheduled assert len(seq_group_meta_list) == num_seq_group append_new_token(out, 1) diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index 8281298d6634..75e4a37dda69 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -15,22 +15,20 @@ def test_msgspec_serialization(): execute_model_req = ExecuteModelRequest( seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=num_lookahead_slots, - running_queue_size=4) + running_queue_size=4, + ) encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) - decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, - dec_hook=decode_hook) + decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=decode_hook) req = decoder.decode(encoder.encode(execute_model_req)) expected = execute_model_req.seq_group_metadata_list actual = 
req.seq_group_metadata_list - assert (len(expected) == len(actual)) + assert len(expected) == len(actual) expected = expected[0] actual = actual[0] assert expected.block_tables == actual.block_tables assert expected.is_prompt == actual.is_prompt assert expected.request_id == actual.request_id - assert (expected.seq_data[0].prompt_token_ids == - actual.seq_data[0].prompt_token_ids) - assert (expected.seq_data[0].output_token_ids == - actual.seq_data[0].output_token_ids) + assert expected.seq_data[0].prompt_token_ids == actual.seq_data[0].prompt_token_ids + assert expected.seq_data[0].output_token_ids == actual.seq_data[0].output_token_ids diff --git a/tests/core/utils.py b/tests/core/utils.py index b746c1786464..124c080b017e 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -12,8 +12,7 @@ from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs from vllm.lora.request import LoRARequest -from vllm.sequence import (Logprob, Sequence, SequenceGroup, - SequenceGroupMetadata) +from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceGroupMetadata def create_dummy_prompt( @@ -35,10 +34,11 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) - inputs = token_inputs( - prompt_token_ids=prompt_tokens, - prompt=prompt_str) if prompt_embeds is None else embeds_inputs( - prompt_embeds=prompt_embeds) + inputs = ( + token_inputs(prompt_token_ids=prompt_tokens, prompt=prompt_str) + if prompt_embeds is None + else embeds_inputs(prompt_embeds=prompt_embeds) + ) prompt = Sequence( int(request_id), inputs=inputs, @@ -48,26 +48,29 @@ def create_dummy_prompt( request_id=request_id, seqs=[prompt], arrival_time=time.time(), - sampling_params=SamplingParams(max_tokens=max_tokens, - min_tokens=min_tokens), + sampling_params=SamplingParams(max_tokens=max_tokens, min_tokens=min_tokens), lora_request=lora_request, ) return prompt, seq_group -def create_dummy_lora_sequence(request_id: int, token_ids: list[int], - block_size: int, lora_int_id: int) -> Sequence: - return Sequence(seq_id=request_id, - inputs=token_inputs(token_ids), - block_size=block_size, - lora_request=LoRARequest(lora_name="dummy", - lora_path="/dummy", - lora_int_id=lora_int_id)) +def create_dummy_lora_sequence( + request_id: int, token_ids: list[int], block_size: int, lora_int_id: int +) -> Sequence: + return Sequence( + seq_id=request_id, + inputs=token_inputs(token_ids), + block_size=block_size, + lora_request=LoRARequest( + lora_name="dummy", lora_path="/dummy", lora_int_id=lora_int_id + ), + ) -def create_dummy_sequence(request_id: int, token_ids: list[int], - block_size: int) -> Sequence: +def create_dummy_sequence( + request_id: int, token_ids: list[int], block_size: int +) -> Sequence: return Sequence( seq_id=request_id, inputs=token_inputs(token_ids), @@ -94,36 +97,36 @@ def create_dummy_prompt_encoder_decoder( encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) inputs: EncoderDecoderInputs = { - "decoder": token_inputs(decoder_prompt_tokens, - prompt=decoder_prompt_str), - "encoder": token_inputs(encoder_prompt_tokens, - prompt=encoder_prompt_str), + "decoder": token_inputs(decoder_prompt_tokens, prompt=decoder_prompt_str), + "encoder": token_inputs(encoder_prompt_tokens, prompt=encoder_prompt_str), } - decoder_prompt = Sequence(int(request_id), - inputs=inputs["decoder"], - block_size=block_size) + decoder_prompt = Sequence( + int(request_id), 
inputs=inputs["decoder"], block_size=block_size + ) - encoder_prompt = Sequence(int(request_id), - inputs=inputs["encoder"], - block_size=block_size) + encoder_prompt = Sequence( + int(request_id), inputs=inputs["encoder"], block_size=block_size + ) - seq_group = SequenceGroup(request_id=request_id, - seqs=[decoder_prompt], - arrival_time=time.time(), - lora_request=lora_request, - encoder_seq=encoder_prompt) + seq_group = SequenceGroup( + request_id=request_id, + seqs=[decoder_prompt], + arrival_time=time.time(), + lora_request=lora_request, + encoder_seq=encoder_prompt, + ) return decoder_prompt, encoder_prompt, seq_group def create_seq_group( - seq_prompt_len: int = 1024, - seq_output_lens: GenericSequence[int] = (128, ), - request_id: str = '0', - seq_id_start: int = 0, - sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: - + seq_prompt_len: int = 1024, + seq_output_lens: GenericSequence[int] = (128,), + request_id: str = "0", + seq_id_start: int = 0, + sampling_params: Optional[SamplingParams] = None, +) -> SequenceGroup: assert len(seq_output_lens) > 0 if sampling_params is None: @@ -157,12 +160,12 @@ def create_seq_group( def create_seq_group_encoder_decoder( - seq_prompt_len: int = 1024, - seq_output_lens: GenericSequence[int] = (128, ), - request_id: str = '0', - seq_id_start: int = 0, - sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: - + seq_prompt_len: int = 1024, + seq_output_lens: GenericSequence[int] = (128,), + request_id: str = "0", + seq_id_start: int = 0, + sampling_params: Optional[SamplingParams] = None, +) -> SequenceGroup: assert len(seq_output_lens) > 0 if sampling_params is None: @@ -198,11 +201,13 @@ def create_seq_group_encoder_decoder( block_size=16, ) - return SequenceGroup(request_id=request_id, - seqs=seqs, - sampling_params=sampling_params, - arrival_time=time.time(), - encoder_seq=encoder_seq) + return SequenceGroup( + request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + encoder_seq=encoder_seq, + ) def round_up_to_next_block(seq_len: int, block_size: int) -> int: @@ -250,7 +255,6 @@ def __init__(self, scheduler: Scheduler): self.call_history: dict[str, list[Any]] = defaultdict(list) def __getattr__(self, name: str) -> Any: - def wrapper(*args, **kwargs): result = getattr(self.scheduler_, name)(*args, **kwargs) self.call_history[name].append((args, kwargs, result)) @@ -259,6 +263,7 @@ def wrapper(*args, **kwargs): return wrapper def last_schedule_ret( - self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]: + self, + ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]: _, _, ret = self.call_history["schedule"][-1] return ret diff --git a/tests/cuda/test_cuda_context.py b/tests/cuda/test_cuda_context.py index f973b284b87e..6336f2112c66 100644 --- a/tests/cuda/test_cuda_context.py +++ b/tests/cuda/test_cuda_context.py @@ -13,7 +13,7 @@ def check_cuda_context(): """Check CUDA driver context status""" try: - cuda = ctypes.CDLL('libcuda.so') + cuda = ctypes.CDLL("libcuda.so") device = ctypes.c_int() result = cuda.cuCtxGetDevice(ctypes.byref(device)) return (True, device.value) if result == 0 else (False, None) @@ -27,9 +27,11 @@ def run_cuda_test_in_thread(device_input, expected_device_id): # New thread should have no CUDA context initially valid_before, device_before = check_cuda_context() if valid_before: - return False, \ - "CUDA context should not exist in new thread, " \ - f"got device {device_before}" + return ( + False, + "CUDA context should not 
exist in new thread, " + f"got device {device_before}", + ) # Test setting CUDA context current_platform.set_device(device_input) @@ -39,8 +41,7 @@ def run_cuda_test_in_thread(device_input, expected_device_id): if not valid_after: return False, "CUDA context should be valid after set_cuda_context" if device_id != expected_device_id: - return False, \ - f"Expected device {expected_device_id}, got {device_id}" + return False, f"Expected device {expected_device_id}, got {device_id}" return True, "Success" except Exception as e: @@ -50,30 +51,30 @@ def run_cuda_test_in_thread(device_input, expected_device_id): class TestSetCudaContext: """Test suite for the set_cuda_context function.""" - @pytest.mark.skipif(not current_platform.is_cuda(), - reason="CUDA not available") - @pytest.mark.parametrize(argnames="device_input,expected_device_id", - argvalues=[ - (0, 0), - (torch.device('cuda:0'), 0), - ('cuda:0', 0), - ], - ids=["int", "torch_device", "string"]) - def test_set_cuda_context_parametrized(self, device_input, - expected_device_id): + @pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available") + @pytest.mark.parametrize( + argnames="device_input,expected_device_id", + argvalues=[ + (0, 0), + (torch.device("cuda:0"), 0), + ("cuda:0", 0), + ], + ids=["int", "torch_device", "string"], + ) + def test_set_cuda_context_parametrized(self, device_input, expected_device_id): """Test setting CUDA context in isolated threads.""" with ThreadPoolExecutor(max_workers=1) as executor: - future = executor.submit(run_cuda_test_in_thread, device_input, - expected_device_id) + future = executor.submit( + run_cuda_test_in_thread, device_input, expected_device_id + ) success, message = future.result(timeout=30) assert success, message - @pytest.mark.skipif(not current_platform.is_cuda(), - reason="CUDA not available") + @pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available") def test_set_cuda_context_invalid_device_type(self): """Test error handling for invalid device type.""" with pytest.raises(ValueError, match="Expected a cuda device"): - current_platform.set_device(torch.device('cpu')) + current_platform.set_device(torch.device("cpu")) if __name__ == "__main__": diff --git a/tests/detokenizer/test_disable_detokenization.py b/tests/detokenizer/test_disable_detokenization.py index ae06a985c7ec..a77626df5dc7 100644 --- a/tests/detokenizer/test_disable_detokenization.py +++ b/tests/detokenizer/test_disable_detokenization.py @@ -17,20 +17,16 @@ def test_computed_prefix_blocks(model: str): prompt = ( "You are a helpful assistant. How do I build a car from cardboard and " "paper clips? Is there an easy to follow video tutorial available " - "online for free?") + "online for free?" 
+ ) llm = LLM(model=model) - sampling_params = SamplingParams(max_tokens=10, - temperature=0.0, - detokenize=False) + sampling_params = SamplingParams(max_tokens=10, temperature=0.0, detokenize=False) - outputs_no_detokenization = llm.generate(prompt, - sampling_params)[0].outputs[0] + outputs_no_detokenization = llm.generate(prompt, sampling_params)[0].outputs[0] sampling_params.detokenize = True - outputs_with_detokenization = llm.generate(prompt, - sampling_params)[0].outputs[0] + outputs_with_detokenization = llm.generate(prompt, sampling_params)[0].outputs[0] - assert outputs_no_detokenization.text == '' - assert outputs_with_detokenization.text != '' - assert outputs_no_detokenization.token_ids == \ - outputs_with_detokenization.token_ids + assert outputs_no_detokenization.text == "" + assert outputs_with_detokenization.text != "" + assert outputs_no_detokenization.token_ids == outputs_with_detokenization.token_ids diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py index bd221977224f..dbb62b305e3c 100644 --- a/tests/detokenizer/test_stop_checker.py +++ b/tests/detokenizer/test_stop_checker.py @@ -12,8 +12,7 @@ from vllm.sequence import Logprob, Sequence, SequenceStatus -def sequence_with_eos(text: str, eos_token: str, - eos_token_id: int) -> Sequence: +def sequence_with_eos(text: str, eos_token: str, eos_token_id: int) -> Sequence: """ Create a Sequence that ends with an EOS token. """ @@ -28,22 +27,29 @@ def sequence_with_eos(text: str, eos_token: str, offset = eos_token_id + 1 for i in range(offset, len(text) + offset): seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)}) - seq.append_token_id(token_id=eos_token_id, - logprobs={eos_token_id: Logprob(0.0)}) + seq.append_token_id(token_id=eos_token_id, logprobs={eos_token_id: Logprob(0.0)}) seq.status = SequenceStatus.RUNNING return seq -@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [ - ("This text ends with EOS token", "", 2), -]) +@pytest.mark.parametrize( + ["text_wo_eos", "eos_token", "eos_token_id"], + [ + ("This text ends with EOS token", "", 2), + ], +) @pytest.mark.parametrize("ignore_eos", [True, False]) @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) @pytest.mark.skip_global_cleanup -def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, - ignore_eos: bool, include_stop_str_in_output: bool): +def test_stop_on_eos_token( + text_wo_eos: str, + eos_token: str, + eos_token_id: int, + ignore_eos: bool, + include_stop_str_in_output: bool, +): """ Test the behavior of the StopChecker's maybe_stop_sequence method when an EOS token is encountered. 
@@ -56,8 +62,9 @@ def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, tokenizer = MagicMock(spec=PreTrainedTokenizer) get_tokenizer_for_seq = MagicMock(return_value=tokenizer) - stop_checker = StopChecker(max_model_len=1024, - get_tokenizer_for_seq=get_tokenizer_for_seq) + stop_checker = StopChecker( + max_model_len=1024, get_tokenizer_for_seq=get_tokenizer_for_seq + ) seq = sequence_with_eos( text=text_wo_eos, @@ -70,7 +77,8 @@ def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, sampling_params = SamplingParams( min_tokens=1, ignore_eos=ignore_eos, - include_stop_str_in_output=include_stop_str_in_output) + include_stop_str_in_output=include_stop_str_in_output, + ) stop_checker.maybe_stop_sequence( seq=seq, diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py index 9716f7d72a58..7415f96c2665 100644 --- a/tests/detokenizer/test_stop_reason.py +++ b/tests/detokenizer/test_stop_reason.py @@ -31,34 +31,39 @@ def test_stop_reason(vllm_model, example_prompts): llm = vllm_model.model # test stop token - outputs = llm.generate(example_prompts, - sampling_params=SamplingParams( - ignore_eos=True, - seed=SEED, - max_tokens=MAX_TOKENS, - stop_token_ids=[stop_token_id])) + outputs = llm.generate( + example_prompts, + sampling_params=SamplingParams( + ignore_eos=True, + seed=SEED, + max_tokens=MAX_TOKENS, + stop_token_ids=[stop_token_id], + ), + ) for output in outputs: output = output.outputs[0] assert output.finish_reason == "stop" assert output.stop_reason == stop_token_id # test stop string - outputs = llm.generate(example_prompts, - sampling_params=SamplingParams( - ignore_eos=True, - seed=SEED, - max_tokens=MAX_TOKENS, - stop=".")) + outputs = llm.generate( + example_prompts, + sampling_params=SamplingParams( + ignore_eos=True, seed=SEED, max_tokens=MAX_TOKENS, stop="." 
+ ), + ) for output in outputs: output = output.outputs[0] assert output.finish_reason == "stop" assert output.stop_reason == STOP_STR # test EOS token - outputs = llm.generate(example_prompts, - sampling_params=SamplingParams( - seed=SEED, max_tokens=MAX_TOKENS)) + outputs = llm.generate( + example_prompts, + sampling_params=SamplingParams(seed=SEED, max_tokens=MAX_TOKENS), + ) for output in outputs: output = output.outputs[0] assert output.finish_reason == "length" or ( - output.finish_reason == "stop" and output.stop_reason is None) + output.finish_reason == "stop" and output.stop_reason is None + ) diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index efe938a20c4f..d544bbbbc5db 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -11,12 +11,14 @@ MAX_TOKENS = 200 -def _test_stopping(llm: LLM, - expected_output: str, - expected_reason: Any, - stop: Optional[list[str]] = None, - stop_token_ids: Optional[list[int]] = None, - include_in_output: bool = False) -> None: +def _test_stopping( + llm: LLM, + expected_output: str, + expected_reason: Any, + stop: Optional[list[str]] = None, + stop_token_ids: Optional[list[int]] = None, + include_in_output: bool = False, +) -> None: output = llm.generate( "A story about vLLM:\n", SamplingParams( @@ -25,7 +27,8 @@ def _test_stopping(llm: LLM, stop=stop, stop_token_ids=stop_token_ids, include_stop_str_in_output=include_in_output, - ))[0].outputs[0] + ), + )[0].outputs[0] assert output is not None assert output.text == expected_output @@ -37,17 +40,21 @@ def _set_async_mode(llm, is_async): def _stop_basic(llm): - _test_stopping(llm, - stop=["."], - include_in_output=False, - expected_output="VLLM is a 100% volunteer organization", - expected_reason=".") + _test_stopping( + llm, + stop=["."], + include_in_output=False, + expected_output="VLLM is a 100% volunteer organization", + expected_reason=".", + ) - _test_stopping(llm, - stop=["."], - include_in_output=True, - expected_output="VLLM is a 100% volunteer organization.", - expected_reason=".") + _test_stopping( + llm, + stop=["."], + include_in_output=True, + expected_output="VLLM is a 100% volunteer organization.", + expected_reason=".", + ) def _stop_multi_tokens(llm): @@ -56,45 +63,54 @@ def _stop_multi_tokens(llm): stop=["group of peo", "short"], include_in_output=False, expected_output="VLLM is a 100% volunteer organization. We are a ", - expected_reason="group of peo") + expected_reason="group of peo", + ) _test_stopping( llm, stop=["group of peo", "short"], include_in_output=True, - expected_output= - "VLLM is a 100% volunteer organization. We are a group of peo", - expected_reason="group of peo") + expected_output="VLLM is a 100% volunteer organization. 
We are a group of peo", + expected_reason="group of peo", + ) def _stop_partial_token(llm): - _test_stopping(llm, - stop=["gani"], - include_in_output=False, - expected_output="VLLM is a 100% volunteer or", - expected_reason="gani") + _test_stopping( + llm, + stop=["gani"], + include_in_output=False, + expected_output="VLLM is a 100% volunteer or", + expected_reason="gani", + ) - _test_stopping(llm, - stop=["gani"], - include_in_output=True, - expected_output="VLLM is a 100% volunteer organi", - expected_reason="gani") + _test_stopping( + llm, + stop=["gani"], + include_in_output=True, + expected_output="VLLM is a 100% volunteer organi", + expected_reason="gani", + ) def _stop_token_id(llm): # token id 13013 => " organization" - _test_stopping(llm, - stop_token_ids=[13013], - include_in_output=False, - expected_output="VLLM is a 100% volunteer", - expected_reason=13013) - - _test_stopping(llm, - stop_token_ids=[13013], - include_in_output=True, - expected_output="VLLM is a 100% volunteer organization", - expected_reason=13013) + _test_stopping( + llm, + stop_token_ids=[13013], + include_in_output=False, + expected_output="VLLM is a 100% volunteer", + expected_reason=13013, + ) + + _test_stopping( + llm, + stop_token_ids=[13013], + include_in_output=True, + expected_output="VLLM is a 100% volunteer organization", + expected_reason=13013, + ) @pytest.mark.skip_global_cleanup diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py index 666a715cc0da..efe6fee58f31 100644 --- a/tests/distributed/conftest.py +++ b/tests/distributed/conftest.py @@ -111,8 +111,7 @@ def __init__( self.last_seq = -1 self.decoder = msgspec.msgpack.Decoder(type=decode_type) - def receive_one(self, - timeout=1000) -> Union[tuple[int, SampleBatch], None]: + def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]: """Receive a single message with timeout""" if not self.sub.poll(timeout): return None @@ -135,8 +134,7 @@ def request_replay(self, start_seq: int, socket_idx: int = 0) -> None: self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big")) - def receive_replay(self, - socket_idx: int = 0) -> list[tuple[int, SampleBatch]]: + def receive_replay(self, socket_idx: int = 0) -> list[tuple[int, SampleBatch]]: """Receive replayed messages from a specific replay socket""" if not self.replay_sockets: raise ValueError("Replay sockets not initialized") diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py index e2de462612b4..1ddce64f8e61 100644 --- a/tests/distributed/test_ca_buffer_sharing.py +++ b/tests/distributed/test_ca_buffer_sharing.py @@ -12,7 +12,8 @@ from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa - CustomAllreduce) + CustomAllreduce, +) # create a cpu process group for communicating metadata (ipc handle) dist.init_process_group(backend="gloo") @@ -52,7 +53,8 @@ assert ord(host_data[i]) == byte_value, ( f"Rank {rank} failed" f" to verify buffer {p}. 
Expected {byte_value}, " - f"got {ord(host_data[i])}") + f"got {ord(host_data[i])}" + ) print(f"Rank {rank} verified all buffers") diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index e2cb579e22dc..5def4f9c1316 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -13,10 +13,13 @@ import ray import torch -from vllm.distributed import (broadcast_tensor_dict, get_pp_group, - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce, - tensor_model_parallel_reduce_scatter) +from vllm.distributed import ( + broadcast_tensor_dict, + get_pp_group, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, + tensor_model_parallel_reduce_scatter, +) from ..utils import init_test_distributed_environment, multi_process_parallel @@ -36,12 +39,11 @@ def all_reduce_test_worker( device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) num_elements = 8 all_tensors = [ - torch.arange(num_elements, dtype=torch.float32, device="cuda") * - (r + 1) for r in range(tp_size) + torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1) + for r in range(tp_size) ] expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) t = all_tensors[rank % tp_size] @@ -50,28 +52,31 @@ def all_reduce_test_worker( @ray.remote(num_gpus=1, max_calls=1) -def reduce_scatter_test_worker(monkeypatch: pytest.MonkeyPatch, tp_size: int, - pp_size: int, rank: int, - distributed_init_port: str): +def reduce_scatter_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) num_elements = 8 all_tensors = [ - torch.arange(num_elements, dtype=torch.float32, device="cuda") * - (r + 1) for r in range(tp_size) + torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1) + for r in range(tp_size) ] index = rank % tp_size partition_size = num_elements // tp_size all_reduce = torch.sum(torch.stack(all_tensors, dim=0), dim=0) - expected = all_reduce[index * partition_size:(index + 1) * partition_size] + expected = all_reduce[index * partition_size : (index + 1) * partition_size] t = all_tensors[index] t = tensor_model_parallel_reduce_scatter(t, 0) torch.testing.assert_close(t, expected) @@ -91,8 +96,7 @@ def all_gather_test_worker( monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) num_dimensions = 3 tensor_size = list(range(2, num_dimensions + 2)) total_size = 1 @@ -100,8 +104,10 @@ def all_gather_test_worker( total_size *= s for all_gather_dimension in range(num_dimensions): all_tensors = [ - torch.arange(total_size, dtype=torch.float32, - device="cuda").reshape(tensor_size) * (r 
+ 1) + torch.arange(total_size, dtype=torch.float32, device="cuda").reshape( + tensor_size + ) + * (r + 1) for r in range(tp_size) ] expected = torch.cat(all_tensors, dim=all_gather_dimension) @@ -124,8 +130,7 @@ def broadcast_tensor_dict_test_worker( monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) test_dict = { # device tensor "a": torch.arange(8, dtype=torch.float32, device="cuda"), @@ -133,10 +138,7 @@ def broadcast_tensor_dict_test_worker( "b": torch.arange(16, dtype=torch.int8, device="cpu"), "c": "test", "d": [1, 2, 3], - "e": { - "a": 1, - "b": 2 - }, + "e": {"a": 1, "b": 2}, # empty tensor "f": torch.tensor([], dtype=torch.float32, device="cuda"), } @@ -165,8 +167,7 @@ def send_recv_tensor_dict_test_worker( monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) test_dict = { # device tensor @@ -175,10 +176,7 @@ def send_recv_tensor_dict_test_worker( "b": torch.arange(16, dtype=torch.int8, device="cpu"), "c": "test", "d": [1, 2, 3], - "e": { - "a": 1, - "b": 2 - }, + "e": {"a": 1, "b": 2}, # empty tensor "f": torch.tensor([], dtype=torch.float32, device="cuda"), } @@ -210,8 +208,7 @@ def send_recv_test_worker( monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) size = 64 test_tensor = torch.arange(64, dtype=torch.float32, device="cuda") @@ -226,13 +223,14 @@ def send_recv_test_worker( torch.testing.assert_close(test_tensor, recv_tensor) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." +) @pytest.mark.parametrize("tp_size", [2]) -@pytest.mark.parametrize("test_target", [ - all_reduce_test_worker, all_gather_test_worker, - broadcast_tensor_dict_test_worker -]) +@pytest.mark.parametrize( + "test_target", + [all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker], +) def test_multi_process_tensor_parallel( monkeypatch: pytest.MonkeyPatch, tp_size: int, @@ -241,11 +239,13 @@ def test_multi_process_tensor_parallel( multi_process_parallel(monkeypatch, tp_size, 1, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." 
+) @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize( - "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) + "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker] +) def test_multi_process_pipeline_parallel( monkeypatch: pytest.MonkeyPatch, pp_size: int, @@ -254,15 +254,21 @@ def test_multi_process_pipeline_parallel( multi_process_parallel(monkeypatch, 1, pp_size, test_target) -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." +) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pp_size", [2]) -@pytest.mark.parametrize("test_target", [ - send_recv_test_worker, send_recv_tensor_dict_test_worker, - all_reduce_test_worker, all_gather_test_worker, - broadcast_tensor_dict_test_worker -]) +@pytest.mark.parametrize( + "test_target", + [ + send_recv_test_worker, + send_recv_tensor_dict_test_worker, + all_reduce_test_worker, + all_gather_test_worker, + broadcast_tensor_dict_test_worker, + ], +) def test_multi_process_tensor_parallel_pipeline_parallel( tp_size: int, pp_size: int, diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index fae49c41d5f8..68d37709c4af 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -8,13 +8,18 @@ import torch import torch.distributed as dist -from vllm.distributed.communication_op import ( # noqa - tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, - get_tp_group, graph_capture) - -from ..utils import (ensure_model_parallel_initialized, - init_test_distributed_environment, multi_process_parallel) +from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_group, + get_tp_group, + graph_capture, +) + +from ..utils import ( + ensure_model_parallel_initialized, + init_test_distributed_environment, + multi_process_parallel, +) random.seed(42) test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] @@ -34,8 +39,7 @@ def graph_allreduce( m.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) group = get_tensor_model_parallel_group().device_group @@ -61,18 +65,15 @@ def graph_allreduce( for dtype in [torch.float32, torch.float16, torch.bfloat16]: with graph_capture(device=device) as graph_capture_context: # use integers so result matches NCCL exactly - inp1 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - inp2 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) + inp1 = torch.randint( + 1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device() + ) + inp2 = torch.randint( + 1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device() + ) torch.cuda.synchronize() graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph, - stream=graph_capture_context.stream): + with torch.cuda.graph(graph, stream=graph_capture_context.stream): for i in range(num_communication): out1 = tensor_model_parallel_all_reduce(inp1) # the 
input buffer is immediately modified to test @@ -97,8 +98,7 @@ def eager_allreduce( m.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) # we use the first group to communicate once # and the second group to communicate twice @@ -133,5 +133,4 @@ def test_custom_allreduce( world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") - multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, - test_target) + multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target) diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py index b93696e4be0e..ea7a88abda24 100644 --- a/tests/distributed/test_distributed_oot.py +++ b/tests/distributed/test_distributed_oot.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from ..entrypoints.openai.test_oot_registration import ( - run_and_test_dummy_opt_api_server) +from ..entrypoints.openai.test_oot_registration import run_and_test_dummy_opt_api_server def test_distributed_oot(dummy_opt_path: str): diff --git a/tests/distributed/test_eplb_algo.py b/tests/distributed/test_eplb_algo.py index e47ccba99c81..79805a7cce53 100644 --- a/tests/distributed/test_eplb_algo.py +++ b/tests/distributed/test_eplb_algo.py @@ -10,10 +10,12 @@ def test_basic_rebalance(): """Test basic rebalancing functionality""" # Example from https://github.com/deepseek-ai/eplb - weight = torch.tensor([ - [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86], - [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27], - ]) + weight = torch.tensor( + [ + [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86], + [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27], + ] + ) num_layers = weight.shape[0] num_replicas = 16 @@ -21,45 +23,49 @@ def test_basic_rebalance(): num_nodes = 2 num_gpus = 8 - phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, - num_groups, num_nodes, - num_gpus) + phy2log, log2phy, logcnt = rebalance_experts( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) # Verify output shapes assert phy2log.shape == ( 2, 16, ), f"Expected `phy2log` shape (2, 16), got {phy2log.shape}" - assert (log2phy.shape[0] == 2 - ), f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}" - assert ( - log2phy.shape[1] == 12 - ), f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}" + assert log2phy.shape[0] == 2, ( + f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}" + ) + assert log2phy.shape[1] == 12, ( + f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}" + ) assert logcnt.shape == ( 2, 12, ), f"Expected `logcnt` shape (2, 12), got {logcnt.shape}" # Verify physical to logical expert mapping range is correct - assert torch.all(phy2log >= 0) and torch.all( - phy2log < 12), "Physical to logical mapping should be in range [0, 12)" + assert torch.all(phy2log >= 0) and torch.all(phy2log < 12), ( + "Physical to logical mapping should be in range [0, 12)" + ) # Verify expert count reasonableness - assert torch.all( - logcnt >= 1), "Each logical expert should have at least 1 replica" - assert ( - torch.sum(logcnt, dim=1).sum() == num_replicas * - num_layers), f"Total replicas should be 
{num_replicas * num_layers}" + assert torch.all(logcnt >= 1), "Each logical expert should have at least 1 replica" + assert torch.sum(logcnt, dim=1).sum() == num_replicas * num_layers, ( + f"Total replicas should be {num_replicas * num_layers}" + ) # Verify expected output - expected_phy2log = torch.tensor([ - [5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1], - [7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1], - ]) + expected_phy2log = torch.tensor( + [ + [5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1], + [7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1], + ] + ) assert torch.all(phy2log == expected_phy2log) - expected_logcnt = torch.tensor([[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1], - [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]]) + expected_logcnt = torch.tensor( + [[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1], [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]] + ) assert torch.all(logcnt == expected_logcnt) @@ -71,9 +77,9 @@ def test_single_gpu_case(): num_nodes = 1 num_gpus = 1 - phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, - num_groups, num_nodes, - num_gpus) + phy2log, log2phy, logcnt = rebalance_experts( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) # Verify shapes assert phy2log.shape == (1, 4) @@ -93,19 +99,19 @@ def test_equal_weights(): num_nodes = 2 num_gpus = 4 - phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, - num_groups, num_nodes, - num_gpus) + phy2log, log2phy, logcnt = rebalance_experts( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) # Verify shapes assert phy2log.shape == (1, 8) assert logcnt.shape == (1, 8) # With equal weights, each expert should have exactly one replica - assert torch.all( - logcnt == 1 - ), "With equal weights and no replication, " \ - "each expert should have exactly 1 replica" + assert torch.all(logcnt == 1), ( + "With equal weights and no replication, " + "each expert should have exactly 1 replica" + ) def test_extreme_weight_imbalance(): @@ -116,35 +122,37 @@ def test_extreme_weight_imbalance(): num_nodes = 2 num_gpus = 4 - phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, - num_groups, num_nodes, - num_gpus) + phy2log, log2phy, logcnt = rebalance_experts( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) # Verify shapes assert phy2log.shape == (1, 12) assert logcnt.shape == (1, 8) # Expert with highest weight (index 0) should have more replicas - assert ( - logcnt[0, 0] - > logcnt[0, 1]), "Expert with highest weight should have more replicas" + assert logcnt[0, 0] > logcnt[0, 1], ( + "Expert with highest weight should have more replicas" + ) def test_multiple_layers(): """Test multiple layers case""" - weight = torch.tensor([ - [10, 20, 30, 40, 50, 60], # First layer - [60, 50, 40, 30, 20, 10], # Second layer (opposite weight pattern) - [25, 25, 25, 25, 25, 25], # Third layer (equal weights) - ]) + weight = torch.tensor( + [ + [10, 20, 30, 40, 50, 60], # First layer + [60, 50, 40, 30, 20, 10], # Second layer (opposite weight pattern) + [25, 25, 25, 25, 25, 25], # Third layer (equal weights) + ] + ) num_replicas = 8 num_groups = 2 num_nodes = 2 num_gpus = 4 - phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, - num_groups, num_nodes, - num_gpus) + phy2log, log2phy, logcnt = rebalance_experts( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) # Verify shapes assert phy2log.shape == (3, 8) @@ -152,12 +160,12 @@ def test_multiple_layers(): # Verify expert allocation is reasonable for each layer for layer in range(3): - assert torch.all(phy2log[layer] >= 
0) and torch.all( - phy2log[layer] < 6 - ), f"Layer {layer} physical to logical mapping" \ - "should be in range [0, 6)" - assert (torch.sum(logcnt[layer]) == num_replicas - ), f"Layer {layer} total replicas should be {num_replicas}" + assert torch.all(phy2log[layer] >= 0) and torch.all(phy2log[layer] < 6), ( + f"Layer {layer} physical to logical mappingshould be in range [0, 6)" + ) + assert torch.sum(logcnt[layer]) == num_replicas, ( + f"Layer {layer} total replicas should be {num_replicas}" + ) def test_parameter_validation(): @@ -179,17 +187,19 @@ def test_parameter_validation(): def test_small_scale_hierarchical(): """Test small-scale hierarchical load balancing""" - weight = torch.tensor([ - [100, 50, 200, 75, 150, 25, 300, 80], # 8 experts - ]) + weight = torch.tensor( + [ + [100, 50, 200, 75, 150, 25, 300, 80], # 8 experts + ] + ) num_replicas = 12 num_groups = 4 # 4 groups, 2 experts each num_nodes = 2 # 2 nodes num_gpus = 4 # 4 GPUs - phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, - num_groups, num_nodes, - num_gpus) + phy2log, log2phy, logcnt = rebalance_experts( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) # Verify basic constraints assert phy2log.shape == (1, 12) @@ -199,8 +209,9 @@ def test_small_scale_hierarchical(): # Expert with highest weight should have more replicas max_weight_expert = torch.argmax(weight[0]) - assert (logcnt[0, max_weight_expert] - >= 2), "Highest weight expert should have multiple replicas" + assert logcnt[0, max_weight_expert] >= 2, ( + "Highest weight expert should have multiple replicas" + ) def test_global_load_balance_fallback(): @@ -213,9 +224,9 @@ def test_global_load_balance_fallback(): num_nodes = 2 num_gpus = 4 - phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, - num_groups, num_nodes, - num_gpus) + phy2log, log2phy, logcnt = rebalance_experts( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) # Should work normally, just using global load balancing strategy assert phy2log.shape == (1, 8) @@ -235,9 +246,9 @@ def test_device_compatibility(device): num_nodes = 1 num_gpus = 2 - phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, - num_groups, num_nodes, - num_gpus) + phy2log, log2phy, logcnt = rebalance_experts( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) # Function will convert to CPU internally, but should handle different # device inputs normally @@ -250,7 +261,8 @@ def test_additional_cases(): # Test case 1: Large-scale distributed setup weight1 = torch.tensor( - [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]) + [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]] + ) phy2log1, log2phy1, logcnt1 = rebalance_experts(weight1, 24, 8, 4, 8) assert phy2log1.shape == (1, 24) @@ -258,10 +270,12 @@ def test_additional_cases(): assert torch.sum(logcnt1) == 24 # Test case 2: Different weight distributions - weight2 = torch.tensor([ - [200, 150, 100, 50, 25, 12], # Decreasing weights - [12, 25, 50, 100, 150, 200], # Increasing weights - ]) + weight2 = torch.tensor( + [ + [200, 150, 100, 50, 25, 12], # Decreasing weights + [12, 25, 50, 100, 150, 200], # Increasing weights + ] + ) phy2log2, log2phy2, logcnt2 = rebalance_experts(weight2, 10, 3, 1, 2) assert phy2log2.shape == (2, 10) @@ -274,19 +288,21 @@ def test_additional_cases(): if __name__ == "__main__": - weight = torch.tensor([ - [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86], - [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27], - ]) + weight = 
torch.tensor( + [ + [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86], + [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27], + ] + ) num_replicas = 16 num_groups = 4 num_nodes = 2 num_gpus = 8 - phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, - num_groups, num_nodes, - num_gpus) + phy2log, log2phy, logcnt = rebalance_experts( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) print(phy2log) test_basic_rebalance() diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index de9ed1eabbac..7ca3d3d27b56 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -9,11 +9,12 @@ import torch import torch.distributed -from vllm.distributed.eplb.rebalance_execute import ( - rearrange_expert_weights_inplace) -from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - get_tp_group, - init_distributed_environment) +from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace +from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, + get_tp_group, + init_distributed_environment, +) from vllm.utils import update_environment_variables @@ -22,13 +23,13 @@ def distributed_run(fn, world_size): processes: list[multiprocessing.Process] = [] for i in range(number_of_processes): env: dict[str, str] = {} - env['RANK'] = str(i) - env['LOCAL_RANK'] = str(i) - env['WORLD_SIZE'] = str(number_of_processes) - env['LOCAL_WORLD_SIZE'] = str(number_of_processes) - env['MASTER_ADDR'] = 'localhost' - env['MASTER_PORT'] = '12345' - p = multiprocessing.Process(target=fn, args=(env, )) + env["RANK"] = str(i) + env["LOCAL_RANK"] = str(i) + env["WORLD_SIZE"] = str(number_of_processes) + env["LOCAL_WORLD_SIZE"] = str(number_of_processes) + env["MASTER_ADDR"] = "localhost" + env["MASTER_PORT"] = "12345" + p = multiprocessing.Process(target=fn, args=(env,)) processes.append(p) p.start() @@ -45,7 +46,7 @@ def worker_fn_wrapper(fn): # and update the environment variables in the function def wrapped_fn(env): update_environment_variables(env) - local_rank = os.environ['LOCAL_RANK'] + local_rank = os.environ["LOCAL_RANK"] device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) init_distributed_environment() @@ -60,20 +61,20 @@ def wrapped_fn(env): def create_expert_indices_with_redundancy( - num_layers: int, - num_logical_experts: int, - total_physical_experts: int, - redundancy_config: list[int], # redundancy for each logical expert + num_layers: int, + num_logical_experts: int, + total_physical_experts: int, + redundancy_config: list[int], # redundancy for each logical expert ) -> torch.Tensor: """ Create expert indices with redundancy. - + Args: num_layers: number of layers num_logical_experts: number of logical experts total_physical_experts: total number of physical experts redundancy_config: redundancy for each logical expert - + Returns: indices: Shape (num_layers, total_physical_experts) """ @@ -106,11 +107,11 @@ def create_expert_weights( ) -> list[list[torch.Tensor]]: """ Create fake expert weights tensor for testing. - + Use `arange` to generate predictable weights values, based on logical expert ID. All replicas of the same logical expert should have the same weights. 
- + Args: physical_to_logical_mapping: Shape (num_layers, num_local_experts) mapping[layer, physical_pos] = logical_expert_id @@ -120,27 +121,27 @@ def create_expert_weights( for layer in range(num_layers): layer_weights = [] for weight_idx, hidden_size in enumerate(hidden_sizes): - weight_tensor = torch.zeros(num_local_experts, - hidden_size, - device=device, - dtype=torch.float32) + weight_tensor = torch.zeros( + num_local_experts, hidden_size, device=device, dtype=torch.float32 + ) for local_expert in range(num_local_experts): # Get the logical expert ID for this physical expert global_pos = rank * num_local_experts + local_expert logical_expert_id = physical_to_logical_mapping[ - layer, global_pos].item() + layer, global_pos + ].item() # Generate weights based on logical expert ID # (so that all replicas of the same logical expert have the # same weights) - base_value = (logical_expert_id * 1000 + layer * 100 + - weight_idx * 10) - weight_tensor[local_expert] = torch.arange(base_value, - base_value + - hidden_size, - device=device, - dtype=torch.float32) + base_value = logical_expert_id * 1000 + layer * 100 + weight_idx * 10 + weight_tensor[local_expert] = torch.arange( + base_value, + base_value + hidden_size, + device=device, + dtype=torch.float32, + ) layer_weights.append(weight_tensor) expert_weights.append(layer_weights) @@ -182,12 +183,15 @@ def verify_expert_weights_after_shuffle( # Check if the weights are correct actual_weights = weight_tensor[local_expert] - expected_base = (expected_logical_expert * 1000 + layer * 100 + - weight_idx * 10) - expected_weights = torch.arange(expected_base, - expected_base + hidden_size, - device=actual_weights.device, - dtype=actual_weights.dtype) + expected_base = ( + expected_logical_expert * 1000 + layer * 100 + weight_idx * 10 + ) + expected_weights = torch.arange( + expected_base, + expected_base + hidden_size, + device=actual_weights.device, + dtype=actual_weights.dtype, + ) torch.testing.assert_close( actual_weights, @@ -195,7 +199,8 @@ def verify_expert_weights_after_shuffle( msg=f"Layer {layer}, weight {weight_idx}," f"local expert {local_expert}: " f"weights do not match. 
" - f"Expected logical expert {expected_logical_expert}") + f"Expected logical expert {expected_logical_expert}", + ) def verify_redundant_experts_have_same_weights( @@ -222,23 +227,23 @@ def verify_redundant_experts_have_same_weights( total_physical_experts, hidden_size, device=expert_weights[layer][weight_idx].device, - dtype=expert_weights[layer][weight_idx].dtype) + dtype=expert_weights[layer][weight_idx].dtype, + ) # Use all_gather to collect expert weights from current node # expert_weights[layer][weight_idx] shape: # [num_local_experts, hidden_size] local_weights = expert_weights[layer][ - weight_idx] # [num_local_experts, hidden_size] + weight_idx + ] # [num_local_experts, hidden_size] # Split tensor along dim 0 into a list for all_gather - gathered_weights_list = torch.chunk(gathered_weights, - world_size, - dim=0) + gathered_weights_list = torch.chunk(gathered_weights, world_size, dim=0) torch.distributed.all_gather( # Output list: each element corresponds to one rank's weights list(gathered_weights_list), - local_weights # Input: current rank's local weights + local_weights, # Input: current rank's local weights ) all_weights.append(gathered_weights) @@ -266,7 +271,8 @@ def verify_redundant_experts_have_same_weights( msg=f"Layer {layer}, weight {weight_idx}," f"logical expert {logical_expert_id}: " f"Physical expert {physical_pos} has different weights" - f"than expected") + f"than expected", + ) @pytest.mark.parametrize( @@ -290,10 +296,11 @@ def verify_redundant_experts_have_same_weights( # 4 GPU, 8 experts per GPU # 16 logical experts, 32 physical experts, 16 redundant experts (4, 8, 8, 16), - ]) -def test_rearrange_expert_weights_with_redundancy(world_size, num_layers, - num_local_experts, - num_logical_experts): + ], +) +def test_rearrange_expert_weights_with_redundancy( + world_size, num_layers, num_local_experts, num_logical_experts +): """Test the functionality of rearranging expert weights with redundancy.""" if torch.cuda.device_count() < world_size: @@ -304,8 +311,8 @@ def worker_fn(): # Initialize model parallel (using tensor parallel as an entrypoint # to expert parallel) ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, - pipeline_model_parallel_size=1) + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) ep_group = get_tp_group().cpu_group ep_rank = torch.distributed.get_rank() @@ -316,8 +323,9 @@ def worker_fn(): hidden_sizes = [32, 64] # Two different weight matrices # Create old expert indices (with redundancy) - redundancy_config = create_redundancy_config(num_logical_experts, - total_physical_experts) + redundancy_config = create_redundancy_config( + num_logical_experts, total_physical_experts + ) old_indices = create_expert_indices_with_redundancy( num_layers, @@ -328,7 +336,8 @@ def worker_fn(): # Create new expert indices (with redundancy) new_redundancy_config = create_redundancy_config( - num_logical_experts, total_physical_experts) + num_logical_experts, total_physical_experts + ) new_indices = create_expert_indices_with_redundancy( num_layers, num_logical_experts, @@ -337,9 +346,9 @@ def worker_fn(): ) # Create expert weights - expert_weights = create_expert_weights(num_layers, num_local_experts, - hidden_sizes, ep_rank, device, - old_indices) + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices + ) # Execute weight rearrangement rearrange_expert_weights_inplace( @@ -383,8 +392,8 @@ def test_rearrange_expert_weights_no_change(world_size): 
@worker_fn_wrapper def worker_fn(): ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, - pipeline_model_parallel_size=1) + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) ep_group = get_tp_group().cpu_group ep_rank = torch.distributed.get_rank() @@ -401,12 +410,12 @@ def worker_fn(): # Same indices - no change indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, - redundancy_config) + num_layers, num_logical_experts, total_physical_experts, redundancy_config + ) - expert_weights = create_expert_weights(num_layers, num_local_experts, - hidden_sizes, ep_rank, device, - indices) + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices + ) # Save original weights original_weights = [] @@ -422,7 +431,8 @@ def worker_fn(): indices, # Same indices expert_weights, ep_group, - is_profile=False) + is_profile=False, + ) # Verify that the weights have not changed for layer in range(num_layers): @@ -430,8 +440,8 @@ def worker_fn(): torch.testing.assert_close( expert_weights[layer][weight_idx], original_weights[layer][weight_idx], - msg=f"Layer {layer}, weight {weight_idx} should remain " - f"unchanged") + msg=f"Layer {layer}, weight {weight_idx} should remain unchanged", + ) distributed_run(worker_fn, world_size) @@ -446,8 +456,8 @@ def test_rearrange_expert_weights_profile_mode(world_size): @worker_fn_wrapper def worker_fn(): ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, - pipeline_model_parallel_size=1) + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) ep_group = get_tp_group().cpu_group ep_rank = torch.distributed.get_rank() @@ -460,21 +470,23 @@ def worker_fn(): hidden_sizes = [32] # Create different index distributions - old_redundancy = create_redundancy_config(num_logical_experts, - total_physical_experts) - new_redundancy = create_redundancy_config(num_logical_experts, - total_physical_experts) + old_redundancy = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + new_redundancy = create_redundancy_config( + num_logical_experts, total_physical_experts + ) old_indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, - old_redundancy) + num_layers, num_logical_experts, total_physical_experts, old_redundancy + ) new_indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, - new_redundancy) + num_layers, num_logical_experts, total_physical_experts, new_redundancy + ) - expert_weights = create_expert_weights(num_layers, num_local_experts, - hidden_sizes, ep_rank, device, - old_indices) + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices + ) # Save original weights original_weights = [] @@ -490,7 +502,7 @@ def worker_fn(): new_indices, expert_weights, ep_group, - is_profile=True # Profile mode + is_profile=True, # Profile mode ) # In profile mode, the weights should remain unchanged @@ -499,6 +511,7 @@ def worker_fn(): torch.testing.assert_close( expert_weights[layer][weight_idx], original_weights[layer][weight_idx], - msg="In profile mode, the weights should remain unchanged") + msg="In profile mode, the weights should remain unchanged", + ) distributed_run(worker_fn, world_size) diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py index 8be9ee0a1889..f06f6771a4a0 
100644 --- a/tests/distributed/test_events.py +++ b/tests/distributed/test_events.py @@ -6,24 +6,29 @@ import msgspec import pytest -from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory, - NullEventPublisher) +from vllm.distributed.kv_events import ( + EventBatch, + EventPublisherFactory, + NullEventPublisher, +) DP_RANK = 0 class EventSample( - msgspec.Struct, - tag=True, # type: ignore - array_like=True # type: ignore + msgspec.Struct, + tag=True, # type: ignore + array_like=True, # type: ignore ): """Test event for publisher testing""" + id: int value: str class SampleBatch(EventBatch): """Test event batch for publisher testing""" + events: list[EventSample] @@ -44,10 +49,8 @@ def test_basic_publishing(publisher, subscriber): seq, received = result assert seq == 0, "Sequence number mismatch" - assert received.ts == pytest.approx(test_batch.ts, - abs=0.1), ("Timestamp mismatch") - assert len(received.events) == len( - test_batch.events), ("Number of events mismatch") + assert received.ts == pytest.approx(test_batch.ts, abs=0.1), "Timestamp mismatch" + assert len(received.events) == len(test_batch.events), "Number of events mismatch" for i, event in enumerate(received.events): assert event.id == i, "Event id mismatch" @@ -88,9 +91,9 @@ def test_replay_mechanism(publisher, subscriber): assert len(replayed) > 0, "No replayed messages received" seqs = [seq for seq, _ in replayed] assert all(seq >= 10 for seq in seqs), "Replayed messages not in order" - assert seqs == list(range(min(seqs), - max(seqs) + - 1)), ("Replayed messages not consecutive") + assert seqs == list(range(min(seqs), max(seqs) + 1)), ( + "Replayed messages not consecutive" + ) def test_buffer_limit(publisher, subscriber, publisher_config): @@ -126,6 +129,7 @@ def test_topic_filtering(publisher_config): pub = EventPublisherFactory.create(publisher_config, DP_RANK) from .conftest import MockSubscriber + sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo") sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar") @@ -137,11 +141,13 @@ def test_topic_filtering(publisher_config): foo_received = [sub_foo.receive_one(timeout=200) for _ in range(3)] assert all(msg is not None for msg in foo_received), ( - "Subscriber with matching topic should receive messages") + "Subscriber with matching topic should receive messages" + ) bar_received = [sub_bar.receive_one(timeout=200) for _ in range(3)] assert all(msg is None for msg in bar_received), ( - "Subscriber with non-matching topic should receive no messages") + "Subscriber with non-matching topic should receive no messages" + ) finally: pub.shutdown() sub_foo.close() @@ -178,8 +184,7 @@ def publish_events(): publisher_thread.join() - assert len(received) >= num_batches * 0.9, ( - "We should have received most messages") + assert len(received) >= num_batches * 0.9, "We should have received most messages" seqs = [seq for seq, _ in received] assert sorted(seqs) == seqs, "Sequence numbers should be in order" @@ -209,13 +214,15 @@ def test_data_parallel_rank_tagging(publisher_config): # For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558 expected_endpoint_0 = base_endpoint # rank 0 gets port + 0 = same port expected_endpoint_1 = base_endpoint.replace( - ":5557", ":5558") # rank 1 gets port + 1 + ":5557", ":5558" + ) # rank 1 gets port + 1 else: # For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1 expected_endpoint_0 = base_endpoint # rank 0 gets base expected_endpoint_1 = base_endpoint + 
"_dp1" # rank 1 gets _dp1 from .conftest import MockSubscriber + sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic) sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic) @@ -241,15 +248,15 @@ def test_data_parallel_rank_tagging(publisher_config): # Verify DP rank tagging assert received_0.data_parallel_rank == 0, ( - f"Expected DP rank 0, got {received_0.data_parallel_rank}") + f"Expected DP rank 0, got {received_0.data_parallel_rank}" + ) assert received_1.data_parallel_rank == 1, ( - f"Expected DP rank 1, got {received_1.data_parallel_rank}") + f"Expected DP rank 1, got {received_1.data_parallel_rank}" + ) # Verify event content is correct - assert len( - received_0.events) == 2, "Wrong number of events from rank 0" - assert len( - received_1.events) == 3, "Wrong number of events from rank 1" + assert len(received_0.events) == 2, "Wrong number of events from rank 0" + assert len(received_1.events) == 3, "Wrong number of events from rank 1" finally: pub_0.shutdown() diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py index f641bf160414..a010e5b0f709 100644 --- a/tests/distributed/test_expert_parallel.py +++ b/tests/distributed/test_expert_parallel.py @@ -46,28 +46,24 @@ def detailed( ): return EPTestSettings( parallel_setups=[ - ParallelSetup(tp_size=tp_base, - eager_mode=False, - chunked_prefill=False), - ParallelSetup(tp_size=tp_base, - eager_mode=False, - chunked_prefill=True), - ParallelSetup(tp_size=tp_base, - eager_mode=True, - chunked_prefill=False), - ParallelSetup(tp_size=2 * tp_base, - eager_mode=False, - chunked_prefill=True), - ParallelSetup(tp_size=2 * tp_base, - eager_mode=True, - chunked_prefill=False), + ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=False), + ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=True), + ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False), + ParallelSetup( + tp_size=2 * tp_base, eager_mode=False, chunked_prefill=True + ), + ParallelSetup( + tp_size=2 * tp_base, eager_mode=True, chunked_prefill=False + ), ], distributed_backends=["mp", "ray"], task=task, - test_options=EPTestOptions(trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, - load_format=load_format, - hf_overrides=hf_overrides), + test_options=EPTestOptions( + trust_remote_code=trust_remote_code, + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides, + ), ) @staticmethod @@ -82,16 +78,16 @@ def fast( ): return EPTestSettings( parallel_setups=[ - ParallelSetup(tp_size=tp_base, - eager_mode=True, - chunked_prefill=False), + ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False), ], distributed_backends=["mp"], task=task, - test_options=EPTestOptions(trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, - load_format=load_format, - hf_overrides=hf_overrides), + test_options=EPTestOptions( + trust_remote_code=trust_remote_code, + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides, + ), ) def iter_params(self, model_name: str): @@ -99,8 +95,7 @@ def iter_params(self, model_name: str): for parallel_setup in self.parallel_setups: for distributed_backend in self.distributed_backends: - yield (model_name, parallel_setup, distributed_backend, - self.task, opts) + yield (model_name, parallel_setup, distributed_backend, self.task, opts) # NOTE: You can adjust tp_base locally to fit the model in GPU diff --git 
a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py index ef17a51fff0e..8d818edbb3bd 100644 --- a/tests/distributed/test_multi_node_assignment.py +++ b/tests/distributed/test_multi_node_assignment.py @@ -24,14 +24,13 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" -@pytest.mark.skipif(not VLLM_MULTI_NODE, - reason="Need at least 2 nodes to run the test.") +@pytest.mark.skipif( + not VLLM_MULTI_NODE, reason="Need at least 2 nodes to run the test." +) def test_multi_node_assignment() -> None: - # NOTE: important to keep this class definition here # to let ray use cloudpickle to serialize it. class Actor: - def get_ip(self): return get_ip() @@ -41,8 +40,7 @@ def get_ip(self): current_ip = get_ip() workers = [] - for bundle_id, bundle in enumerate( - config.placement_group.bundle_specs): + for bundle_id, bundle in enumerate(config.placement_group.bundle_specs): if not bundle.get("GPU", 0): continue scheduling_strategy = PlacementGroupSchedulingStrategy( diff --git a/tests/distributed/test_node_count.py b/tests/distributed/test_node_count.py index e3c36ef5ef37..b48c025aa1a2 100644 --- a/tests/distributed/test_node_count.py +++ b/tests/distributed/test_node_count.py @@ -32,12 +32,15 @@ # Expected node count based on environment variable) expected = int(os.environ.get("NUM_NODES", "1")) - assert test_result == expected, \ - f"Expected {expected} nodes, got {test_result}" + assert test_result == expected, f"Expected {expected} nodes, got {test_result}" if pg == dist.group.WORLD: - print(f"Node count test passed! Got {test_result} nodes " - f"when using torch distributed!") + print( + f"Node count test passed! Got {test_result} nodes " + f"when using torch distributed!" + ) else: - print(f"Node count test passed! Got {test_result} nodes " - f"when using StatelessProcessGroup!") + print( + f"Node count test passed! Got {test_result} nodes " + f"when using StatelessProcessGroup!" + ) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 926a33c949eb..5bc71a0bbb8c 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -7,6 +7,7 @@ all workers in a node other than the head node, which can cause the test to fail. """ + import json import os from dataclasses import dataclass @@ -35,7 +36,7 @@ def use_v0_only(monkeypatch): weights. Once we enable V1 by default for PP, we can remove this. 
""" - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") class ParallelSetup(NamedTuple): @@ -68,7 +69,8 @@ def __post_init__(self): raise ValueError( f"Length mismatch: distributed_backends " f"({len(self.distributed_backends)}) != " - f"vllm_major_versions ({len(self.vllm_major_versions)})") + f"vllm_major_versions ({len(self.vllm_major_versions)})" + ) @staticmethod def detailed( @@ -81,32 +83,43 @@ def detailed( ): return PPTestSettings( parallel_setups=[ - ParallelSetup(tp_size=tp_base, - pp_size=pp_base, - eager_mode=False, - chunked_prefill=False), - ParallelSetup(tp_size=tp_base, - pp_size=2 * pp_base, - eager_mode=False, - chunked_prefill=True), - ParallelSetup(tp_size=tp_base, - pp_size=2 * pp_base, - eager_mode=True, - chunked_prefill=False), - ParallelSetup(tp_size=2 * tp_base, - pp_size=pp_base, - eager_mode=False, - chunked_prefill=True), - ParallelSetup(tp_size=2 * tp_base, - pp_size=pp_base, - eager_mode=True, - chunked_prefill=False), + ParallelSetup( + tp_size=tp_base, + pp_size=pp_base, + eager_mode=False, + chunked_prefill=False, + ), + ParallelSetup( + tp_size=tp_base, + pp_size=2 * pp_base, + eager_mode=False, + chunked_prefill=True, + ), + ParallelSetup( + tp_size=tp_base, + pp_size=2 * pp_base, + eager_mode=True, + chunked_prefill=False, + ), + ParallelSetup( + tp_size=2 * tp_base, + pp_size=pp_base, + eager_mode=False, + chunked_prefill=True, + ), + ParallelSetup( + tp_size=2 * tp_base, + pp_size=pp_base, + eager_mode=True, + chunked_prefill=False, + ), ], distributed_backends=["mp", "mp", "ray", "ray"], vllm_major_versions=["0", "1", "0", "1"], task=task, - test_options=PPTestOptions(multi_node_only=multi_node_only, - load_format=load_format), + test_options=PPTestOptions( + multi_node_only=multi_node_only, load_format=load_format + ), ) @staticmethod @@ -120,26 +133,36 @@ def fast( ): return PPTestSettings( parallel_setups=[ - ParallelSetup(tp_size=tp_base, - pp_size=pp_base, - eager_mode=True, - chunked_prefill=False), + ParallelSetup( + tp_size=tp_base, + pp_size=pp_base, + eager_mode=True, + chunked_prefill=False, + ), ], distributed_backends=["mp"], vllm_major_versions=["0"], task=task, - test_options=PPTestOptions(multi_node_only=multi_node_only, - load_format=load_format), + test_options=PPTestOptions( + multi_node_only=multi_node_only, load_format=load_format + ), ) def iter_params(self, model_id: str): opts = self.test_options for parallel_setup in self.parallel_setups: - for backend, vllm_major_version in zip(self.distributed_backends, - self.vllm_major_versions): - yield (model_id, parallel_setup, backend, vllm_major_version, - self.task, opts) + for backend, vllm_major_version in zip( + self.distributed_backends, self.vllm_major_versions + ): + yield ( + model_id, + parallel_setup, + backend, + vllm_major_version, + self.task, + opts, + ) # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU @@ -317,8 +340,10 @@ def _compare_tp( if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") if VLLM_MULTI_NODE and distributed_backend == "mp": - pytest.skip("Skipping multi-node pipeline parallel test for " - "multiprocessing distributed backend") + pytest.skip( + "Skipping multi-node pipeline parallel test for " + "multiprocessing distributed backend" + ) if multi_node_only and not VLLM_MULTI_NODE: pytest.skip("Not in multi-node setting") @@ -348,8 +373,7 @@ def _compare_tp( specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill testing_ray_compiled_graph 
= False - if distributed_backend == "ray" and (vllm_major_version == "1" - or specific_case): + if distributed_backend == "ray" and (vllm_major_version == "1" or specific_case): # For V1, test Ray Compiled Graph for all the tests # For V0, test Ray Compiled Graph for a subset of the tests pp_env = { @@ -398,12 +422,7 @@ def _compare_tp( ] try: - compare_two_settings(model_id, - pp_args, - tp_args, - pp_env, - tp_env, - method=method) + compare_two_settings(model_id, pp_args, tp_args, pp_env, tp_env, method=method) except Exception: if testing_ray_compiled_graph and vllm_major_version == "0": # Ray Compiled Graph tests are flaky for V0, @@ -414,11 +433,19 @@ def _compare_tp( @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "task", "test_options"), + ( + "model_id", + "parallel_setup", + "distributed_backend", + "vllm_major_version", + "task", + "test_options", + ), [ - params for model_id, settings in TEXT_GENERATION_MODELS.items() - for params in settings.iter_params(model_id) if model_id in TEST_MODELS + params + for model_id, settings in TEXT_GENERATION_MODELS.items() + for params in settings.iter_params(model_id) + if model_id in TEST_MODELS ], ) @create_new_process_for_each_test() @@ -431,23 +458,33 @@ def test_tp_language_generation( test_options: PPTestOptions, num_gpus_available, ): - _compare_tp(model_id, - parallel_setup, - distributed_backend, - vllm_major_version, - task, - test_options, - num_gpus_available, - method="generate", - is_multimodal=False) + _compare_tp( + model_id, + parallel_setup, + distributed_backend, + vllm_major_version, + task, + test_options, + num_gpus_available, + method="generate", + is_multimodal=False, + ) @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "task", "test_options"), + ( + "model_id", + "parallel_setup", + "distributed_backend", + "vllm_major_version", + "task", + "test_options", + ), [ - params for model_id, settings in EMBEDDING_MODELS.items() - for params in settings.iter_params(model_id) if model_id in TEST_MODELS + params + for model_id, settings in EMBEDDING_MODELS.items() + for params in settings.iter_params(model_id) + if model_id in TEST_MODELS ], ) @create_new_process_for_each_test() @@ -460,23 +497,33 @@ def test_tp_language_embedding( test_options: PPTestOptions, num_gpus_available, ): - _compare_tp(model_id, - parallel_setup, - distributed_backend, - vllm_major_version, - task, - test_options, - num_gpus_available, - method="encode", - is_multimodal=False) + _compare_tp( + model_id, + parallel_setup, + distributed_backend, + vllm_major_version, + task, + test_options, + num_gpus_available, + method="encode", + is_multimodal=False, + ) @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "task", "test_options"), + ( + "model_id", + "parallel_setup", + "distributed_backend", + "vllm_major_version", + "task", + "test_options", + ), [ - params for model_id, settings in MULTIMODAL_MODELS.items() - for params in settings.iter_params(model_id) if model_id in TEST_MODELS + params + for model_id, settings in MULTIMODAL_MODELS.items() + for params in settings.iter_params(model_id) + if model_id in TEST_MODELS ], ) @create_new_process_for_each_test() @@ -489,12 +536,14 @@ def test_tp_multimodal_generation( test_options: PPTestOptions, num_gpus_available, ): - _compare_tp(model_id, - parallel_setup, - distributed_backend, - vllm_major_version, - task, - test_options, - 
num_gpus_available, - method="generate", - is_multimodal=True) + _compare_tp( + model_id, + parallel_setup, + distributed_backend, + vllm_major_version, + task, + test_options, + num_gpus_available, + method="generate", + is_multimodal=True, + ) diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py index 69ceedd345a8..4df6f43970d7 100644 --- a/tests/distributed/test_pipeline_partition.py +++ b/tests/distributed/test_pipeline_partition.py @@ -9,7 +9,6 @@ def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: def _verify(partition_str, num_layers, pp_size, goldens): @@ -57,7 +56,8 @@ def _verify(partition_str, num_layers, pp_size, goldens): (5, 3, 0, (0, 2)), (5, 3, 1, (2, 4)), (5, 3, 2, (4, 5)), - ]) + ], +) def test_uneven_auto_partition( num_hidden_layers: int, pp_size: int, diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index a027a9e37dd6..518b1bf76fd7 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -12,13 +12,19 @@ from typing_extensions import LiteralString -@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ - (2, "JackFram/llama-160m"), -]) -@pytest.mark.parametrize("ATTN_BACKEND", [ - "FLASH_ATTN", - "FLASHINFER", -]) +@pytest.mark.parametrize( + "PP_SIZE, MODEL_NAME", + [ + (2, "JackFram/llama-160m"), + ], +) +@pytest.mark.parametrize( + "ATTN_BACKEND", + [ + "FLASH_ATTN", + "FLASHINFER", + ], +) @create_new_process_for_each_test() def test_pp_cudagraph( monkeypatch: pytest.MonkeyPatch, diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index abfad9ebfe7d..4bab709fb589 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -9,13 +9,15 @@ import torch import torch.distributed -from vllm.distributed.communication_op import ( # noqa - tensor_model_parallel_all_reduce) +from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary -from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - get_world_group, graph_capture, - init_distributed_environment) +from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, + get_world_group, + graph_capture, + init_distributed_environment, +) from vllm.utils import update_environment_variables @@ -24,13 +26,13 @@ def distributed_run(fn, world_size): processes: list[multiprocessing.Process] = [] for i in range(number_of_processes): env: dict[str, str] = {} - env['RANK'] = str(i) - env['LOCAL_RANK'] = str(i) - env['WORLD_SIZE'] = str(number_of_processes) - env['LOCAL_WORLD_SIZE'] = str(number_of_processes) - env['MASTER_ADDR'] = 'localhost' - env['MASTER_PORT'] = '12345' - p = multiprocessing.Process(target=fn, args=(env, )) + env["RANK"] = str(i) + env["LOCAL_RANK"] = str(i) + env["WORLD_SIZE"] = str(number_of_processes) + env["LOCAL_WORLD_SIZE"] = str(number_of_processes) + env["MASTER_ADDR"] = "localhost" + env["MASTER_PORT"] = "12345" + p = multiprocessing.Process(target=fn, args=(env,)) processes.append(p) p.start() @@ -47,7 +49,7 @@ def worker_fn_wrapper(fn): # and update the environment variables in the function def wrapped_fn(env): update_environment_variables(env) - local_rank = os.environ['LOCAL_RANK'] + local_rank = os.environ["LOCAL_RANK"] device = 
torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) init_distributed_environment() @@ -58,17 +60,18 @@ def wrapped_fn(env): @worker_fn_wrapper def worker_fn(): - pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, - device=get_world_group().device) - tensor = torch.ones(16, 1024, 1024, - dtype=torch.float32).cuda(pynccl_comm.rank) + pynccl_comm = PyNcclCommunicator( + get_world_group().cpu_group, device=get_world_group().device + ) + tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) tensor = pynccl_comm.all_reduce(tensor) torch.cuda.synchronize() assert torch.all(tensor == pynccl_comm.world_size).cpu().item() -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." +) def test_pynccl(): distributed_run(worker_fn, 2) @@ -78,7 +81,7 @@ def multiple_allreduce_worker_fn(): device = torch.device(f"cuda:{torch.distributed.get_rank()}") groups = [ torch.distributed.new_group(ranks=[0, 1], backend="gloo"), - torch.distributed.new_group(ranks=[2, 3], backend="gloo") + torch.distributed.new_group(ranks=[2, 3], backend="gloo"), ] group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1] pynccl_comm = PyNcclCommunicator(group=group, device=device) @@ -95,8 +98,9 @@ def multiple_allreduce_worker_fn(): assert torch.all(tensor == 2).cpu().item() -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." +) def test_pynccl_multiple_allreduce(): # this tests pynccl for multiple tp groups, in a standalone way # i.e. call `pynccl_comm.all_reduce` directly @@ -121,8 +125,9 @@ def multiple_allreduce_with_vllm_worker_fn(): assert torch.all(tensor == 2).cpu().item() -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." +) def test_pynccl_multiple_allreduce_with_vllm(): # this tests pynccl for multiple tp groups, together with vllm # i.e. 
call `tensor_model_parallel_all_reduce` @@ -133,10 +138,11 @@ def test_pynccl_multiple_allreduce_with_vllm(): def worker_fn_with_cudagraph(): with torch.no_grad(): graph = torch.cuda.CUDAGraph() - pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, - device=get_world_group().device) + pynccl_comm = PyNcclCommunicator( + get_world_group().cpu_group, device=get_world_group().device + ) # run something in the default stream to initialize torch engine - a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}') + a = torch.ones((4, 4), device=f"cuda:{pynccl_comm.rank}") torch.cuda.synchronize() with torch.cuda.graph(graph): a_out = pynccl_comm.all_reduce(a) @@ -148,84 +154,90 @@ def worker_fn_with_cudagraph(): @worker_fn_wrapper def all_gather_worker_fn(): - pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, - device=get_world_group().device) + pynccl_comm = PyNcclCommunicator( + get_world_group().cpu_group, device=get_world_group().device + ) rank = pynccl_comm.rank world_size = pynccl_comm.world_size - device = f'cuda:{pynccl_comm.rank}' + device = f"cuda:{pynccl_comm.rank}" num_elems = 1000 - tensor = torch.arange(num_elems, dtype=torch.float32, - device=device) + rank * num_elems - result = torch.zeros(num_elems * world_size, - dtype=torch.float32, - device=device) - - expected = torch.cat([ - torch.arange(num_elems, dtype=torch.float32) + r * num_elems - for r in range(world_size) - ]).to(device) + tensor = ( + torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems + ) + result = torch.zeros(num_elems * world_size, dtype=torch.float32, device=device) + + expected = torch.cat( + [ + torch.arange(num_elems, dtype=torch.float32) + r * num_elems + for r in range(world_size) + ] + ).to(device) pynccl_comm.all_gather(result, tensor) torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." +) def test_pynccl_all_gather(): distributed_run(all_gather_worker_fn, 2) @worker_fn_wrapper def all_gatherv_worker_fn(): - pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, - device=get_world_group().device) + pynccl_comm = PyNcclCommunicator( + get_world_group().cpu_group, device=get_world_group().device + ) rank = pynccl_comm.rank world_size = pynccl_comm.world_size - device = f'cuda:{pynccl_comm.rank}' + device = f"cuda:{pynccl_comm.rank}" assert world_size <= 8 sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size] num_elems = sizes[rank] - tensor = torch.arange(num_elems, dtype=torch.float32, - device=device) + rank * 100 + tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100 result = torch.zeros(sum(sizes), dtype=torch.float32, device=device) - expected = torch.cat([ - torch.arange(sizes[r], dtype=torch.float32) + r * 100 - for r in range(world_size) - ]).to(device) + expected = torch.cat( + [ + torch.arange(sizes[r], dtype=torch.float32) + r * 100 + for r in range(world_size) + ] + ).to(device) pynccl_comm.all_gatherv(result, tensor, sizes=sizes) torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." 
+) def test_pynccl_all_gatherv(): distributed_run(all_gatherv_worker_fn, 2) @worker_fn_wrapper def reduce_scatter_worker_fn(): - pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, - device=get_world_group().device) + pynccl_comm = PyNcclCommunicator( + get_world_group().cpu_group, device=get_world_group().device + ) rank = pynccl_comm.rank world_size = pynccl_comm.world_size - device = f'cuda:{pynccl_comm.rank}' + device = f"cuda:{pynccl_comm.rank}" num_elems = 1000 - tensor = torch.arange(num_elems, dtype=torch.float32, - device=device) + rank * num_elems - assert (num_elems % world_size == 0) - result = torch.zeros(num_elems // world_size, - dtype=torch.float32, - device=device) + tensor = ( + torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems + ) + assert num_elems % world_size == 0 + result = torch.zeros(num_elems // world_size, dtype=torch.float32, device=device) # Calculate expected result for this rank's chunk scattered_size = num_elems // world_size @@ -233,34 +245,37 @@ def reduce_scatter_worker_fn(): torch.arange(num_elems, dtype=torch.float32) + r * num_elems for r in range(world_size) ] - expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size] - for tensor in all_tensors).to(device) + expected = sum( + tensor[rank * scattered_size : (rank + 1) * scattered_size] + for tensor in all_tensors + ).to(device) pynccl_comm.reduce_scatter(result, tensor) torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." +) def test_pynccl_reduce_scatter(): distributed_run(reduce_scatter_worker_fn, 2) @worker_fn_wrapper def reduce_scatterv_worker_fn(): - pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, - device=get_world_group().device) + pynccl_comm = PyNcclCommunicator( + get_world_group().cpu_group, device=get_world_group().device + ) rank = pynccl_comm.rank world_size = pynccl_comm.world_size - device = f'cuda:{pynccl_comm.rank}' + device = f"cuda:{pynccl_comm.rank}" assert world_size <= 8 sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size] num_elems = sum(sizes) - tensor = torch.arange(num_elems, dtype=torch.float32, - device=device) + rank * 100 + tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100 result = torch.zeros(sizes[rank], dtype=torch.float32, device=device) # Calculate expected result for this rank's chunk @@ -278,41 +293,41 @@ def reduce_scatterv_worker_fn(): torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." +) def test_pynccl_reduce_scatterv(): distributed_run(reduce_scatterv_worker_fn, 2) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." 
+) def test_pynccl_with_cudagraph(): distributed_run(worker_fn_with_cudagraph, 2) @worker_fn_wrapper def send_recv_worker_fn(): - pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, - device=get_world_group().device) + pynccl_comm = PyNcclCommunicator( + get_world_group().cpu_group, device=get_world_group().device + ) if pynccl_comm.rank == 0: - tensor = torch.ones(16, 1024, 1024, - dtype=torch.float32).cuda(pynccl_comm.rank) + tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) else: - tensor = torch.empty(16, 1024, 1024, - dtype=torch.float32).cuda(pynccl_comm.rank) + tensor = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) if pynccl_comm.rank == 0: - pynccl_comm.send(tensor, - dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) + pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) else: - pynccl_comm.recv(tensor, - src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) + pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() assert torch.all(tensor == 1).cpu().item() -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." +) def test_pynccl_send_recv(): distributed_run(send_recv_worker_fn, 2) @@ -322,27 +337,20 @@ def multiple_send_recv_worker_fn(): device = torch.device(f"cuda:{torch.distributed.get_rank()}") groups = [ torch.distributed.new_group(ranks=[0, 2], backend="gloo"), - torch.distributed.new_group(ranks=[1, 3], backend="gloo") + torch.distributed.new_group(ranks=[1, 3], backend="gloo"), ] group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1] pynccl_comm = PyNcclCommunicator(group=group, device=device) if torch.distributed.get_rank() == 0: tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) elif torch.distributed.get_rank() == 1: - tensor = 2 * torch.ones( - 16, 1024, 1024, dtype=torch.float32, device=device) + tensor = 2 * torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) else: - tensor = torch.empty(16, - 1024, - 1024, - dtype=torch.float32, - device=device) + tensor = torch.empty(16, 1024, 1024, dtype=torch.float32, device=device) if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.send(tensor, - dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) + pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) else: - pynccl_comm.recv(tensor, - src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) + pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() if torch.distributed.get_rank() in [0, 2]: assert torch.all(tensor == 1).cpu().item() @@ -350,14 +358,16 @@ def multiple_send_recv_worker_fn(): assert torch.all(tensor == 2).cpu().item() -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." +) def test_pynccl_multiple_send_recv(): distributed_run(multiple_send_recv_worker_fn, 4) -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") +@pytest.mark.skipif( + torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." 
+) def test_pynccl_broadcast(): distributed_run(broadcast_worker_fn, 4) @@ -366,19 +376,17 @@ def test_pynccl_broadcast(): def broadcast_worker_fn(): # Test broadcast for every root rank. # Essentially this is an all-gather operation. - pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, - device=get_world_group().device) + pynccl_comm = PyNcclCommunicator( + get_world_group().cpu_group, device=get_world_group().device + ) recv_tensors = [ - torch.empty(16, - 1024, - 1024, - dtype=torch.float32, - device=pynccl_comm.device) + torch.empty(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device) for i in range(pynccl_comm.world_size) ] - recv_tensors[pynccl_comm.rank] = torch.ones( - 16, 1024, 1024, dtype=torch.float32, - device=pynccl_comm.device) * pynccl_comm.rank + recv_tensors[pynccl_comm.rank] = ( + torch.ones(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device) + * pynccl_comm.rank + ) for i in range(pynccl_comm.world_size): pynccl_comm.broadcast(recv_tensors[i], src=i) diff --git a/tests/distributed/test_quick_all_reduce.py b/tests/distributed/test_quick_all_reduce.py index a4added29144..579bc55dbba7 100644 --- a/tests/distributed/test_quick_all_reduce.py +++ b/tests/distributed/test_quick_all_reduce.py @@ -8,21 +8,24 @@ import torch import torch.distributed as dist -from vllm.distributed.communication_op import ( # noqa - tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, - get_tp_group, graph_capture) +from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_group, + get_tp_group, + graph_capture, +) from vllm.platforms import current_platform -from ..utils import (ensure_model_parallel_initialized, - init_test_distributed_environment, multi_process_parallel) +from ..utils import ( + ensure_model_parallel_initialized, + init_test_distributed_environment, + multi_process_parallel, +) torch.manual_seed(42) random.seed(44) # Size over 8MB is sufficient for custom quick allreduce. 
-test_sizes = [ - random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8) -] +test_sizes = [random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)] for i, v in enumerate(test_sizes): test_sizes[i] -= v % 8 @@ -39,8 +42,7 @@ def graph_quickreduce( m.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) group = get_tensor_model_parallel_group().device_group @@ -65,18 +67,15 @@ def graph_quickreduce( for sz in test_sizes: for dtype in [torch.float16, torch.bfloat16]: with graph_capture(device=device) as graph_capture_context: - inp1 = torch.randint(1, - 23, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - inp2 = torch.randint(-23, - 1, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) + inp1 = torch.randint( + 1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device() + ) + inp2 = torch.randint( + -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device() + ) torch.cuda.synchronize() graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph, - stream=graph_capture_context.stream): + with torch.cuda.graph(graph, stream=graph_capture_context.stream): for _ in range(num_communication): out1 = tensor_model_parallel_all_reduce(inp1) dist.all_reduce(inp1, group=group) @@ -100,39 +99,42 @@ def eager_quickreduce( device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) + init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) # Size over 8MB is sufficient for custom quick allreduce. 
sz = 16 * 1024 * 1024 fa = get_tp_group().device_communicator.qr_comm - inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)], - dtype=torch.float16, - device=device) + inp = torch.tensor( + [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.float16, device=device + ) out = fa.quick_all_reduce(inp) torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1) - inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)], - dtype=torch.bfloat16, - device=device) + inp = torch.tensor( + [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.bfloat16, device=device + ) out = fa.quick_all_reduce(inp) torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1) -@pytest.mark.skipif(not current_platform.is_rocm(), - reason="only test quick allreduce for rocm") +@pytest.mark.skipif( + not current_platform.is_rocm(), reason="only test quick allreduce for rocm" +) @pytest.mark.parametrize("quant_mode", ["FP", "INT8", "INT6", "INT4"]) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pipeline_parallel_size", [1, 2]) @pytest.mark.parametrize("test_target", [graph_quickreduce, eager_quickreduce]) -def test_custom_quick_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size, - pipeline_parallel_size, test_target, - quant_mode): +def test_custom_quick_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pipeline_parallel_size, + test_target, + quant_mode, +): world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode) - multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, - test_target) + multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target) diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py index 94ad8f4f1213..baf75fd48c63 100644 --- a/tests/distributed/test_same_node.py +++ b/tests/distributed/test_same_node.py @@ -22,15 +22,13 @@ dist.broadcast_object_list(recv, src=0) ip, port = recv - stateless_pg = StatelessProcessGroup.create(ip, port, rank, - dist.get_world_size()) + stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size()) for pg in [dist.group.WORLD, stateless_pg]: test_result = all(in_the_same_node_as(pg, source_rank=0)) expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1" - assert test_result == expected, \ - f"Expected {expected}, got {test_result}" + assert test_result == expected, f"Expected {expected}, got {test_result}" if pg == dist.group.WORLD: print("Same node test passed! when using torch distributed!") else: diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index b2f6a8ab9dd3..67f960f294d4 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -7,6 +7,7 @@ all workers in a node other than the head node, which can cause the test to fail. 
""" + import json import os from dataclasses import dataclass @@ -56,7 +57,8 @@ def __post_init__(self): raise ValueError( f"Length mismatch: distributed_backends " f"({len(self.distributed_backends)}) != " - f"vllm_major_versions ({len(self.vllm_major_versions)})") + f"vllm_major_versions ({len(self.vllm_major_versions)})" + ) @staticmethod def detailed( @@ -72,18 +74,22 @@ def detailed( for pp_multiplier in [1, 2]: for chunked_prefill_val in [False, True]: parallel_setups.append( - ParallelSetup(tp_size=tp_base, - pp_size=pp_multiplier * pp_base, - enable_fusion=False, - eager_mode=eager_mode_val, - chunked_prefill=chunked_prefill_val)) + ParallelSetup( + tp_size=tp_base, + pp_size=pp_multiplier * pp_base, + enable_fusion=False, + eager_mode=eager_mode_val, + chunked_prefill=chunked_prefill_val, + ) + ) return SPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp", "ray"], vllm_major_versions=["1", "1"], task=task, - test_options=SPTestOptions(multi_node_only=multi_node_only, - load_format=load_format), + test_options=SPTestOptions( + multi_node_only=multi_node_only, load_format=load_format + ), ) @staticmethod @@ -100,18 +106,22 @@ def fast( for pp_multiplier in [1, 2]: for chunked_prefill_val in [False, True]: parallel_setups.append( - ParallelSetup(tp_size=tp_base, - pp_size=pp_multiplier * pp_base, - enable_fusion=False, - eager_mode=eager_mode_val, - chunked_prefill=chunked_prefill_val)) + ParallelSetup( + tp_size=tp_base, + pp_size=pp_multiplier * pp_base, + enable_fusion=False, + eager_mode=eager_mode_val, + chunked_prefill=chunked_prefill_val, + ) + ) return SPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp", "ray"], vllm_major_versions=["1", "1"], task=task, - test_options=SPTestOptions(multi_node_only=multi_node_only, - load_format=load_format), + test_options=SPTestOptions( + multi_node_only=multi_node_only, load_format=load_format + ), ) @staticmethod @@ -126,28 +136,39 @@ def fp8_quant( parallel_setups = [] for fusion_val in [False, True]: parallel_setups.append( - ParallelSetup(tp_size=tp_base, - pp_size=pp_base, - enable_fusion=fusion_val, - eager_mode=True, - chunked_prefill=False)) + ParallelSetup( + tp_size=tp_base, + pp_size=pp_base, + enable_fusion=fusion_val, + eager_mode=True, + chunked_prefill=False, + ) + ) return SPTestSettings( parallel_setups=parallel_setups, distributed_backends=["mp", "ray"], vllm_major_versions=["1", "1"], task=task, - test_options=SPTestOptions(multi_node_only=multi_node_only, - load_format=load_format), + test_options=SPTestOptions( + multi_node_only=multi_node_only, load_format=load_format + ), ) def iter_params(self, model_id: str): opts = self.test_options for parallel_setup in self.parallel_setups: - for backend, vllm_major_version in zip(self.distributed_backends, - self.vllm_major_versions): - yield (model_id, parallel_setup, backend, vllm_major_version, - self.task, opts) + for backend, vllm_major_version in zip( + self.distributed_backends, self.vllm_major_versions + ): + yield ( + model_id, + parallel_setup, + backend, + vllm_major_version, + self.task, + opts, + ) def _compare_sp( @@ -199,8 +220,10 @@ def _compare_sp( if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") if VLLM_MULTI_NODE and distributed_backend == "mp": - pytest.skip("Skipping multi-node pipeline parallel test for " - "multiprocessing distributed backend") + pytest.skip( + "Skipping multi-node pipeline parallel test for " + "multiprocessing distributed backend" + ) if 
multi_node_only and not VLLM_MULTI_NODE: pytest.skip("Not in multi-node setting") @@ -229,14 +252,14 @@ def _compare_sp( common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) compilation_config = { - 'level': 3, - 'custom_ops': ["+rms_norm"], - 'compile_sizes': [4, 8], - 'splitting_ops': [], - 'pass_config': { - 'enable_sequence_parallelism': True, - 'enable_fusion': enable_fusion, - 'enable_noop': True, + "level": 3, + "custom_ops": ["+rms_norm"], + "compile_sizes": [4, 8], + "splitting_ops": [], + "pass_config": { + "enable_sequence_parallelism": True, + "enable_fusion": enable_fusion, + "enable_noop": True, }, } @@ -266,12 +289,9 @@ def _compare_sp( ] try: - compare_two_settings(model_id, - tp_sp_args, - tp_args, - tp_sp_env, - tp_env, - method=method) + compare_two_settings( + model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method + ) except Exception: testing_ray_compiled_graph = tp_sp_env is not None if testing_ray_compiled_graph and vllm_major_version == "0": @@ -292,15 +312,22 @@ def _compare_sp( # TODO support other models # [LANGUAGE GENERATION] "meta-llama/Llama-3.2-1B-Instruct", - "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" + "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", ] @pytest.mark.parametrize( - ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", - "task", "test_options"), + ( + "model_id", + "parallel_setup", + "distributed_backend", + "vllm_major_version", + "task", + "test_options", + ), [ - params for model_id, settings in SP_TEXT_GENERATION_MODELS.items() + params + for model_id, settings in SP_TEXT_GENERATION_MODELS.items() for params in settings.iter_params(model_id) if model_id in SP_TEST_MODELS ], @@ -315,12 +342,14 @@ def test_tp_sp_generation( test_options: SPTestOptions, num_gpus_available, ): - _compare_sp(model_id, - parallel_setup, - distributed_backend, - vllm_major_version, - task, - test_options, - num_gpus_available, - method="generate", - is_multimodal=False) + _compare_sp( + model_id, + parallel_setup, + distributed_backend, + vllm_major_version, + task, + test_options, + num_gpus_available, + method="generate", + is_multimodal=False, + ) diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index e1357b4a34e9..cdea1bfe8f28 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -26,13 +26,13 @@ def distributed_run(fn, world_size): processes = [] for i in range(number_of_processes): env = {} - env['RANK'] = str(i) - env['LOCAL_RANK'] = str(i) - env['WORLD_SIZE'] = str(number_of_processes) - env['LOCAL_WORLD_SIZE'] = str(number_of_processes) - env['MASTER_ADDR'] = 'localhost' - env['MASTER_PORT'] = '12345' - p = multiprocessing.Process(target=fn, args=(env, )) + env["RANK"] = str(i) + env["LOCAL_RANK"] = str(i) + env["WORLD_SIZE"] = str(number_of_processes) + env["LOCAL_WORLD_SIZE"] = str(number_of_processes) + env["MASTER_ADDR"] = "localhost" + env["MASTER_PORT"] = "12345" + p = multiprocessing.Process(target=fn, args=(env,)) processes.append(p) p.start() @@ -57,25 +57,23 @@ def wrapped_fn(env): @worker_fn_wrapper def worker_fn(): - rank = dist.get_rank() if rank == 0: port = get_open_port() - ip = '127.0.0.1' + ip = "127.0.0.1" dist.broadcast_object_list([ip, port], src=0) else: recv = [None, None] dist.broadcast_object_list(recv, src=0) ip, port = recv # type: ignore - stateless_pg = StatelessProcessGroup.create(ip, port, rank, - dist.get_world_size()) + stateless_pg = StatelessProcessGroup.create(ip, port, rank, 
dist.get_world_size()) for pg in [dist.group.WORLD, stateless_pg]: - writer_rank = 2 broadcaster = MessageQueue.create_from_process_group( - pg, 40 * 1024, 2, writer_rank) + pg, 40 * 1024, 2, writer_rank + ) if rank == writer_rank: seed = random.randint(0, 1000) dist.broadcast_object_list([seed], writer_rank) diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index 9f2c3eaec359..f415409d7b37 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -24,13 +24,15 @@ # set different `gpu_memory_utilization` and `swap_space` for different ranks, # to test if all ranks agree on the same kv cache configuration. -llm = LLM(model="facebook/opt-125m", - tensor_parallel_size=2, - pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)), - distributed_executor_backend="external_launcher", - gpu_memory_utilization=random.uniform(0.7, 0.9), - swap_space=random.randint(1, 4), - seed=0) +llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=2, + pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)), + distributed_executor_backend="external_launcher", + gpu_memory_utilization=random.uniform(0.7, 0.9), + swap_space=random.randint(1, 4), + seed=0, +) outputs = llm.generate(prompts, sampling_params) @@ -48,15 +50,14 @@ def test_consistent_across_ranks(obj): assert container[0] == obj -test_consistent_across_ranks( - llm.llm_engine.vllm_config.cache_config.num_cpu_blocks) -test_consistent_across_ranks( - llm.llm_engine.vllm_config.cache_config.num_gpu_blocks) +test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks) +test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks) # make sure we can access the model parameters from the calling process # of the `LLM` instance. -params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner. 
- model.parameters()) +params = list( + llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters() +) test_consistent_across_ranks(len(params)) # all ranks should have the same outputs @@ -65,5 +66,4 @@ def test_consistent_across_ranks(obj): generated_text = output.outputs[0].text test_consistent_across_ranks(prompt) test_consistent_across_ranks(generated_text) - print(f"Rank {torch_rank}, Prompt: {prompt!r}, " - f"Generated text: {generated_text!r}") + print(f"Rank {torch_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 0287ad94e388..2a6936fcd4c2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -10,21 +10,22 @@ import vllm.envs as envs from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator from vllm.distributed.utils import StatelessProcessGroup -from vllm.utils import (cuda_device_count_stateless, get_open_port, - update_environment_variables) +from vllm.utils import ( + cuda_device_count_stateless, + get_open_port, + update_environment_variables, +) from ..utils import multi_gpu_test @ray.remote class _CUDADeviceCountStatelessTestActor: - def get_count(self): return cuda_device_count_stateless() def set_cuda_visible_devices(self, cuda_visible_devices: str): - update_environment_variables( - {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) + update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) def get_cuda_visible_devices(self): return envs.CUDA_VISIBLE_DEVICES @@ -34,10 +35,9 @@ def test_cuda_device_count_stateless(): """Test that cuda_device_count_stateless changes return value if CUDA_VISIBLE_DEVICES is changed.""" actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore - num_gpus=2).remote() - assert len( - sorted(ray.get( - actor.get_cuda_visible_devices.remote()).split(","))) == 2 + num_gpus=2 + ).remote() + assert len(sorted(ray.get(actor.get_cuda_visible_devices.remote()).split(","))) == 2 assert ray.get(actor.get_count.remote()) == 2 ray.get(actor.set_cuda_visible_devices.remote("0")) assert ray.get(actor.get_count.remote()) == 1 @@ -46,15 +46,13 @@ def test_cuda_device_count_stateless(): def cpu_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(host="127.0.0.1", - port=port1, - rank=rank, - world_size=WORLD_SIZE) + pg1 = StatelessProcessGroup.create( + host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE + ) if rank <= 2: - pg2 = StatelessProcessGroup.create(host="127.0.0.1", - port=port2, - rank=rank, - world_size=3) + pg2 = StatelessProcessGroup.create( + host="127.0.0.1", port=port2, rank=rank, world_size=3 + ) data = torch.tensor([rank]) data = pg1.broadcast_obj(data, src=2) assert data.item() == 2 @@ -68,16 +66,14 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2): def gpu_worker(rank, WORLD_SIZE, port1, port2): torch.cuda.set_device(rank) - pg1 = StatelessProcessGroup.create(host="127.0.0.1", - port=port1, - rank=rank, - world_size=WORLD_SIZE) + pg1 = StatelessProcessGroup.create( + host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE + ) pynccl1 = PyNcclCommunicator(pg1, device=rank) if rank <= 2: - pg2 = StatelessProcessGroup.create(host="127.0.0.1", - port=port2, - rank=rank, - world_size=3) + pg2 = StatelessProcessGroup.create( + host="127.0.0.1", port=port2, rank=rank, world_size=3 + ) pynccl2 = PyNcclCommunicator(pg2, device=rank) data = torch.tensor([rank]).cuda() pynccl1.all_reduce(data) @@ -96,10 +92,9 @@ 
def gpu_worker(rank, WORLD_SIZE, port1, port2): def broadcast_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(host="127.0.0.1", - port=port1, - rank=rank, - world_size=WORLD_SIZE) + pg1 = StatelessProcessGroup.create( + host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE + ) if rank == 2: pg1.broadcast_obj("secret", src=2) else: @@ -109,10 +104,9 @@ def broadcast_worker(rank, WORLD_SIZE, port1, port2): def allgather_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(host="127.0.0.1", - port=port1, - rank=rank, - world_size=WORLD_SIZE) + pg1 = StatelessProcessGroup.create( + host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE + ) data = pg1.all_gather_obj(rank) assert data == list(range(WORLD_SIZE)) pg1.barrier() @@ -121,7 +115,8 @@ def allgather_worker(rank, WORLD_SIZE, port1, port2): @pytest.mark.skip(reason="This test is flaky and prone to hang.") @multi_gpu_test(num_gpus=4) @pytest.mark.parametrize( - "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) + "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker] +) def test_stateless_process_group(worker): port1 = get_open_port() with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: @@ -129,12 +124,14 @@ def test_stateless_process_group(worker): port2 = get_open_port() WORLD_SIZE = 4 from multiprocessing import get_context + ctx = get_context("fork") processes = [] for i in range(WORLD_SIZE): rank = i processes.append( - ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))) + ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2)) + ) for p in processes: p.start() for p in processes: diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index 8b99d9d6e21f..26866a95fb55 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -4,22 +4,24 @@ Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. """ + from typing import Optional import pytest from transformers import AutoModelForSeq2SeqLM -from vllm.attention.selector import (_Backend, _cached_get_attn_backend, - global_force_attn_backend_context_manager) +from vllm.attention.selector import ( + _Backend, + _cached_get_attn_backend, + global_force_attn_backend_context_manager, +) from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs from ..conftest import DecoderPromptType from ..models.utils import check_logprobs_close -LIST_ENC_DEC_SUPPORTED_BACKENDS = [ - _Backend.XFORMERS, _Backend.FLASH_ATTN, None -] +LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN, None] @pytest.fixture(scope="function", autouse=True) @@ -28,7 +30,7 @@ def use_v0_only(monkeypatch): Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the module. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") def vllm_to_hf_output( @@ -61,7 +63,7 @@ def clear_cache(): @pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.skipif( current_platform.is_cpu(), - reason="CPU backend is not currently supported with encoder/decoder models" + reason="CPU backend is not currently supported with encoder/decoder models", ) def test_encoder_decoder_e2e( hf_runner, @@ -75,19 +77,18 @@ def test_encoder_decoder_e2e( enforce_eager: bool, attn_backend: _Backend, ) -> None: - ''' + """ End-to-End (E2E) test for the encoder-decoder framework. 
This test evaluates the encoder-decoder functionality using the BART model. We compare the outputs of the Hugging Face and vLLM implementations to ensure that both implementations produce consistent and correct results. - ''' + """ with global_force_attn_backend_context_manager(attn_backend): if attn_backend == _Backend.FLASH_ATTN: # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' - test_case_prompts = example_encoder_decoder_prompts[ - decoder_prompt_type] + dtype = "bfloat16" + test_case_prompts = example_encoder_decoder_prompts[decoder_prompt_type] # Configuration settings for HF baseline hf_kwargs = { @@ -98,25 +99,22 @@ def test_encoder_decoder_e2e( "length_penalty": 1.0, "early_stopping": False, "no_repeat_ngram_size": None, - "min_length": 0 + "min_length": 0, } - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = ( - hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - with vllm_runner(model, dtype=dtype, - enforce_eager=enforce_eager) as vllm_model: + with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit( + test_case_prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + ) + with vllm_runner(model, dtype=dtype, enforce_eager=enforce_eager) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) + test_case_prompts, max_tokens, num_logprobs + ) - hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE - else 0) + hf_skip_tokens = 1 if decoder_prompt_type == DecoderPromptType.NONE else 0 check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py index 375b248ebeda..a6a8b33e19d3 100644 --- a/tests/engine/conftest.py +++ b/tests/engine/conftest.py @@ -9,4 +9,4 @@ def use_v0_only(monkeypatch): Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the module. 
""" - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 5a91758414a5..90028efb2d81 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -10,22 +10,30 @@ import pytest from vllm.config import CompilationConfig, config -from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs, - get_type, get_type_hints, is_not_builtin, - is_type, literal_to_kwargs, optional_type, - parse_type) +from vllm.engine.arg_utils import ( + EngineArgs, + contains_type, + get_kwargs, + get_type, + get_type_hints, + is_not_builtin, + is_type, + literal_to_kwargs, + optional_type, + parse_type, +) from vllm.utils import FlexibleArgumentParser -@pytest.mark.parametrize(("type", "value", "expected"), [ - (int, "42", 42), - (float, "3.14", 3.14), - (str, "Hello World!", "Hello World!"), - (json.loads, '{"foo":1,"bar":2}', { - "foo": 1, - "bar": 2 - }), -]) +@pytest.mark.parametrize( + ("type", "value", "expected"), + [ + (int, "42", 42), + (float, "3.14", 3.14), + (str, "Hello World!", "Hello World!"), + (json.loads, '{"foo":1,"bar":2}', {"foo": 1, "bar": 2}), + ], +) def test_parse_type(type, value, expected): parse_type_func = parse_type(type) assert parse_type_func(value) == expected @@ -37,43 +45,52 @@ def test_optional_type(): assert optional_type_func("42") == 42 -@pytest.mark.parametrize(("type_hint", "type", "expected"), [ - (int, int, True), - (int, float, False), - (list[int], list, True), - (list[int], tuple, False), - (Literal[0, 1], Literal, True), -]) +@pytest.mark.parametrize( + ("type_hint", "type", "expected"), + [ + (int, int, True), + (int, float, False), + (list[int], list, True), + (list[int], tuple, False), + (Literal[0, 1], Literal, True), + ], +) def test_is_type(type_hint, type, expected): assert is_type(type_hint, type) == expected -@pytest.mark.parametrize(("type_hints", "type", "expected"), [ - ({float, int}, int, True), - ({int, tuple[int]}, int, True), - ({int, tuple[int]}, float, False), - ({str, Literal["x", "y"]}, Literal, True), -]) +@pytest.mark.parametrize( + ("type_hints", "type", "expected"), + [ + ({float, int}, int, True), + ({int, tuple[int]}, int, True), + ({int, tuple[int]}, float, False), + ({str, Literal["x", "y"]}, Literal, True), + ], +) def test_contains_type(type_hints, type, expected): assert contains_type(type_hints, type) == expected -@pytest.mark.parametrize(("type_hints", "type", "expected"), [ - ({int, float}, int, int), - ({int, float}, str, None), - ({str, Literal["x", "y"]}, Literal, Literal["x", "y"]), -]) +@pytest.mark.parametrize( + ("type_hints", "type", "expected"), + [ + ({int, float}, int, int), + ({int, float}, str, None), + ({str, Literal["x", "y"]}, Literal, Literal["x", "y"]), + ], +) def test_get_type(type_hints, type, expected): assert get_type(type_hints, type) == expected -@pytest.mark.parametrize(("type_hints", "expected"), [ - ({Literal[1, 2]}, { - "type": int, - "choices": [1, 2] - }), - ({Literal[1, "a"]}, Exception), -]) +@pytest.mark.parametrize( + ("type_hints", "expected"), + [ + ({Literal[1, 2]}, {"type": int, "choices": [1, 2]}), + ({Literal[1, "a"]}, Exception), + ], +) def test_literal_to_kwargs(type_hints, expected): context = nullcontext() if expected is Exception: @@ -144,22 +161,27 @@ class DummyConfig: """Different config with from_cli method""" -@pytest.mark.parametrize(("type_hint", "expected"), [ - (int, False), - (DummyConfig, True), -]) +@pytest.mark.parametrize( + 
("type_hint", "expected"), + [ + (int, False), + (DummyConfig, True), + ], +) def test_is_not_builtin(type_hint, expected): assert is_not_builtin(type_hint) == expected @pytest.mark.parametrize( - ("type_hint", "expected"), [ + ("type_hint", "expected"), + [ (Annotated[int, "annotation"], {int}), (Optional[int], {int, type(None)}), (Annotated[Optional[int], "annotation"], {int, type(None)}), (Optional[Annotated[int, "annotation"]], {int, type(None)}), ], - ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"]) + ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"], +) def test_get_type_hints(type_hint, expected): assert get_type_hints(type_hint) == expected @@ -199,24 +221,16 @@ def test_get_kwargs(): ("arg", "expected"), [ (None, dict()), - ('{"video": {"num_frames": 123} }', { - "video": { - "num_frames": 123 - } - }), + ('{"video": {"num_frames": 123} }', {"video": {"num_frames": 123}}), ( '{"video": {"num_frames": 123, "fps": 1.0, "foo": "bar"}, "image": {"foo": "bar"} }', # noqa { - "video": { - "num_frames": 123, - "fps": 1.0, - "foo": "bar" - }, - "image": { - "foo": "bar" - } - }), - ]) + "video": {"num_frames": 123, "fps": 1.0, "foo": "bar"}, + "image": {"foo": "bar"}, + }, + ), + ], +) def test_media_io_kwargs_parser(arg, expected): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) if arg is None: @@ -251,24 +265,32 @@ def test_compilation_config(): assert args.compilation_config.level == 3 # set to string form of a dict - args = parser.parse_args([ - "-O", - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' - '"use_inductor": false}', - ]) - assert (args.compilation_config.level == 3 and - args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] - and not args.compilation_config.use_inductor) + args = parser.parse_args( + [ + "-O", + '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '"use_inductor": false}', + ] + ) + assert ( + args.compilation_config.level == 3 + and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] + and not args.compilation_config.use_inductor + ) # set to string form of a dict - args = parser.parse_args([ - "--compilation-config=" - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' - '"use_inductor": true}', - ]) - assert (args.compilation_config.level == 3 and - args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] - and args.compilation_config.use_inductor) + args = parser.parse_args( + [ + "--compilation-config=" + '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '"use_inductor": true}', + ] + ) + assert ( + args.compilation_config.level == 3 + and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] + and args.compilation_config.use_inductor + ) def test_prefix_cache_default(): @@ -276,8 +298,7 @@ def test_prefix_cache_default(): args = parser.parse_args([]) engine_args = EngineArgs.from_cli_args(args=args) - assert (not engine_args.enable_prefix_caching - ), "prefix caching defaults to off." + assert not engine_args.enable_prefix_caching, "prefix caching defaults to off." # with flag to turn it on. args = parser.parse_args(["--enable-prefix-caching"]) diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index ac5a1f957dfe..8a05351e879f 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -18,15 +18,17 @@ def test_computed_prefix_blocks(model: str, block_size: int): prompt = ( "You are a helpful assistant. 
How do I build a car from cardboard and " "paper clips? Is there an easy to follow video tutorial available " - "online for free?") + "online for free?" + ) prompt2 = ( " Please recommend to me some resources where I can learn not only to " "handle technical difficulties of building a car, but also " - "decoration.") + "decoration." + ) - engine_args = EngineArgs(model=model, - block_size=block_size, - enable_prefix_caching=True) + engine_args = EngineArgs( + model=model, block_size=block_size, enable_prefix_caching=True + ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams() diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py index 15c7a97b50e1..bc6994c5f041 100644 --- a/tests/engine/test_executor.py +++ b/tests/engine/test_executor.py @@ -14,17 +14,17 @@ from vllm.sampling_params import SamplingParams -class Mock: - ... +class Mock: ... class CustomUniExecutor(UniProcExecutor): - - def collective_rpc(self, - method: Union[str, Callable], - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None) -> list[Any]: + def collective_rpc( + self, + method: Union[str, Callable], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict] = None, + ) -> list[Any]: # Drop marker to show that this was ran with open(".marker", "w"): ... @@ -37,12 +37,10 @@ def collective_rpc(self, @pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) def test_custom_executor_type_checking(model): with pytest.raises(ValueError): - engine_args = EngineArgs(model=model, - distributed_executor_backend=Mock) + engine_args = EngineArgs(model=model, distributed_executor_backend=Mock) LLMEngine.from_engine_args(engine_args) with pytest.raises(ValueError): - engine_args = AsyncEngineArgs(model=model, - distributed_executor_backend=Mock) + engine_args = AsyncEngineArgs(model=model, distributed_executor_backend=Mock) AsyncLLMEngine.from_engine_args(engine_args) diff --git a/tests/engine/test_multi_step_output_processor.py b/tests/engine/test_multi_step_output_processor.py index 458f4deb743a..9935eeea8b29 100644 --- a/tests/engine/test_multi_step_output_processor.py +++ b/tests/engine/test_multi_step_output_processor.py @@ -11,8 +11,12 @@ from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor from vllm.engine.output_processor.stop_checker import StopChecker from vllm.sampling_params import SamplingParams -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceOutput, SequenceStatus) +from vllm.sequence import ( + CompletionSequenceGroupOutput, + Logprob, + SequenceOutput, + SequenceStatus, +) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.utils import Counter @@ -44,9 +48,9 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): seq_group = create_seq_group( seq_prompt_len=1024, seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=seq_output_len + - num_new_tokens, - ignore_eos=True), + sampling_params=SamplingParams( + max_tokens=seq_output_len + num_new_tokens, ignore_eos=True + ), ) seq = seq_group.get_seqs()[0] @@ -64,12 +68,13 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): ) ], prompt_logprobs=None, - ) for output_token in new_token_ids + ) + for output_token in new_token_ids ] - assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids + assert seq.get_token_ids()[-len(new_token_ids) :] != new_token_ids output_processor.process_outputs(seq_group, outputs) - assert 
seq.get_token_ids()[-len(new_token_ids):] == new_token_ids + assert seq.get_token_ids()[-len(new_token_ids) :] == new_token_ids @pytest.mark.parametrize("seq_prompt_len", [1024]) @@ -77,8 +82,9 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): @pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8]) @pytest.mark.parametrize("max_tokens", [128 + 3]) @pytest.mark.skip_global_cleanup -def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, max_tokens: int): +def test_respects_max_tokens( + num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, max_tokens: int +): """Verify tokens after max_tokens are dropped and not appended to the sequence. """ @@ -98,7 +104,9 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_group = create_seq_group( seq_prompt_len=seq_prompt_len, seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=max_tokens, ), + sampling_params=SamplingParams( + max_tokens=max_tokens, + ), ) seq = seq_group.get_seqs()[0] @@ -116,7 +124,8 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, ) ], prompt_logprobs=None, - ) for output_token in new_token_ids + ) + for output_token in new_token_ids ] assert seq.get_len() == seq_prompt_len + seq_output_len @@ -126,9 +135,11 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, assert seq.get_len() == seq_prompt_len + max_tokens # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens + expected_appended_tokens = new_token_ids[: max_tokens - seq_output_len] + assert ( + seq.get_token_ids()[-len(expected_appended_tokens) :] + == expected_appended_tokens + ) @pytest.mark.parametrize("seq_prompt_len", [1024]) @@ -136,8 +147,9 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, @pytest.mark.parametrize("num_new_tokens", [12]) @pytest.mark.parametrize("seed", list(range(6))) @pytest.mark.skip_global_cleanup -def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): +def test_respects_eos_token_id( + num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int +): """Verify the eos token id is included in the sequence, but subsequent tokens are dropped (not appended to sequence). """ @@ -162,7 +174,8 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_lens=[seq_output_len], sampling_params=SamplingParams( # Ensure enough space. - max_tokens=seq_output_len + num_new_tokens, ), + max_tokens=seq_output_len + num_new_tokens, + ), ) seq = seq_group.get_seqs()[0] @@ -183,7 +196,8 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, ) ], prompt_logprobs=None, - ) for output_token in new_token_ids + ) + for output_token in new_token_ids ] assert seq.get_len() == seq_prompt_len + seq_output_len @@ -193,9 +207,11 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1) # Expect the correct tokens were appended. 
- expected_appended_tokens = new_token_ids[:eos_index + 1] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens + expected_appended_tokens = new_token_ids[: eos_index + 1] + assert ( + seq.get_token_ids()[-len(expected_appended_tokens) :] + == expected_appended_tokens + ) @pytest.mark.parametrize("seq_prompt_len", [1024]) @@ -203,8 +219,9 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, @pytest.mark.parametrize("num_new_tokens", [12]) @pytest.mark.parametrize("seed", list(range(6))) @pytest.mark.skip_global_cleanup -def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): +def test_ignores_eos_token_id( + num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int +): """When sampling parameters dictate that we should ignore the eos token id, ensure all token ids are appended even if the eos token id is emitted. """ @@ -252,7 +269,8 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, ) ], prompt_logprobs=None, - ) for output_token in new_token_ids + ) + for output_token in new_token_ids ] assert seq.get_len() == seq_prompt_len + seq_output_len @@ -262,10 +280,13 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens + expected_appended_tokens = new_token_ids[ + : seq_output_len + num_new_tokens - seq_output_len + ] + assert ( + seq.get_token_ids()[-len(expected_appended_tokens) :] + == expected_appended_tokens + ) def mock_tokenizer(eos_token_id=1000): diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index b5381b61a020..3ca19da99ccd 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -10,8 +10,11 @@ import pytest from vllm.config import VllmConfig -from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, - ResultHandler, WorkerMonitor) +from vllm.executor.multiproc_worker_utils import ( + ProcessWorkerWrapper, + ResultHandler, + WorkerMonitor, +) from vllm.worker.worker_base import WorkerWrapperBase @@ -32,8 +35,8 @@ def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]: result_handler = ResultHandler() vllm_config = VllmConfig() workers = [ - ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config, - rank) for rank in range(8) + ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config, rank) + for rank in range(8) ] worker_monitor = WorkerMonitor(workers, result_handler) @@ -53,8 +56,7 @@ def test_local_workers() -> None: def execute_workers(worker_input: str) -> None: worker_outputs = [ - worker.execute_method("worker_method", worker_input) - for worker in workers + worker.execute_method("worker_method", worker_input) for worker in workers ] for rank, output in enumerate(worker_outputs): @@ -152,8 +154,7 @@ async def execute_workers(worker_input: str) -> None: # Test error case exception = ValueError("fake error") try: - _result = await workers[0].execute_method_async( - "worker_method", exception) + _result = await workers[0].execute_method_async("worker_method", exception) pytest.fail("task should have failed") except Exception as e: assert isinstance(e, ValueError) @@ 
-172,8 +173,7 @@ async def execute_workers(worker_input: str) -> None: # Further attempts to submit tasks should fail try: - _result = await workers[0].execute_method_async( - "worker_method", "test") + _result = await workers[0].execute_method_async("worker_method", "test") pytest.fail("task should fail once workers have been shut down") except Exception as e: assert isinstance(e, ChildProcessError) diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py index 42e88e84770a..b15bf1f4dcb0 100644 --- a/tests/engine/test_options.py +++ b/tests/engine/test_options.py @@ -23,8 +23,9 @@ def test_skip_tokenizer_initialization(model: str): with pytest.raises(ValueError, match="cannot pass text prompts when"): llm.generate("abc", sampling_params) - outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, - sampling_params=sampling_params) + outputs = llm.generate( + {"prompt_token_ids": [1, 2, 3]}, sampling_params=sampling_params + ) assert len(outputs) > 0 completions = outputs[0].outputs assert len(completions) > 0 @@ -34,8 +35,7 @@ def test_skip_tokenizer_initialization(model: str): @pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) -def test_enable_prompt_embeds(hf_runner, model: str, - enable_prompt_embeds: bool): +def test_enable_prompt_embeds(hf_runner, model: str, enable_prompt_embeds: bool): prompt = "abc" with hf_runner(model) as hf_model: @@ -45,8 +45,11 @@ def test_enable_prompt_embeds(hf_runner, model: str, embed_layer = hf_model.model.get_input_embeddings() prompt_embeds = embed_layer(token_ids).squeeze(0) - ctx = (nullcontext() if enable_prompt_embeds else pytest.raises( - ValueError, match="set `--enable-prompt-embeds`")) + ctx = ( + nullcontext() + if enable_prompt_embeds + else pytest.raises(ValueError, match="set `--enable-prompt-embeds`") + ) llm = LLM( model=model, diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index 9c62761d78af..f63c0cc596e4 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -5,12 +5,12 @@ from ..conftest import IMAGE_ASSETS -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts( + { + "stop_sign": "USER: \nWhat's the content of the image?\nASSISTANT:", + "cherry_blossom": "USER: \nWhat is the season?\nASSISTANT:", + } +) models = ["llava-hf/llava-1.5-7b-hf"] @@ -19,8 +19,7 @@ def test_context_length_too_short(vllm_runner, image_assets, model): images = [asset.pil_image for asset in image_assets] - with pytest.raises(ValueError, - match="longer than the maximum model length"): + with pytest.raises(ValueError, match="longer than the maximum model length"): vllm_model = vllm_runner( model, max_model_len=128, # LLaVA has a feature size of 576 @@ -28,6 +27,6 @@ def test_context_length_too_short(vllm_runner, image_assets, model): ) with vllm_model: - vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], - max_tokens=1, - images=[images[0]]) + vllm_model.generate_greedy( + [HF_IMAGE_PROMPTS[0]], max_tokens=1, images=[images[0]] + ) diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index a7c533ec2419..bea264cc8fb5 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -26,8 +26,10 @@ def sample_token_ids(): @pytest.fixture def sample_regex(): - return 
(r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + return ( + r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" + ) @pytest.fixture @@ -35,40 +37,27 @@ def sample_json_schema(): return { "type": "object", "properties": { - "name": { - "type": "string" - }, - "age": { - "type": "integer" - }, + "name": {"type": "string"}, + "age": {"type": "integer"}, "skills": { "type": "array", - "items": { - "type": "string", - "maxLength": 10 - }, - "minItems": 3 + "items": {"type": "string", "maxLength": 10}, + "minItems": 3, }, "work_history": { "type": "array", "items": { "type": "object", "properties": { - "company": { - "type": "string" - }, - "duration": { - "type": "number" - }, - "position": { - "type": "string" - } + "company": {"type": "string"}, + "duration": {"type": "number"}, + "position": {"type": "string"}, }, - "required": ["company", "position"] - } - } + "required": ["company", "position"], + }, + }, }, - "required": ["name", "age", "skills", "work_history"] + "required": ["name", "age", "skills", "work_history"], } @@ -80,65 +69,53 @@ def sample_complex_json_schema(): "score": { "type": "integer", "minimum": 0, - "maximum": 100 # Numeric range + "maximum": 100, # Numeric range }, "grade": { "type": "string", - "pattern": "^[A-D]$" # Regex pattern + "pattern": "^[A-D]$", # Regex pattern }, "email": { "type": "string", - "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", }, "tags": { "type": "array", "items": { "type": "string", - "pattern": - "^[a-z]{1,10}$" # Combining length and pattern restrictions - } - } + "pattern": "^[a-z]{1,10}$", # Combining length and pattern restrictions + }, + }, }, - "required": ["score", "grade", "email", "tags"] + "required": ["score", "grade", "email", "tags"], } @pytest.fixture def sample_definition_json_schema(): return { - '$defs': { - 'Step': { - 'properties': { - 'explanation': { - 'title': 'Explanation', - 'type': 'string' - }, - 'output': { - 'title': 'Output', - 'type': 'string' - } + "$defs": { + "Step": { + "properties": { + "explanation": {"title": "Explanation", "type": "string"}, + "output": {"title": "Output", "type": "string"}, }, - 'required': ['explanation', 'output'], - 'title': 'Step', - 'type': 'object' + "required": ["explanation", "output"], + "title": "Step", + "type": "object", } }, - 'properties': { - 'steps': { - 'items': { - '$ref': '#/$defs/Step' - }, - 'title': 'Steps', - 'type': 'array' + "properties": { + "steps": { + "items": {"$ref": "#/$defs/Step"}, + "title": "Steps", + "type": "array", }, - 'final_answer': { - 'title': 'Final Answer', - 'type': 'string' - } + "final_answer": {"title": "Final Answer", "type": "string"}, }, - 'required': ['steps', 'final_answer'], - 'title': 'MathReasoning', - 'type': 'object' + "required": ["steps", "final_answer"], + "title": "MathReasoning", + "type": "object", } @@ -149,55 +126,61 @@ def sample_enum_json_schema(): "properties": { "status": { "type": "string", - "enum": ["active", "inactive", - "pending"] # Literal values using enum + "enum": ["active", "inactive", "pending"], # Literal values using enum }, "priority": { "type": "string", - "enum": ["low", "medium", "high", "critical"] + "enum": ["low", "medium", "high", "critical"], }, "category": { "type": "object", "properties": { "type": { "type": "string", - "enum": ["bug", "feature", "improvement"] + "enum": ["bug", "feature", "improvement"], }, "severity": { "type": "integer", - "enum": 
[1, 2, 3, 4, - 5] # Enum can also contain numbers - } + "enum": [1, 2, 3, 4, 5], # Enum can also contain numbers + }, }, - "required": ["type", "severity"] + "required": ["type", "severity"], }, "flags": { "type": "array", "items": { "type": "string", - "enum": ["urgent", "blocked", "needs_review", "approved"] - } - } + "enum": ["urgent", "blocked", "needs_review", "approved"], + }, + }, }, - "required": ["status", "priority", "category", "flags"] + "required": ["status", "priority", "category", "flags"], } @pytest.fixture def sample_guided_choice(): return [ - "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", - "Ruby", "Swift", "Kotlin" + "Python", + "Java", + "JavaScript", + "C++", + "C#", + "PHP", + "TypeScript", + "Ruby", + "Swift", + "Kotlin", ] @pytest.fixture def sample_sql_statements(): - return (""" + return """ start: select_statement select_statement: "SELECT" column "from" table "where" condition column: "col_1" | "col_2" table: "table_1" | "table_2" condition: column "=" number number: "1" | "2" -""") +""" diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 30a666d4c39c..3b416eef4d73 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -45,20 +45,23 @@ def run_test(model_name, more_args=None): measured_value = results["results"][TASK][FILTER] assert model_name in EXPECTED_VALUES, ( - f"Cannot find the expected value for the model {model_name=}") + f"Cannot find the expected value for the model {model_name=}" + ) expected_value = EXPECTED_VALUES[model_name] - assert (measured_value - RTOL < expected_value - and measured_value + RTOL > expected_value - ), f"Expected: {expected_value} | Measured: {measured_value}" + assert ( + measured_value - RTOL < expected_value + and measured_value + RTOL > expected_value + ), f"Expected: {expected_value} | Measured: {measured_value}" # TODO: [AlexM] Fix it with new CI/CD tests -TPU_TP_TEST_STR = "" #"tensor_parallel_size=4" +TPU_TP_TEST_STR = "" # "tensor_parallel_size=4" -@pytest.mark.skipif(not current_platform.is_cuda() - and not current_platform.is_tpu(), - reason="V1 is currently only supported on CUDA and TPU") +@pytest.mark.skipif( + not current_platform.is_cuda() and not current_platform.is_tpu(), + reason="V1 is currently only supported on CUDA and TPU", +) @pytest.mark.parametrize("model", MODEL_NAMES) def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 97cf3b5ce8fc..ce641a45fa32 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -14,9 +14,7 @@ def text_llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - seed=0) + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, seed=0) with llm.deprecate_legacy_api(): yield weakref.proxy(llm) @@ -29,14 +27,8 @@ def text_llm(): def test_chat(text_llm): prompt1 = "Explain the concept of entropy." 
messages = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": prompt1}, ] outputs = text_llm.chat(messages) assert len(outputs) == 1 @@ -47,25 +39,13 @@ def test_multi_chat(text_llm): prompt2 = "Explain what among us is." conversation1 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": prompt1}, ] conversation2 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt2 - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": prompt2}, ] messages = [conversation1, conversation2] @@ -96,25 +76,20 @@ def vision_llm(): cleanup_dist_env_and_memory() -@pytest.mark.parametrize("image_urls", - [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) +@pytest.mark.parametrize("image_urls", [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) def test_chat_multi_image(vision_llm, image_urls: list[str]): - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "image_url", - "image_url": { - "url": image_url - } - } for image_url in image_urls), - { - "type": "text", - "text": "What's in this image?" - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *( + {"type": "image_url", "image_url": {"url": image_url}} + for image_url in image_urls + ), + {"type": "text", "text": "What's in this image?"}, + ], + } + ] outputs = vision_llm.chat(messages) assert len(outputs) >= 0 @@ -125,14 +100,8 @@ def test_llm_chat_tokenization_no_double_bos(text_llm): Check we get a single BOS token for llama chat. """ messages = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello!" - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello!"}, ] outputs = text_llm.chat(messages) assert len(outputs) == 1 @@ -169,14 +138,8 @@ def thinking_llm(): @pytest.mark.parametrize("enable_thinking", [True, False]) def test_chat_extra_kwargs(thinking_llm, enable_thinking): messages = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "What is 1+1?" 
- }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "What is 1+1?"}, ] outputs = thinking_llm.chat( diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 3a13f8c979f2..937aa5c13246 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -23,9 +23,11 @@ def echo_rank(self): return self.rank monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - load_format="dummy", - tensor_parallel_size=tp_size, - distributed_executor_backend=backend) + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + load_format="dummy", + tensor_parallel_size=tp_size, + distributed_executor_backend=backend, + ) assert llm.collective_rpc(echo_rank) == list(range(tp_size)) diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index b930f05bebd0..163da19d1465 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -41,12 +41,14 @@ def v1(run_with_both_engines): def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection - llm = LLM(model=MODEL_NAME, - max_num_batched_tokens=32768, - tensor_parallel_size=1, - gpu_memory_utilization=0.75, - enforce_eager=True, - seed=0) + llm = LLM( + model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True, + seed=0, + ) with llm.deprecate_legacy_api(): yield weakref.proxy(llm) @@ -56,8 +58,9 @@ def llm(): cleanup_dist_env_and_memory() -def assert_outputs_match(o1: list[PoolingRequestOutput], - o2: list[PoolingRequestOutput]): +def assert_outputs_match( + o1: list[PoolingRequestOutput], o2: list[PoolingRequestOutput] +): check_embeddings_close( embeddings_0_lst=[o.outputs.data for o in o1], embeddings_1_lst=[o.outputs.data for o in o2], @@ -68,17 +71,18 @@ def assert_outputs_match(o1: list[PoolingRequestOutput], @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) -def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, - prompt_token_ids): +@pytest.mark.parametrize("prompt_token_ids", TOKEN_IDS) +def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, prompt_token_ids): pooling_params = PoolingParams() with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): - v1_output = llm.encode(prompt_token_ids=prompt_token_ids, - pooling_params=pooling_params) + v1_output = llm.encode( + prompt_token_ids=prompt_token_ids, pooling_params=pooling_params + ) - v2_output = llm.encode({"prompt_token_ids": prompt_token_ids}, - pooling_params=pooling_params) + v2_output = llm.encode( + {"prompt_token_ids": prompt_token_ids}, pooling_params=pooling_params + ) assert_outputs_match(v1_output, v2_output) @@ -87,13 +91,12 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): pooling_params = PoolingParams() with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): - v1_output = llm.encode(prompt_token_ids=TOKEN_IDS, - pooling_params=pooling_params) + v1_output = llm.encode( + prompt_token_ids=TOKEN_IDS, pooling_params=pooling_params + ) v2_output = llm.encode( - [{ - "prompt_token_ids": p - } for p in TOKEN_IDS], + [{"prompt_token_ids": p} for p in TOKEN_IDS], pooling_params=pooling_params, ) assert_outputs_match(v1_output, v2_output) diff --git a/tests/entrypoints/llm/test_generate.py 
b/tests/entrypoints/llm/test_generate.py index 707891f6bdd8..bde2d7c31c09 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -35,11 +35,13 @@ def v1(run_with_both_engines): def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection - llm = LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True) + llm = LLM( + model=MODEL_NAME, + max_num_batched_tokens=4096, + tensor_parallel_size=1, + gpu_memory_utilization=0.10, + enforce_eager=True, + ) with llm.deprecate_legacy_api(): yield weakref.proxy(llm) @@ -54,17 +56,18 @@ def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) -def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, - prompt_token_ids): +@pytest.mark.parametrize("prompt_token_ids", TOKEN_IDS) +def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, prompt_token_ids): sampling_params = SamplingParams(temperature=0.0, top_p=1.0) with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): - v1_output = llm.generate(prompt_token_ids=prompt_token_ids, - sampling_params=sampling_params) + v1_output = llm.generate( + prompt_token_ids=prompt_token_ids, sampling_params=sampling_params + ) - v2_output = llm.generate({"prompt_token_ids": prompt_token_ids}, - sampling_params=sampling_params) + v2_output = llm.generate( + {"prompt_token_ids": prompt_token_ids}, sampling_params=sampling_params + ) assert_outputs_equal(v1_output, v2_output) @@ -73,13 +76,12 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): sampling_params = SamplingParams(temperature=0.0, top_p=1.0) with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): - v1_output = llm.generate(prompt_token_ids=TOKEN_IDS, - sampling_params=sampling_params) + v1_output = llm.generate( + prompt_token_ids=TOKEN_IDS, sampling_params=sampling_params + ) v2_output = llm.generate( - [{ - "prompt_token_ids": p - } for p in TOKEN_IDS], + [{"prompt_token_ids": p} for p in TOKEN_IDS], sampling_params=sampling_params, ) assert_outputs_equal(v1_output, v2_output) @@ -124,7 +126,8 @@ def test_max_model_len(): outputs = llm.generate(PROMPTS, sampling_params) for output in outputs: num_total_tokens = len(output.prompt_token_ids) + len( - output.outputs[0].token_ids) + output.outputs[0].token_ids + ) # Total tokens must not exceed max_model_len. # It can be less if generation finishes due to other reasons (e.g., EOS) # before reaching the absolute model length limit. 
diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index b7d53e31fd71..a99d6ea01f76 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -4,6 +4,7 @@ import weakref import pytest + # downloading lora to test lora requests from huggingface_hub import snapshot_download @@ -26,6 +27,7 @@ @pytest.fixture(scope="module") def monkeypatch_module(): from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() yield mpatch mpatch.undo() @@ -33,20 +35,21 @@ def monkeypatch_module(): @pytest.fixture(scope="module", params=[False, True]) def llm(request, monkeypatch_module): - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0") # pytest caches the fixture so we use weakref.proxy to # enable garbage collection - llm = LLM(model=MODEL_NAME, - tensor_parallel_size=1, - max_model_len=8192, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - max_num_seqs=128, - enforce_eager=True) + llm = LLM( + model=MODEL_NAME, + tensor_parallel_size=1, + max_model_len=8192, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + max_num_seqs=128, + enforce_eager=True, + ) with llm.deprecate_legacy_api(): yield weakref.proxy(llm) diff --git a/tests/entrypoints/llm/test_gpu_utilization.py b/tests/entrypoints/llm/test_gpu_utilization.py index 533da9e6d6ea..896091533ad2 100644 --- a/tests/entrypoints/llm/test_gpu_utilization.py +++ b/tests/entrypoints/llm/test_gpu_utilization.py @@ -16,9 +16,8 @@ def test_gpu_memory_utilization(): # makes sure gpu_memory_utilization is per-instance limit, # not a global limit llms = [ - LLM(model="facebook/opt-125m", - gpu_memory_utilization=0.3, - enforce_eager=True) for i in range(3) + LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3, enforce_eager=True) + for i in range(3) ] for llm in llms: outputs = llm.generate(prompts, sampling_params) diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 55578341cb2e..1797a7beaa5e 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -26,7 +26,7 @@ ("guidance", True), ] -ALL_DECODING_BACKENDS = ([("outlines", False)] + GRAMMAR_DECODING_BACKENDS) +ALL_DECODING_BACKENDS = [("outlines", False)] + GRAMMAR_DECODING_BACKENDS @pytest.fixture(scope="module") @@ -42,23 +42,27 @@ def llm(): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_regex(sample_regex, llm, guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", ALL_DECODING_BACKENDS +) +def test_guided_regex( + sample_regex, llm, guided_decoding_backend: str, disable_any_whitespace: bool +): sampling_params = SamplingParams( temperature=0.8, top_p=0.95, guided_decoding=GuidedDecodingParams( regex=sample_regex, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) + disable_any_whitespace=disable_any_whitespace, + ), + ) - outputs = llm.generate(prompts=[ - f"Give an example IPv4 address with this regex: {sample_regex}" - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) + outputs = llm.generate( + prompts=[f"Give an example IPv4 address with this regex: {sample_regex}"] * 2, + 
sampling_params=sampling_params, + use_tqdm=True, + ) assert outputs is not None for output in outputs: @@ -73,24 +77,30 @@ def test_guided_regex(sample_regex, llm, guided_decoding_backend: str, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_json_completion(sample_json_schema, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", ALL_DECODING_BACKENDS +) +def test_guided_json_completion( + sample_json_schema, llm, guided_decoding_backend: str, disable_any_whitespace: bool +): sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, guided_decoding=GuidedDecodingParams( json=sample_json_schema, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate(prompts=[ - f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}" - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) + disable_any_whitespace=disable_any_whitespace, + ), + ) + outputs = llm.generate( + prompts=[ + f"Give an example JSON for an employee profile " + f"that fits this schema: {sample_json_schema}" + ] + * 2, + sampling_params=sampling_params, + use_tqdm=True, + ) assert outputs is not None @@ -107,24 +117,33 @@ def test_guided_json_completion(sample_json_schema, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_complex_json_completion(sample_complex_json_schema, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", ALL_DECODING_BACKENDS +) +def test_guided_complex_json_completion( + sample_complex_json_schema, + llm, + guided_decoding_backend: str, + disable_any_whitespace: bool, +): sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, guided_decoding=GuidedDecodingParams( json=sample_complex_json_schema, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate(prompts=[ - f"Give an example JSON for an assignment grade " - f"that fits this schema: {sample_complex_json_schema}" - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) + disable_any_whitespace=disable_any_whitespace, + ), + ) + outputs = llm.generate( + prompts=[ + f"Give an example JSON for an assignment grade " + f"that fits this schema: {sample_complex_json_schema}" + ] + * 2, + sampling_params=sampling_params, + use_tqdm=True, + ) assert outputs is not None @@ -137,29 +156,37 @@ def test_guided_complex_json_completion(sample_complex_json_schema, llm, assert generated_text is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, - schema=sample_complex_json_schema) + jsonschema.validate(instance=output_json, schema=sample_complex_json_schema) @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_definition_json_completion(sample_definition_json_schema, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", ALL_DECODING_BACKENDS +) +def test_guided_definition_json_completion( + 
sample_definition_json_schema, + llm, + guided_decoding_backend: str, + disable_any_whitespace: bool, +): sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, guided_decoding=GuidedDecodingParams( json=sample_definition_json_schema, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate(prompts=[ - f"Give an example JSON for solving 8x + 7 = -23 " - f"that fits this schema: {sample_definition_json_schema}" - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) + disable_any_whitespace=disable_any_whitespace, + ), + ) + outputs = llm.generate( + prompts=[ + f"Give an example JSON for solving 8x + 7 = -23 " + f"that fits this schema: {sample_definition_json_schema}" + ] + * 2, + sampling_params=sampling_params, + use_tqdm=True, + ) assert outputs is not None @@ -172,29 +199,37 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm, assert generated_text is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, - schema=sample_definition_json_schema) + jsonschema.validate(instance=output_json, schema=sample_definition_json_schema) @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_enum_json_completion(sample_enum_json_schema, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", ALL_DECODING_BACKENDS +) +def test_guided_enum_json_completion( + sample_enum_json_schema, + llm, + guided_decoding_backend: str, + disable_any_whitespace: bool, +): sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, guided_decoding=GuidedDecodingParams( json=sample_enum_json_schema, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate(prompts=[ - "Create a bug report JSON that fits this schema: " - f"{sample_enum_json_schema}. Make it for a high priority critical bug." - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) + disable_any_whitespace=disable_any_whitespace, + ), + ) + outputs = llm.generate( + prompts=[ + "Create a bug report JSON that fits this schema: " + f"{sample_enum_json_schema}. Make it for a high priority critical bug." 
+ ] + * 2, + sampling_params=sampling_params, + use_tqdm=True, + ) assert outputs is not None @@ -207,37 +242,41 @@ def test_guided_enum_json_completion(sample_enum_json_schema, llm, assert generated_text is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, - schema=sample_enum_json_schema) + jsonschema.validate(instance=output_json, schema=sample_enum_json_schema) # Additional assertions to verify enum values assert output_json["status"] in ["active", "inactive", "pending"] assert output_json["priority"] in ["low", "medium", "high", "critical"] - assert output_json["category"]["type"] in [ - "bug", "feature", "improvement" - ] + assert output_json["category"]["type"] in ["bug", "feature", "improvement"] assert output_json["category"]["severity"] in [1, 2, 3, 4, 5] for flag in output_json["flags"]: assert flag in ["urgent", "blocked", "needs_review", "approved"] @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_choice_completion(sample_guided_choice, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", ALL_DECODING_BACKENDS +) +def test_guided_choice_completion( + sample_guided_choice, + llm, + guided_decoding_backend: str, + disable_any_whitespace: bool, +): sampling_params = SamplingParams( temperature=0.8, top_p=0.95, guided_decoding=GuidedDecodingParams( choice=sample_guided_choice, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) + disable_any_whitespace=disable_any_whitespace, + ), + ) outputs = llm.generate( prompts="The best language for type-safe systems programming is ", sampling_params=sampling_params, - use_tqdm=True) + use_tqdm=True, + ) assert outputs is not None for output in outputs: @@ -252,11 +291,15 @@ def test_guided_choice_completion(sample_guided_choice, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - GRAMMAR_DECODING_BACKENDS) -def test_guided_grammar(sample_sql_statements, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", GRAMMAR_DECODING_BACKENDS +) +def test_guided_grammar( + sample_sql_statements, + llm, + guided_decoding_backend: str, + disable_any_whitespace: bool, +): sampling_params = SamplingParams( temperature=0.8, top_p=0.95, @@ -264,10 +307,14 @@ def test_guided_grammar(sample_sql_statements, llm, guided_decoding=GuidedDecodingParams( grammar=sample_sql_statements, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) + disable_any_whitespace=disable_any_whitespace, + ), + ) outputs = llm.generate( - prompts=("Generate a sql state that select col_1 from " - "table_1 where it is equals to 1"), + prompts=( + "Generate a sql state that select col_1 from " + "table_1 where it is equals to 1" + ), sampling_params=sampling_params, use_tqdm=True, ) @@ -282,12 +329,12 @@ def test_guided_grammar(sample_sql_statements, llm, assert generated_text is not None # use Lark to parse the output, and make sure it's a valid parse tree from lark import Lark + parser = Lark(sample_sql_statements) parser.parse(generated_text) # remove spaces for comparison b/c we removed them in the grammar - ground_truth = "SELECT col_1 from table_1 where col_1 = 
1".replace( - " ", "") + ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "") assert generated_text.strip() == ground_truth @@ -299,10 +346,12 @@ def test_guided_options_request_deprecation_warning(sample_regex, llm): sampling_params = SamplingParams(temperature=0.8, top_p=0.95) with pytest.warns(DeprecationWarning, match="guided_options_request"): - llm.generate(prompts="This should fail", - sampling_params=sampling_params, - use_tqdm=True, - guided_options_request=dict(guided_regex=sample_regex)) + llm.generate( + prompts="This should fail", + sampling_params=sampling_params, + use_tqdm=True, + guided_options_request=dict(guided_regex=sample_regex), + ) @pytest.mark.skip_global_cleanup @@ -310,13 +359,16 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm): sampling_params = SamplingParams( temperature=0.8, top_p=0.95, - guided_decoding=GuidedDecodingParams(regex=sample_regex)) + guided_decoding=GuidedDecodingParams(regex=sample_regex), + ) with pytest.raises(ValueError, match="Cannot set both"): - llm.generate(prompts="This should fail", - sampling_params=sampling_params, - use_tqdm=True, - guided_options_request=dict(guided_regex=sample_regex)) + llm.generate( + prompts="This should fail", + sampling_params=sampling_params, + use_tqdm=True, + guided_options_request=dict(guided_regex=sample_regex), + ) @pytest.mark.skip_global_cleanup @@ -327,31 +379,35 @@ def test_disable_guided_decoding_fallback(sample_regex, llm): "properties": { "example": { "type": "string", - "minLength": 5 # unsupported by xgrammar + "minLength": 5, # unsupported by xgrammar } - } + }, } - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams( - json=unsupported_json, - backend="xgrammar", - disable_fallback=True)) + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + guided_decoding=GuidedDecodingParams( + json=unsupported_json, backend="xgrammar", disable_fallback=True + ), + ) with pytest.raises( - ValueError, - match="xgrammar does not support advanced JSON schema features " - "like string length, item limits, or property bounds."): - llm.generate(prompts="This should fail", - sampling_params=sampling_params, - use_tqdm=True) + ValueError, + match="xgrammar does not support advanced JSON schema features " + "like string length, item limits, or property bounds.", + ): + llm.generate( + prompts="This should fail", sampling_params=sampling_params, use_tqdm=True + ) @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - GRAMMAR_DECODING_BACKENDS) -def test_guided_json_object(llm, guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", GRAMMAR_DECODING_BACKENDS +) +def test_guided_json_object( + llm, guided_decoding_backend: str, disable_any_whitespace: bool +): sampling_params = SamplingParams( temperature=1.0, max_tokens=100, @@ -359,13 +415,18 @@ def test_guided_json_object(llm, guided_decoding_backend: str, guided_decoding=GuidedDecodingParams( json_object=True, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) + disable_any_whitespace=disable_any_whitespace, + ), + ) outputs = llm.generate( - prompts=("Generate a JSON object with curly braces for a person with " - "name and age fields for John Smith who is 31 years old."), + prompts=( + "Generate a JSON object with curly braces for a person with " + "name and age fields for 
John Smith who is 31 years old." + ), sampling_params=sampling_params, - use_tqdm=True) + use_tqdm=True, + ) assert outputs is not None for output in outputs: @@ -401,10 +462,12 @@ class CarDescription(BaseModel): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", ALL_DECODING_BACKENDS +) +def test_guided_json_completion_with_enum( + llm, guided_decoding_backend: str, disable_any_whitespace: bool +): json_schema = CarDescription.model_json_schema() sampling_params = SamplingParams( temperature=1.0, @@ -412,12 +475,15 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str, guided_decoding=GuidedDecodingParams( json=json_schema, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) + disable_any_whitespace=disable_any_whitespace, + ), + ) outputs = llm.generate( prompts="Generate a JSON with the brand, model and car_type of" "the most iconic car from the 90's", sampling_params=sampling_params, - use_tqdm=True) + use_tqdm=True, + ) assert outputs is not None for output in outputs: @@ -433,27 +499,18 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_number_range_json_completion(llm, guided_decoding_backend: str, - disable_any_whitespace: bool): +@pytest.mark.parametrize( + "guided_decoding_backend,disable_any_whitespace", ALL_DECODING_BACKENDS +) +def test_guided_number_range_json_completion( + llm, guided_decoding_backend: str, disable_any_whitespace: bool +): sample_output_schema = { "type": "object", "properties": { - "age": { - "type": "integer", - "minimum": 18, - "maximum": 99 - }, - "score": { - "type": "number", - "minimum": 0.0, - "maximum": 100.0 - }, - "zipcode": { - "type": "string", - "pattern": r"^\d{5}(-\d{4})?$" - }, + "age": {"type": "integer", "minimum": 18, "maximum": 99}, + "score": {"type": "number", "minimum": 0.0, "maximum": 100.0}, + "zipcode": {"type": "string", "pattern": r"^\d{5}(-\d{4})?$"}, }, "required": ["age", "score", "zipcode"], } @@ -463,12 +520,11 @@ def test_guided_number_range_json_completion(llm, guided_decoding_backend: str, guided_decoding=GuidedDecodingParams( json=sample_output_schema, backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace), + disable_any_whitespace=disable_any_whitespace, + ), ) outputs = llm.generate( - prompts=[ - "Create a JSON object for a user with age, score, and zipcode." 
- ] * 2, + prompts=["Create a JSON object for a user with age, score, and zipcode."] * 2, sampling_params=sampling_params, use_tqdm=True, ) @@ -487,43 +543,38 @@ def test_guided_number_range_json_completion(llm, guided_decoding_backend: str, jsonschema.validate(instance=output_json, schema=sample_output_schema) assert 18 <= output_json["age"] <= 99 assert 0.0 <= output_json["score"] <= 100.0 - assert (re.fullmatch(r"^\d{5}(-\d{4})?$", output_json["zipcode"]) - is not None) + assert re.fullmatch(r"^\d{5}(-\d{4})?$", output_json["zipcode"]) is not None @pytest.mark.skip_global_cleanup def test_guidance_no_additional_properties(llm): schema = { - 'type': 'object', - 'properties': { - 'a1': { - 'type': 'string' - }, - 'a2': { - 'type': 'string' - }, - 'a3': { - 'type': 'string' - } + "type": "object", + "properties": { + "a1": {"type": "string"}, + "a2": {"type": "string"}, + "a3": {"type": "string"}, }, - 'required': ['a1', 'a2', 'a3'], + "required": ["a1", "a2", "a3"], } prompt = ( "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a " "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a " "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20" - "<|im_end|>\n<|im_start|>assistant\n") + "<|im_end|>\n<|im_start|>assistant\n" + ) def generate_with_backend(backend, disable_additional_properties): guided_params = GuidedDecodingParams( json=schema, backend=backend, disable_any_whitespace=True, - disable_additional_properties=disable_additional_properties) - sampling_params = SamplingParams(temperature=0, - max_tokens=256, - guided_decoding=guided_params) + disable_additional_properties=disable_additional_properties, + ) + sampling_params = SamplingParams( + temperature=0, max_tokens=256, guided_decoding=guided_params + ) outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) assert outputs is not None diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 61b6b4fbf8e3..c8679852a54f 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -5,10 +5,10 @@ from contextlib import nullcontext import pytest -from vllm_test_utils import BlameResult, blame from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory +from vllm_test_utils import BlameResult, blame @pytest.fixture(scope="function", autouse=True) @@ -16,7 +16,7 @@ def use_v0_only(monkeypatch): """ V1 only supports xgrammar so this is irrelevant. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") def run_normal_opt125m(): @@ -29,9 +29,7 @@ def run_normal_opt125m(): sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM without guided decoding as a baseline. - llm = LLM(model="facebook/opt-125m", - enforce_eager=True, - gpu_memory_utilization=0.3) + llm = LLM(model="facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.3) outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt @@ -53,9 +51,9 @@ def run_normal(): sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM without guided decoding as a baseline. 
- llm = LLM(model="distilbert/distilgpt2", - enforce_eager=True, - gpu_memory_utilization=0.3) + llm = LLM( + model="distilbert/distilgpt2", enforce_eager=True, gpu_memory_utilization=0.3 + ) outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt @@ -69,18 +67,19 @@ def run_normal(): def run_lmfe(sample_regex): # Create an LLM with guided decoding enabled. - llm = LLM(model="distilbert/distilgpt2", - enforce_eager=True, - guided_decoding_backend="lm-format-enforcer", - gpu_memory_utilization=0.3) + llm = LLM( + model="distilbert/distilgpt2", + enforce_eager=True, + guided_decoding_backend="lm-format-enforcer", + gpu_memory_utilization=0.3, + ) sampling_params = SamplingParams(temperature=0.8, top_p=0.95) outputs = llm.generate( - prompts=[ - f"Give an example IPv4 address with this regex: {sample_regex}" - ] * 2, + prompts=[f"Give an example IPv4 address with this regex: {sample_regex}"] * 2, sampling_params=sampling_params, use_tqdm=True, - guided_options_request=dict(guided_regex=sample_regex)) + guided_options_request=dict(guided_regex=sample_regex), + ) for output in outputs: prompt = output.prompt @@ -89,8 +88,7 @@ def run_lmfe(sample_regex): def test_lazy_outlines(sample_regex): - """If users don't use guided decoding, outlines should not be imported. - """ + """If users don't use guided decoding, outlines should not be imported.""" # make sure outlines is not imported module_name = "outlines" # In CI, we only check finally if the module is imported. @@ -99,8 +97,7 @@ def test_lazy_outlines(sample_regex): # and help find the root cause. # We don't run it in CI by default because it is slow. use_blame = False - context = blame( - lambda: module_name in sys.modules) if use_blame else nullcontext() + context = blame(lambda: module_name in sys.modules) if use_blame else nullcontext() with context as result: run_normal() run_lmfe(sample_regex) @@ -109,4 +106,5 @@ def test_lazy_outlines(sample_regex): print(f"the first import location is:\n{result.trace_stack}") assert module_name not in sys.modules, ( f"Module {module_name} is imported. To see the first" - f" import location, run the test with `use_blame=True`.") + f" import location, run the test with `use_blame=True`." 
+ ) diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 1b7be15d5d69..a63e9dd5240f 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -16,12 +16,12 @@ def v1(run_with_both_engines): def test_empty_prompt(): llm = LLM(model="openai-community/gpt2", enforce_eager=True) - with pytest.raises(ValueError, match='decoder prompt cannot be empty'): + with pytest.raises(ValueError, match="decoder prompt cannot be empty"): llm.generate([""]) @pytest.mark.skip_v1 def test_out_of_vocab_token(): llm = LLM(model="openai-community/gpt2", enforce_eager=True) - with pytest.raises(ValueError, match='out of vocabulary'): + with pytest.raises(ValueError, match="out of vocabulary"): llm.generate({"prompt_token_ids": [999999]}) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index a606eeab5887..ab5e20c0df74 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for HF_HUB_OFFLINE mode""" + import importlib import sys @@ -88,12 +89,11 @@ def disable_connect(*args, **kwargs): def _re_import_modules(): - hf_hub_module_names = [ - k for k in sys.modules if k.startswith("huggingface_hub") - ] + hf_hub_module_names = [k for k in sys.modules if k.startswith("huggingface_hub")] transformers_module_names = [ - k for k in sys.modules if k.startswith("transformers") - and not k.startswith("transformers_modules") + k + for k in sys.modules + if k.startswith("transformers") and not k.startswith("transformers_modules") ] reload_exception = None diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 41b70f80e3b8..fe2f88577294 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -27,7 +27,7 @@ [], # Default ["--enable-chunked-prefill"], # Chunked ["--num-scheduler-steps", "8"], # MS - ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream + ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"], # MS+Stream ] MAX_WAIT_SECONDS = None @@ -47,14 +47,15 @@ def run_test(more_args): print(f"Running with: {args}") with RemoteOpenAIServer( - MODEL_NAME, args, - max_wait_seconds=MAX_WAIT_SECONDS) as remote_server: + MODEL_NAME, args, max_wait_seconds=MAX_WAIT_SECONDS + ) as remote_server: url = f"{remote_server.url_for('v1')}/completions" model_args = ( f"model={MODEL_NAME}," f"base_url={url}," - f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False") + f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False" + ) results = lm_eval.simple_evaluate( model="local-completions", @@ -63,14 +64,16 @@ def run_test(more_args): ) measured_value = results["results"][TASK][FILTER] - assert (measured_value - RTOL < EXPECTED_VALUE - and measured_value + RTOL > EXPECTED_VALUE - ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + assert ( + measured_value - RTOL < EXPECTED_VALUE + and measured_value + RTOL > EXPECTED_VALUE + ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" -@pytest.mark.skipif(not current_platform.is_cuda() - and not current_platform.is_tpu(), - reason="V1 currently only supported on CUDA and TPU") +@pytest.mark.skipif( + not 
current_platform.is_cuda() and not current_platform.is_tpu(), + reason="V1 currently only supported on CUDA and TPU", +) def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" @@ -86,8 +89,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): @pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, - more_args): +def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, more_args): """Run with the V0 Engine.""" with monkeypatch.context() as m: diff --git a/tests/entrypoints/openai/correctness/test_mteb_embed.py b/tests/entrypoints/openai/correctness/test_mteb_embed.py index 12a86f9bdd59..48e8d0fce86a 100644 --- a/tests/entrypoints/openai/correctness/test_mteb_embed.py +++ b/tests/entrypoints/openai/correctness/test_mteb_embed.py @@ -4,10 +4,12 @@ import pytest -from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS, - MTEB_EMBED_TOL, - OpenAIClientMtebEncoder, - run_mteb_embed_task) +from tests.models.language.pooling.mteb_utils import ( + MTEB_EMBED_TASKS, + MTEB_EMBED_TOL, + OpenAIClientMtebEncoder, + run_mteb_embed_task, +) from tests.utils import RemoteOpenAIServer os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" @@ -18,9 +20,7 @@ @pytest.fixture(scope="module") def server(): - args = [ - "--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log" - ] + args = ["--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/correctness/test_mteb_score.py b/tests/entrypoints/openai/correctness/test_mteb_score.py index 05e953de4a0f..569aabb8c53f 100644 --- a/tests/entrypoints/openai/correctness/test_mteb_score.py +++ b/tests/entrypoints/openai/correctness/test_mteb_score.py @@ -7,9 +7,15 @@ # yapf conflicts with isort for this block # yapf: disable from tests.models.language.pooling.mteb_utils import ( - MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL, - RerankClientMtebEncoder, ScoreClientMtebEncoder, - mteb_test_rerank_models_hf, run_mteb_rerank) + MTEB_RERANK_LANGS, + MTEB_RERANK_TASKS, + MTEB_RERANK_TOL, + RerankClientMtebEncoder, + ScoreClientMtebEncoder, + mteb_test_rerank_models_hf, + run_mteb_rerank, +) + # yapf: enable from tests.utils import RemoteOpenAIServer @@ -20,9 +26,7 @@ @pytest.fixture(scope="module") def server(): - args = [ - "--task", "score", "--enforce-eager", "--disable-uvicorn-access-log" - ] + args = ["--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @@ -39,8 +43,7 @@ def st_main_score(hf_runner): def test_mteb_score(server, st_main_score): url = server.url_for("score") encoder = ScoreClientMtebEncoder(MODEL_NAME, url) - vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, - MTEB_RERANK_LANGS) + vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS) print("VLLM main score: ", vllm_main_score) print("SentenceTransformer main score: ", st_main_score) @@ -52,8 +55,7 @@ def test_mteb_score(server, st_main_score): def test_mteb_rerank(server, st_main_score): url = server.url_for("rerank") encoder = RerankClientMtebEncoder(MODEL_NAME, url) - vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, - MTEB_RERANK_LANGS) + vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, MTEB_RERANK_LANGS) print("VLLM main score: ", 
vllm_main_score) print("SentenceTransformer main score: ", st_main_score) diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index 58195f98bd35..17c5867372c4 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -7,6 +7,7 @@ This simulates real work usage of the API and makes sure that the frontend and AsyncLLMEngine are working correctly. """ + import asyncio import io import time @@ -45,7 +46,8 @@ async def transcribe_audio(client, tokenizer, y, sr): # NOTE there's no streaming in transcriptions, can't measure ttft latency = end_time - start_time num_output_tokens = len( - tokenizer(transcription.text, add_special_tokens=False).input_ids) + tokenizer(transcription.text, add_special_tokens=False).input_ids + ) return latency, num_output_tokens, transcription.text @@ -71,7 +73,8 @@ async def process_dataset(model, client, data, concurrent_request): for sample in data: audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] task = asyncio.create_task( - bound_transcribe(model, sem, client, (audio, sr), sample["text"])) + bound_transcribe(model, sem, client, (audio, sr), sample["text"]) + ) tasks.append(task) return await asyncio.gather(*tasks) @@ -95,34 +98,35 @@ def print_performance_metrics(results, total_time): def add_duration(sample): - y, sr = sample['audio']["array"], sample['audio']["sampling_rate"] - sample['duration_ms'] = librosa.get_duration(y=y, sr=sr) * 1000 + y, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] + sample["duration_ms"] = librosa.get_duration(y=y, sr=sr) * 1000 return sample -def load_hf_dataset(dataset_repo: str, split='validation', **hf_kwargs): +def load_hf_dataset(dataset_repo: str, split="validation", **hf_kwargs): ## Load and filter the dataset dataset = load_dataset(dataset_repo, split=split, **hf_kwargs) - if 'duration_ms' not in dataset[0]: + if "duration_ms" not in dataset[0]: # compute duration to filter dataset = dataset.map(add_duration) # Whisper max supported duration - dataset = dataset.filter(lambda example: example['duration_ms'] < 30000) + dataset = dataset.filter(lambda example: example["duration_ms"] < 30000) return dataset -def run_evaluation(model: str, - client, - dataset, - max_concurrent_reqs: int, - n_examples: int = -1, - print_metrics: bool = True): +def run_evaluation( + model: str, + client, + dataset, + max_concurrent_reqs: int, + n_examples: int = -1, + print_metrics: bool = True, +): if n_examples > 0: dataset = dataset.select(range(n_examples)) start = time.perf_counter() - results = asyncio.run( - process_dataset(model, client, dataset, max_concurrent_reqs)) + results = asyncio.run(process_dataset(model, client, dataset, max_concurrent_reqs)) end = time.perf_counter() total_time = end - start print(f"Total Test Time: {total_time:.4f} seconds") @@ -132,8 +136,7 @@ def run_evaluation(model: str, predictions = [res[2] for res in results] references = [res[3] for res in results] wer = load("wer") - wer_score = 100 * wer.compute(references=references, - predictions=predictions) + wer_score = 100 * wer.compute(references=references, predictions=predictions) print("WER:", wer_score) return wer_score @@ -142,26 +145,25 @@ def run_evaluation(model: str, @pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"]) # Original dataset is 20GB+ in size, hence we use a 
pre-filtered slice. @pytest.mark.parametrize( - "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"]) + "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"] +) # NOTE: Expected WER measured with equivalent hf.transformers args: # whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered. @pytest.mark.parametrize("expected_wer", [12.744980]) -def test_wer_correctness(model_name, - dataset_repo, - expected_wer, - n_examples=-1, - max_concurrent_request=None): +def test_wer_correctness( + model_name, dataset_repo, expected_wer, n_examples=-1, max_concurrent_request=None +): # TODO refactor to use `ASRDataset` - with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server: + with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server: dataset = load_hf_dataset(dataset_repo) if not max_concurrent_request: # No max concurrency - max_concurrent_request = n_examples if n_examples > 0\ - else len(dataset) + max_concurrent_request = n_examples if n_examples > 0 else len(dataset) client = remote_server.get_async_client() - wer = run_evaluation(model_name, client, dataset, - max_concurrent_request, n_examples) + wer = run_evaluation( + model_name, client, dataset, max_concurrent_request, n_examples + ) if expected_wer: torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2) diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index ab3c80905438..c81e2ec2190f 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -47,15 +47,11 @@ async def client(server): ids=["completion", "chat"], argnames=["create_func_gen", "content_body"], argvalues=[ - (lambda x: x.completions.create, { - "prompt": " ".join(['A'] * 10_000) - }), - (lambda x: x.chat.completions.create, { - "messages": [{ - "role": "user", - "content": " ".join(['A'] * 10_000) - }] - }), + (lambda x: x.completions.create, {"prompt": " ".join(["A"] * 10_000)}), + ( + lambda x: x.chat.completions.create, + {"messages": [{"role": "user", "content": " ".join(["A"] * 10_000)}]}, + ), ], ) async def test_with_and_without_truncate( @@ -68,15 +64,15 @@ async def test_with_and_without_truncate( body = {"model": MODEL_NAME, **content_body, "max_tokens": 10} num_requests = 10 - truncate_prompt_tokens = ([1000] * (num_requests // 2) + [None] * - (num_requests - num_requests // 2)) + truncate_prompt_tokens = [1000] * (num_requests // 2) + [None] * ( + num_requests - num_requests // 2 + ) random.shuffle(truncate_prompt_tokens) - bodies = [{ - **body, "extra_body": { - 'truncate_prompt_tokens': t - } - } for t in truncate_prompt_tokens] + bodies = [ + {**body, "extra_body": {"truncate_prompt_tokens": t}} + for t in truncate_prompt_tokens + ] async def get_status_code(**kwargs): try: @@ -94,18 +90,12 @@ async def get_status_code(**kwargs): ids=["single completion", "multiple completions", "chat"], argnames=["create_func_gen", "content_body"], argvalues=[ - (lambda x: x.completions.create, { - "prompt": " ".join(['A'] * 300_000) - }), - (lambda x: x.completions.create, { - "prompt": [" ".join(['A'] * 300_000)] * 2 - }), - (lambda x: x.chat.completions.create, { - "messages": [{ - "role": "user", - "content": " ".join(['A'] * 300_000) - }] - }), + (lambda x: x.completions.create, {"prompt": " ".join(["A"] * 300_000)}), + (lambda x: x.completions.create, {"prompt": [" ".join(["A"] * 300_000)] * 2}), + ( + lambda x: x.chat.completions.create, 
+ {"messages": [{"role": "user", "content": " ".join(["A"] * 300_000)}]}, + ), ], ) async def test_healthcheck_response_time( @@ -127,9 +117,7 @@ def get_response_time(url): return end_time - start_time no_load_response_time = get_response_time(server.url_for("health")) - tasks = [ - asyncio.create_task(create_func(**body)) for _ in range(num_requests) - ] + tasks = [asyncio.create_task(create_func(**body)) for _ in range(num_requests)] await asyncio.sleep(1) # give the tasks a chance to start running load_response_time = get_response_time(server.url_for("health")) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index d67c05ab3e8d..e9e73f88a7bb 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -54,24 +54,18 @@ def base64_encoded_audio() -> dict[str, str]: @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) -async def test_single_chat_session_audio(client: openai.AsyncOpenAI, - model_name: str, audio_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "audio_url", - "audio_url": { - "url": audio_url - } - }, - { - "type": "text", - "text": "What's happening in this audio?" - }, - ], - }] +async def test_single_chat_session_audio( + client: openai.AsyncOpenAI, model_name: str, audio_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "audio_url", "audio_url": {"url": audio_url}}, + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -80,13 +74,15 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=202, total_tokens=212) + completion_tokens=10, prompt_tokens=202, total_tokens=212 + ) message = choice.message message = chat_completion.choices[0].message @@ -108,56 +104,52 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) -async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI, - model_name: str, - audio_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "audio_url", - "audio_url": audio_url - }, - { - "type": "text", - "text": "What's happening in this audio?" 
- }, - ], - }] +async def test_error_on_invalid_audio_url_type( + client: openai.AsyncOpenAI, model_name: str, audio_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "audio_url", "audio_url": audio_url}, + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] # audio_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - temperature=0.0) + _ = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_audio_base64encoded( - client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: dict[str, str]): - - messages = [{ - "role": - "user", - "content": [ - { - "type": "audio_url", - "audio_url": { - "url": - f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" - } - }, - { - "type": "text", - "text": "What's happening in this audio?" - }, - ], - }] + client: openai.AsyncOpenAI, + model_name: str, + audio_url: str, + base64_encoded_audio: dict[str, str], +): + messages = [ + { + "role": "user", + "content": [ + { + "type": "audio_url", + "audio_url": { + "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" + }, + }, + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -166,13 +158,15 @@ async def test_single_chat_session_audio_base64encoded( max_completion_tokens=10, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=202, total_tokens=212) + completion_tokens=10, prompt_tokens=202, total_tokens=212 + ) message = choice.message message = chat_completion.choices[0].message @@ -196,25 +190,26 @@ async def test_single_chat_session_audio_base64encoded( @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_input_audio( - client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: dict[str, str]): - messages = [{ - "role": - "user", - "content": [ - { - "type": "input_audio", - "input_audio": { - "data": base64_encoded_audio[audio_url], - "format": "wav" - } - }, - { - "type": "text", - "text": "What's happening in this audio?" 
- }, - ], - }] + client: openai.AsyncOpenAI, + model_name: str, + audio_url: str, + base64_encoded_audio: dict[str, str], +): + messages = [ + { + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": base64_encoded_audio[audio_url], + "format": "wav", + }, + }, + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -222,13 +217,15 @@ async def test_single_chat_session_input_audio( messages=messages, max_completion_tokens=10, logprobs=True, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=202, total_tokens=212) + completion_tokens=10, prompt_tokens=202, total_tokens=212 + ) message = choice.message message = chat_completion.choices[0].message @@ -250,24 +247,18 @@ async def test_single_chat_session_input_audio( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) -async def test_chat_streaming_audio(client: openai.AsyncOpenAI, - model_name: str, audio_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "audio_url", - "audio_url": { - "url": audio_url - } - }, - { - "type": "text", - "text": "What's happening in this audio?" - }, - ], - }] +async def test_chat_streaming_audio( + client: openai.AsyncOpenAI, model_name: str, audio_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "audio_url", "audio_url": {"url": audio_url}}, + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -307,27 +298,27 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) -async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, - model_name: str, audio_url: str, - base64_encoded_audio: dict[str, - str]): - messages = [{ - "role": - "user", - "content": [ - { - "type": "input_audio", - "input_audio": { - "data": base64_encoded_audio[audio_url], - "format": "wav" - } - }, - { - "type": "text", - "text": "What's happening in this audio?" 
- }, - ], - }] +async def test_chat_streaming_input_audio( + client: openai.AsyncOpenAI, + model_name: str, + audio_url: str, + base64_encoded_audio: dict[str, str], +): + messages = [ + { + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": base64_encoded_audio[audio_url], + "format": "wav", + }, + }, + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -367,26 +358,23 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( - "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]]) -async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, - audio_urls: list[str]): - - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "audio_url", - "audio_url": { - "url": audio_url - } - } for audio_url in audio_urls), - { - "type": "text", - "text": "What's happening in this audio?" - }, - ], - }] + "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]] +) +async def test_multi_audio_input( + client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str] +): + messages = [ + { + "role": "user", + "content": [ + *( + {"type": "audio_url", "audio_url": {"url": audio_url}} + for audio_url in audio_urls + ), + {"type": "text", "text": "What's happening in this audio?"}, + ], + } + ] if len(audio_urls) > MAXIMUM_AUDIOS: with pytest.raises(openai.BadRequestError): # test multi-audio input diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index a55941976cd8..50ec87b4464f 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -16,9 +16,9 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def server_args(request: pytest.FixtureRequest) -> list[str]: - """ Provide extra arguments to the server via indirect parametrization + """Provide extra arguments to the server via indirect parametrization Usage: @@ -80,8 +80,10 @@ async def client(server): "server_args", [ pytest.param([], id="default-frontend-multiprocessing"), - pytest.param(["--disable-frontend-multiprocessing"], - id="disable-frontend-multiprocessing") + pytest.param( + ["--disable-frontend-multiprocessing"], + id="disable-frontend-multiprocessing", + ), ], indirect=True, ) @@ -97,8 +99,10 @@ async def test_show_version(server: RemoteOpenAIServer): "server_args", [ pytest.param([], id="default-frontend-multiprocessing"), - pytest.param(["--disable-frontend-multiprocessing"], - id="disable-frontend-multiprocessing") + pytest.param( + ["--disable-frontend-multiprocessing"], + id="disable-frontend-multiprocessing", + ), ], indirect=True, ) @@ -112,11 +116,13 @@ async def test_check_health(server: RemoteOpenAIServer): @pytest.mark.parametrize( "server_args", [ - pytest.param(["--max-model-len", "10100"], - id="default-frontend-multiprocessing"), + pytest.param( + ["--max-model-len", "10100"], id="default-frontend-multiprocessing" + ), pytest.param( ["--disable-frontend-multiprocessing", "--max-model-len", "10100"], - id="disable-frontend-multiprocessing") + id="disable-frontend-multiprocessing", + ), ], indirect=True, ) @@ -131,14 +137,16 @@ async def test_request_cancellation(server: RemoteOpenAIServer): # Request about 2 million tokens for _ in range(200): task = 
asyncio.create_task( - client.chat.completions.create(messages=chat_input, - model=MODEL_NAME, - max_tokens=10000, - extra_body={"min_tokens": 10000})) + client.chat.completions.create( + messages=chat_input, + model=MODEL_NAME, + max_tokens=10000, + extra_body={"min_tokens": 10000}, + ) + ) tasks.append(task) - done, pending = await asyncio.wait(tasks, - return_when=asyncio.ALL_COMPLETED) + done, pending = await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED) # Make sure all requests were sent to the server and timed out # (We don't want to hide other errors like 400s that would invalidate this @@ -151,16 +159,15 @@ async def test_request_cancellation(server: RemoteOpenAIServer): # If the server had not cancelled all the other requests, then it would not # be able to respond to this one within the timeout client = server.get_async_client(timeout=5) - response = await client.chat.completions.create(messages=chat_input, - model=MODEL_NAME, - max_tokens=10) + response = await client.chat.completions.create( + messages=chat_input, model=MODEL_NAME, max_tokens=10 + ) assert len(response.choices) == 1 @pytest.mark.asyncio async def test_request_wrong_content_type(server: RemoteOpenAIServer): - chat_input = [{"role": "user", "content": "Write a long story"}] client = server.get_async_client() @@ -169,17 +176,13 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer): messages=chat_input, model=MODEL_NAME, max_tokens=10000, - extra_headers={ - "Content-Type": "application/x-www-form-urlencoded" - }) + extra_headers={"Content-Type": "application/x-www-form-urlencoded"}, + ) @pytest.mark.parametrize( "server_args", - [ - pytest.param(["--enable-server-load-tracking"], - id="enable-server-load-tracking") - ], + [pytest.param(["--enable-server-load-tracking"], id="enable-server-load-tracking")], indirect=True, ) @pytest.mark.asyncio @@ -202,7 +205,8 @@ def make_long_completion_request(): # Start the completion request in a background thread. completion_future = asyncio.create_task( - asyncio.to_thread(make_long_completion_request)) + asyncio.to_thread(make_long_completion_request) + ) # Give a short delay to ensure the request has started. 
await asyncio.sleep(0.1) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index e7c3ffaa6a9f..a21be27e61ae 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -15,8 +15,10 @@ from openai import BadRequestError, OpenAI from ...utils import RemoteOpenAIServer -from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 -from .test_completion import zephyr_lora_files # noqa: F401 +from .test_completion import ( + zephyr_lora_added_tokens_files, # noqa: F401 + zephyr_lora_files, # noqa: F401 +) # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -25,6 +27,7 @@ @pytest.fixture(scope="module") def monkeypatch_module(): from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() yield mpatch mpatch.undo() @@ -32,13 +35,13 @@ def monkeypatch_module(): @pytest.fixture(scope="module", params=[False, True]) def server( - request, - monkeypatch_module, - zephyr_lora_files, #noqa: F811 - zephyr_lora_added_tokens_files): # noqa: F811 - + request, + monkeypatch_module, + zephyr_lora_files, # noqa: F811 + zephyr_lora_added_tokens_files, +): # noqa: F811 use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0") args = [ # use half precision for speed and memory savings in CI environment @@ -67,8 +70,9 @@ def server( @pytest.fixture def is_v1_server(server): import os - assert os.environ['VLLM_USE_V1'] in ['0', '1'] - return os.environ['VLLM_USE_V1'] == '1' + + assert os.environ["VLLM_USE_V1"] in ["0", "1"] + return os.environ["VLLM_USE_V1"] == "1" @pytest_asyncio.fixture @@ -84,20 +88,18 @@ async def client(server): [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] chat_completion = await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=5, temperature=0.0, - logprobs=False) + logprobs=False, + ) choice = chat_completion.choices[0] assert choice.logprobs is None @@ -110,13 +112,10 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): [MODEL_NAME, "zephyr-lora"], ) async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" 
- }] + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] chat_completion = await client.chat.completions.create( model=model_name, @@ -124,7 +123,8 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): max_completion_tokens=5, temperature=0.0, logprobs=True, - top_logprobs=0) + top_logprobs=0, + ) choice = chat_completion.choices[0] assert choice.logprobs is not None @@ -138,13 +138,10 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): [MODEL_NAME, "zephyr-lora"], ) async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] chat_completion = await client.chat.completions.create( model=model_name, @@ -152,7 +149,8 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): max_completion_tokens=5, temperature=0.0, logprobs=True, - top_logprobs=5) + top_logprobs=5, + ) choice = chat_completion.choices[0] assert choice.logprobs is not None @@ -165,41 +163,39 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] +async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] # Default max_logprobs is 20, so this should raise an error with pytest.raises((openai.BadRequestError, openai.APIError)): - stream = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - logprobs=True, - top_logprobs=21, - stream=True) + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=21, + stream=True, + ) async for chunk in stream: ... 
with pytest.raises(openai.BadRequestError): - await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - logprobs=True, - top_logprobs=30, - stream=False) + await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=30, + stream=False, + ) # the server should still work afterwards chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, - max_completion_tokens=10, - stream=False) + model=model_name, messages=messages, max_completion_tokens=10, stream=False + ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -209,27 +205,20 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, "model_name, prompt_logprobs", [(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)], ) -async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI, - model_name: str, - prompt_logprobs: Optional[int]): +async def test_prompt_logprobs_chat( + client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int] +): params: dict = { - "messages": [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "Who won the world series in 2020?" - }, { - "role": - "assistant", - "content": - "The Los Angeles Dodgers won the World Series in 2020." - }, { - "role": "user", - "content": "Where was it played?" - }], - "model": - model_name + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"}, + { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020.", + }, + {"role": "user", "content": "Where was it played?"}, + ], + "model": model_name, } if prompt_logprobs is not None: @@ -252,29 +241,21 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI, "model_name", [MODEL_NAME], ) -async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI, - model_name: str): +async def test_more_than_one_prompt_logprobs_chat( + client: openai.AsyncOpenAI, model_name: str +): params: dict = { - "messages": [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "Who won the world series in 2020?" - }, { - "role": - "assistant", - "content": - "The Los Angeles Dodgers won the World Series in 2020." - }, { - "role": "user", - "content": "Where was it played?" - }], - "model": - model_name, - "extra_body": { - "prompt_logprobs": 1 - } + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"}, + { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020.", + }, + {"role": "user", "content": "Where was it played?"}, + ], + "model": model_name, + "extra_body": {"prompt_logprobs": 1}, } completion_1 = await client.chat.completions.create(**params) @@ -291,15 +272,11 @@ async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_single_chat_session(client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" 
- }] +async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] # test single completion chat_completion = await client.chat.completions.create( @@ -307,14 +284,16 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, logprobs=True, - top_logprobs=5) + top_logprobs=5, + ) assert chat_completion.id is not None assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=37, total_tokens=47) + completion_tokens=10, prompt_tokens=37, total_tokens=47 + ) message = choice.message assert message.content is not None and len(message.content) >= 10 @@ -339,13 +318,10 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, [MODEL_NAME, "zephyr-lora"], ) async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] # test single completion chat_completion = await client.chat.completions.create( @@ -387,15 +363,13 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): "model_name", ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ) -async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "What is the capital of France?" 
- }] +async def test_chat_completion_stream_options( + client: openai.AsyncOpenAI, model_name: str +): + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ] # Test stream=True, stream_options={"include_usage": False} stream = await client.chat.completions.create( @@ -404,23 +378,21 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, max_completion_tokens=10, temperature=0.0, stream=True, - stream_options={"include_usage": False}) + stream_options={"include_usage": False}, + ) async for chunk in stream: assert chunk.usage is None # Test stream=True, stream_options={"include_usage": True, # "continuous_usage_stats": False}} - stream = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": - True, - "continuous_usage_stats": - False - }) + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + stream=True, + stream_options={"include_usage": True, "continuous_usage_stats": False}, + ) async for chunk in stream: if chunk.choices[0].finish_reason is None: @@ -432,8 +404,8 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, assert final_chunk.usage.prompt_tokens > 0 assert final_chunk.usage.completion_tokens > 0 assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) + final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens + ) assert final_chunk.choices == [] # Test stream=False, stream_options={"include_usage": None} @@ -444,7 +416,8 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, max_completion_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": None}) + stream_options={"include_usage": None}, + ) # Test stream=False, stream_options={"include_usage": True} with pytest.raises(BadRequestError): @@ -454,7 +427,8 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, max_completion_tokens=10, temperature=0.0, stream=False, - stream_options={"include_usage": True}) + stream_options={"include_usage": True}, + ) # Test stream=True, stream_options={"include_usage": True, # "continuous_usage_stats": True} @@ -473,92 +447,86 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, last_completion_tokens = 0 async for chunk in stream: assert chunk.usage.prompt_tokens >= 0 - assert last_completion_tokens == 0 or \ - chunk.usage.completion_tokens > last_completion_tokens or \ - ( - not chunk.choices and - chunk.usage.completion_tokens == last_completion_tokens - ) - assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + - chunk.usage.completion_tokens) + assert ( + last_completion_tokens == 0 + or chunk.usage.completion_tokens > last_completion_tokens + or ( + not chunk.choices + and chunk.usage.completion_tokens == last_completion_tokens + ) + ) + assert chunk.usage.total_tokens == ( + chunk.usage.prompt_tokens + chunk.usage.completion_tokens + ) last_completion_tokens = chunk.usage.completion_tokens assert last_completion_tokens == 10 @pytest.mark.asyncio -async def test_guided_choice_chat(client: openai.AsyncOpenAI, - sample_guided_choice): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - "The best language for 
type-safe systems programming is " - }] +async def test_guided_choice_chat(client: openai.AsyncOpenAI, sample_guided_choice): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + { + "role": "user", + "content": "The best language for type-safe systems programming is ", + }, + ] chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_guided_choice)) + extra_body=dict(guided_choice=sample_guided_choice), + ) choice1 = chat_completion.choices[0].message.content assert choice1 in sample_guided_choice messages.append({"role": "assistant", "content": choice1}) - messages.append({ - "role": "user", - "content": "I disagree, pick another one" - }) + messages.append({"role": "user", "content": "I disagree, pick another one"}) chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_guided_choice)) + extra_body=dict(guided_choice=sample_guided_choice), + ) choice2 = chat_completion.choices[0].message.content assert choice2 in sample_guided_choice assert choice1 != choice2 @pytest.mark.asyncio -async def test_guided_json_chat(client: openai.AsyncOpenAI, - sample_json_schema): - - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {sample_json_schema}" - }] +async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + { + "role": "user", + "content": f"Give an example JSON for an employee profile that " + f"fits this schema: {sample_json_schema}", + }, + ] chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema)) + extra_body=dict(guided_json=sample_json_schema), + ) message = chat_completion.choices[0].message assert message.content is not None json1 = json.loads(message.content) jsonschema.validate(instance=json1, schema=sample_json_schema) messages.append({"role": "assistant", "content": message.content}) - messages.append({ - "role": - "user", - "content": - "Give me another one with a different name and age" - }) + messages.append( + {"role": "user", "content": "Give me another one with a different name and age"} + ) chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema)) + extra_body=dict(guided_json=sample_json_schema), + ) message = chat_completion.choices[0].message assert message.content is not None json2 = json.loads(message.content) @@ -569,21 +537,19 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex): - - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example IP address with this regex: {sample_regex}" - }] + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + { + "role": "user", + "content": f"Give an example IP address with this regex: {sample_regex}", + }, + ] chat_completion = await client.chat.completions.create( model=MODEL_NAME, 
messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex)) + extra_body=dict(guided_regex=sample_regex), + ) ip1 = chat_completion.choices[0].message.content assert ip1 is not None assert re.fullmatch(sample_regex, ip1) is not None @@ -594,7 +560,8 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex): model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex)) + extra_body=dict(guided_regex=sample_regex), + ) ip2 = chat_completion.choices[0].message.content assert ip2 is not None assert re.fullmatch(sample_regex, ip2) is not None @@ -603,45 +570,41 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex): @pytest.mark.asyncio async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - "The best language for type-safe systems programming is " - }] + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + { + "role": "user", + "content": "The best language for type-safe systems programming is ", + }, + ] with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - extra_body=dict(guided_regex={ - 1: "Python", - 2: "C++" - })) + _ = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + extra_body=dict(guided_regex={1: "Python", 2: "C++"}), + ) @pytest.mark.asyncio -async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, - sample_guided_choice): - - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - "The best language for type-safe systems programming is " - }] +async def test_guided_choice_chat_logprobs( + client: openai.AsyncOpenAI, sample_guided_choice +): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + { + "role": "user", + "content": "The best language for type-safe systems programming is ", + }, + ] chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(guided_choice=sample_guided_choice)) + extra_body=dict(guided_choice=sample_guided_choice), + ) assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.content is not None @@ -654,16 +617,14 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {sample_json_schema}" - }] + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + { + "role": "user", + "content": f"Give an example JSON for an employee profile that " + f"fits this schema: {sample_json_schema}", + }, + ] # non-streaming @@ -671,20 +632,17 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema): model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice={ - "type": "function", - "function": { - 
"name": "dummy_function_name" + tools=[ + { + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema, + }, } - }, + ], + tool_choice={"type": "function", "function": {"name": "dummy_function_name"}}, ) message = chat_completion.choices[0].message assert len(message.content) == 0 @@ -693,12 +651,9 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema): jsonschema.validate(instance=json1, schema=sample_json_schema) messages.append({"role": "assistant", "content": json_string}) - messages.append({ - "role": - "user", - "content": - "Give me another one with a different name and age" - }) + messages.append( + {"role": "user", "content": "Give me another one with a different name and age"} + ) # streaming @@ -706,21 +661,19 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema): model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "dummy_function_name" + tools=[ + { + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema, + }, } - }, - stream=True) + ], + tool_choice={"type": "function", "function": {"name": "dummy_function_name"}}, + stream=True, + ) output = [] finish_reason_count = 0 @@ -743,11 +696,11 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_required_tool_use(client: openai.AsyncOpenAI, - is_v1_server: bool, model_name: str): +async def test_required_tool_use( + client: openai.AsyncOpenAI, is_v1_server: bool, model_name: str +): if is_v1_server: - pytest.skip( - "tool_choice='required' requires features unsupported on V1") + pytest.skip("tool_choice='required' requires features unsupported on V1") tools = [ { @@ -760,20 +713,16 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, "properties": { "city": { "type": "string", - "description": - "The city to find the weather for, e.g. 'Vienna'", + "description": "The city to find the weather for, e.g. 'Vienna'", "default": "Vienna", }, "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", + "type": "string", + "description": "The country that the city is in, e.g. 'Austria'", }, "unit": { "type": "string", - "description": - "The unit to fetch the temperature in", + "description": "The unit to fetch the temperature in", "enum": ["celsius", "fahrenheit"], }, }, @@ -791,26 +740,20 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, "properties": { "city": { "type": "string", - "description": - "The city to get the forecast for, e.g. 'Vienna'", + "description": "The city to get the forecast for, e.g. 'Vienna'", "default": "Vienna", }, "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", + "type": "string", + "description": "The country that the city is in, e.g. 
'Austria'", }, "days": { - "type": - "integer", - "description": - "Number of days to get the forecast for (1-7)", + "type": "integer", + "description": "Number of days to get the forecast for (1-7)", }, "unit": { "type": "string", - "description": - "The unit to fetch the temperature in", + "description": "The unit to fetch the temperature in", "enum": ["celsius", "fahrenheit"], }, }, @@ -821,19 +764,11 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, ] messages = [ + {"role": "user", "content": "Hi! How are you doing today?"}, + {"role": "assistant", "content": "I'm doing well! How can I help you?"}, { "role": "user", - "content": "Hi! How are you doing today?" - }, - { - "role": "assistant", - "content": "I'm doing well! How can I help you?" - }, - { - "role": - "user", - "content": - "Can you tell me what the current weather is in Berlin and the "\ + "content": "Can you tell me what the current weather is in Berlin and the " "forecast for the next 5 days, in fahrenheit?", }, ] @@ -867,64 +802,66 @@ async def test_required_tool_use(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, - sample_json_schema): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {sample_json_schema}" - }] +async def test_inconsistent_tool_choice_and_tools( + client: openai.AsyncOpenAI, sample_json_schema +): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + { + "role": "user", + "content": f"Give an example JSON for an employee profile that " + f"fits this schema: {sample_json_schema}", + }, + ] with pytest.raises(openai.BadRequestError): - await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - max_completion_tokens=1000, - tool_choice={ - "type": "function", - "function": { - "name": - "dummy_function_name" - } - }) + await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_completion_tokens=1000, + tool_choice={ + "type": "function", + "function": {"name": "dummy_function_name"}, + }, + ) with pytest.raises(openai.BadRequestError): await client.chat.completions.create( model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema + tools=[ + { + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema, + }, } - }], + ], tool_choice={ "type": "function", - "function": { - "name": "nondefined_function_name" - } - }) + "function": {"name": "nondefined_function_name"}, + }, + ) with pytest.raises(openai.BadRequestError): await client.chat.completions.create( model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema + tools=[ + { + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema, + }, } - }], - tool_choice={}) + ], + tool_choice={}, + ) @pytest.mark.asyncio @@ -932,13 +869,17 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): for _ in range(2): resp = await 
client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": - "user", - "content": ('what is 1+1? please respond with a JSON object, ' - 'the format is {"result": 2}') - }], - response_format={"type": "json_object"}) + messages=[ + { + "role": "user", + "content": ( + "what is 1+1? please respond with a JSON object, " + 'the format is {"result": 2}' + ), + } + ], + response_format={"type": "json_object"}, + ) content = resp.choices[0].message.content assert content is not None @@ -954,10 +895,7 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI): for _ in range(2): resp = await client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": prompt - }], + messages=[{"role": "user", "content": prompt}], ) content = resp.choices[0].message.content assert content is not None @@ -968,10 +906,7 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI): for _ in range(2): resp = await client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": prompt - }], + messages=[{"role": "user", "content": prompt}], response_format={ "type": "json_schema", "json_schema": { @@ -979,13 +914,12 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI): "schema": { "type": "object", "properties": { - "result": { - "type": "integer" - }, + "result": {"type": "integer"}, }, }, - } - }) + }, + }, + ) content = resp.choices[0].message.content assert content is not None @@ -998,13 +932,16 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI): async def test_extra_fields_allowed(client: openai.AsyncOpenAI): resp = await client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": "what is 1+1?", - "extra_field": "0", - }], # type: ignore + messages=[ + { + "role": "user", + "content": "what is 1+1?", + "extra_field": "0", + } + ], # type: ignore temperature=0, - seed=0) + seed=0, + ) content = resp.choices[0].message.content assert content is not None @@ -1014,18 +951,20 @@ async def test_extra_fields_allowed(client: openai.AsyncOpenAI): async def test_complex_message_content(client: openai.AsyncOpenAI): resp = await client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": - "user", - "content": [{ - "type": - "text", - "text": - "what is 1+1? please provide the result without any other text." - }] - }], + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "what is 1+1? please provide the result without any other text.", + } + ], + } + ], temperature=0, - seed=0) + seed=0, + ) content = resp.choices[0].message.content assert content == "2" @@ -1037,24 +976,27 @@ async def test_custom_role(client: openai.AsyncOpenAI): resp1 = await client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": "my-custom-role", - "content": "what is 1+1?", - }], # type: ignore + messages=[ + { + "role": "my-custom-role", + "content": "what is 1+1?", + } + ], # type: ignore temperature=0, - seed=0) + seed=0, + ) resp2 = await client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": "my-custom-role", - "content": [{ - "type": "text", - "text": "what is 1+1?" 
- }] - }], # type: ignore + messages=[ + { + "role": "my-custom-role", + "content": [{"type": "text", "text": "what is 1+1?"}], + } + ], # type: ignore temperature=0, - seed=0) + seed=0, + ) content1 = resp1.choices[0].message.content content2 = resp2.choices[0].message.content @@ -1063,22 +1005,24 @@ async def test_custom_role(client: openai.AsyncOpenAI): @pytest.mark.asyncio async def test_long_seed(client: openai.AsyncOpenAI): - for seed in [ - torch.iinfo(torch.long).min - 1, - torch.iinfo(torch.long).max + 1 - ]: + for seed in [torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).max + 1]: with pytest.raises(BadRequestError) as exc_info: await client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": "system", - "content": "You are a helpful assistant.", - }], + messages=[ + { + "role": "system", + "content": "You are a helpful assistant.", + } + ], temperature=0, - seed=seed) + seed=seed, + ) - assert ("greater_than_equal" in exc_info.value.message - or "less_than_equal" in exc_info.value.message) + assert ( + "greater_than_equal" in exc_info.value.message + or "less_than_equal" in exc_info.value.message + ) @pytest.mark.asyncio @@ -1089,15 +1033,11 @@ async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer): } data = { # model_name is avoided here. - "messages": [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "what is 1+1?" - }], - "max_tokens": - 5 + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "what is 1+1?"}, + ], + "max_tokens": 5, } response = requests.post(url, headers=headers, json=data) @@ -1122,10 +1062,7 @@ async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer): base_url=openai_api_base, ) messages = [ - { - "role": "user", - "content": "Hello, vLLM!" - }, + {"role": "user", "content": "Hello, vLLM!"}, ] response = client.chat.completions.create( model="", # empty string @@ -1135,15 +1072,11 @@ async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer): @pytest.mark.asyncio -async def test_invocations(server: RemoteOpenAIServer, - client: openai.AsyncOpenAI): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" 
- }] +async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] request_args = { "model": MODEL_NAME, @@ -1155,8 +1088,9 @@ async def test_invocations(server: RemoteOpenAIServer, chat_completion = await client.chat.completions.create(**request_args) - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) invocation_response.raise_for_status() chat_output = chat_completion.model_dump() diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py index de63f4ed218b..c9c42efd2849 100644 --- a/tests/entrypoints/openai/test_chat_echo.py +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -44,27 +44,26 @@ class TestCase(NamedTuple): "test_case", [ TestCase(model_name=MODEL_NAME, echo=True), - TestCase(model_name=MODEL_NAME, echo=False) + TestCase(model_name=MODEL_NAME, echo=False), ], ) async def test_chat_session_with_echo_and_continue_final_message( - client: openai.AsyncOpenAI, test_case: TestCase): + client: openai.AsyncOpenAI, test_case: TestCase +): saying: str = "Here is a common saying about apple. An apple a day, keeps" # test echo with continue_final_message parameter chat_completion = await client.chat.completions.create( model=test_case.model_name, - messages=[{ - "role": "user", - "content": "tell me a common saying" - }, { - "role": "assistant", - "content": saying - }], + messages=[ + {"role": "user", "content": "tell me a common saying"}, + {"role": "assistant", "content": saying}, + ], extra_body={ "echo": test_case.echo, "continue_final_message": True, - "add_generation_prompt": False - }) + "add_generation_prompt": False, + }, + ) assert chat_completion.id is not None assert len(chat_completion.choices) == 1 diff --git a/tests/entrypoints/openai/test_chat_logit_bias_validation.py b/tests/entrypoints/openai/test_chat_logit_bias_validation.py index e9d1a855294c..936875a25e0e 100644 --- a/tests/entrypoints/openai/test_chat_logit_bias_validation.py +++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py @@ -53,10 +53,7 @@ async def test_chat_logit_bias_valid(client): completion = await client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": "Testing valid logit bias" - }], + messages=[{"role": "user", "content": "Testing valid logit bias"}], max_tokens=5, logit_bias={str(valid_token_id): 1.0}, ) @@ -73,10 +70,7 @@ async def test_chat_logit_bias_invalid(client): with pytest.raises(openai.BadRequestError) as excinfo: await client.chat.completions.create( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": "Testing invalid logit bias" - }], + messages=[{"role": "user", "content": "Testing invalid logit bias"}], max_tokens=5, logit_bias={str(invalid_token_id): 1.0}, ) diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index 6e32887f5ed0..17e36444fe84 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -4,8 +4,7 @@ import pytest from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import (apply_hf_chat_template, - load_chat_template) +from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template from vllm.entrypoints.openai.protocol import 
ChatCompletionRequest from vllm.transformers_utils.tokenizer import get_tokenizer @@ -17,48 +16,54 @@ # Define models, templates, and their corresponding expected outputs MODEL_TEMPLATE_GENERATION_OUTPUT = [ - ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user + ( + "facebook/opt-125m", + chatml_jinja_path, + True, + False, + """<|im_start|>user Hello<|im_end|> <|im_start|>assistant Hi there!<|im_end|> <|im_start|>user What is the capital of<|im_end|> <|im_start|>assistant -"""), - ("facebook/opt-125m", chatml_jinja_path, False, False, """<|im_start|>user +""", + ), + ( + "facebook/opt-125m", + chatml_jinja_path, + False, + False, + """<|im_start|>user Hello<|im_end|> <|im_start|>assistant Hi there!<|im_end|> <|im_start|>user -What is the capital of"""), - ("facebook/opt-125m", chatml_jinja_path, False, True, """<|im_start|>user +What is the capital of""", + ), + ( + "facebook/opt-125m", + chatml_jinja_path, + False, + True, + """<|im_start|>user Hello<|im_end|> <|im_start|>assistant Hi there!<|im_end|> <|im_start|>user What is the capital of<|im_end|> <|im_start|>assistant -The capital of"""), +The capital of""", + ), ] TEST_MESSAGES = [ - { - 'role': 'user', - 'content': 'Hello' - }, - { - 'role': 'assistant', - 'content': 'Hi there!' - }, - { - 'role': 'user', - 'content': 'What is the capital of' - }, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + {"role": "user", "content": "What is the capital of"}, ] -ASSISTANT_MESSAGE_TO_CONTINUE = { - 'role': 'assistant', - 'content': 'The capital of' -} +ASSISTANT_MESSAGE_TO_CONTINUE = {"role": "assistant", "content": "The capital of"} def test_load_chat_template(): @@ -68,8 +73,11 @@ def test_load_chat_template(): # Test assertions assert template_content is not None # Hard coded value for template_chatml.jinja - assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501 + assert ( + template_content + == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %} +{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" + ) # noqa: E501 def test_no_load_chat_template_filelike(): @@ -91,9 +99,11 @@ def test_no_load_chat_template_literallike(): @pytest.mark.parametrize( "model,template,add_generation_prompt,continue_final_message,expected_output", - MODEL_TEMPLATE_GENERATION_OUTPUT) -def test_get_gen_prompt(model, template, add_generation_prompt, - continue_final_message, expected_output): + MODEL_TEMPLATE_GENERATION_OUTPUT, +) +def test_get_gen_prompt( + model, template, add_generation_prompt, continue_final_message, expected_output +): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") @@ -116,7 +126,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt, mock_request = ChatCompletionRequest( model=model, messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE] - if continue_final_message else TEST_MESSAGES, + if continue_final_message + else TEST_MESSAGES, add_generation_prompt=add_generation_prompt, 
continue_final_message=continue_final_message, ) @@ -135,4 +146,5 @@ def test_get_gen_prompt(model, template, add_generation_prompt, # Test assertion assert result == expected_output, ( f"The generated prompt does not match the expected output for " - f"model {model} and template {template}") + f"model {model} and template {template}" + ) diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index 03730b67283c..4f23eee46211 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -14,9 +14,14 @@ @pytest.fixture(scope="module") def server(): # noqa: F811 args = [ - "--max-model-len", "8192", "--enforce-eager", "--reasoning-parser", - "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser", - "hermes" + "--max-model-len", + "8192", + "--enforce-eager", + "--reasoning-parser", + "deepseek_r1", + "--enable-auto-tool-choice", + "--tool-call-parser", + "hermes", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -29,50 +34,44 @@ async def client(server): yield async_client -TOOLS = [{ - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": - "string", - "description": - "The city to find the weather for, e.g. 'San Francisco'" +TOOLS = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to find the weather for, e.g. 'San Francisco'", + }, + "state": { + "type": "string", + "description": "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, }, - "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the city is" - " in, e.g. 'CA' which would mean 'California'" - }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"] - } + "required": ["city", "state", "unit"], }, - "required": ["city", "state", "unit"] - } + }, } -}] - -MESSAGES = [{ - "role": "user", - "content": "Hi! How are you doing today?" -}, { - "role": "assistant", - "content": "I'm doing well! How can I help you?" -}, { - "role": - "user", - "content": - "Can you tell me what the temperate will be in Dallas, in fahrenheit?" -}] +] + +MESSAGES = [ + {"role": "user", "content": "Hi! How are you doing today?"}, + {"role": "assistant", "content": "I'm doing well! 
How can I help you?"}, + { + "role": "user", + "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?", + }, +] FUNC_NAME = "get_current_weather" FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}""" @@ -105,9 +104,7 @@ def extract_reasoning_and_calls(chunks: list): # test streaming @pytest.mark.asyncio -async def test_chat_streaming_of_tool_and_reasoning( - client: openai.AsyncOpenAI): - +async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI): stream = await client.chat.completions.create( model=MODEL_NAME, messages=MESSAGES, @@ -120,8 +117,7 @@ async def test_chat_streaming_of_tool_and_reasoning( async for chunk in stream: chunks.append(chunk) - reasoning_content, arguments, function_names = extract_reasoning_and_calls( - chunks) + reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks) assert len(reasoning_content) > 0 assert len(function_names) > 0 and function_names[0] == FUNC_NAME assert len(arguments) > 0 and arguments[0] == FUNC_ARGS @@ -130,7 +126,6 @@ async def test_chat_streaming_of_tool_and_reasoning( # test full generate @pytest.mark.asyncio async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI): - tool_calls = await client.chat.completions.create( model=MODEL_NAME, messages=MESSAGES, @@ -140,7 +135,5 @@ async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI): ) assert len(tool_calls.choices[0].message.reasoning_content) > 0 - assert tool_calls.choices[0].message.tool_calls[0].function.name \ - == FUNC_NAME - assert tool_calls.choices[0].message.tool_calls[0].function.arguments \ - == FUNC_ARGS + assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME + assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index 3c8ed955a65a..b248ffa23e6f 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -42,7 +42,8 @@ async def client(server): @pytest.mark.asyncio async def test_completion_stream_options_and_logprobs_with_long_prompts( - client: openai.AsyncOpenAI): + client: openai.AsyncOpenAI, +): # Test stream with long prompt prompt = "What is the capital of France?" * 400 @@ -64,8 +65,9 @@ async def test_completion_stream_options_and_logprobs_with_long_prompts( async for chunk in stream: assert chunk.usage.prompt_tokens >= 0 assert chunk.usage.completion_tokens >= 0 - assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + - chunk.usage.completion_tokens) + assert chunk.usage.total_tokens == ( + chunk.usage.prompt_tokens + chunk.usage.completion_tokens + ) if not finished: tokens_received += 1 assert chunk.choices[0].text @@ -79,15 +81,13 @@ async def test_completion_stream_options_and_logprobs_with_long_prompts( @pytest.mark.asyncio async def test_chat_completion_stream_options_and_logprobs_with_long_prompts( - client: openai.AsyncOpenAI): + client: openai.AsyncOpenAI, +): # Test stream with long prompt - messages = [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "What is the capital of France?" * 400 - }] + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?" 
* 400}, + ] stream = await client.chat.completions.create( model=MODEL_NAME, messages=messages, @@ -108,8 +108,9 @@ async def test_chat_completion_stream_options_and_logprobs_with_long_prompts( async for chunk in stream: assert chunk.usage.prompt_tokens >= 0 assert chunk.usage.completion_tokens >= 0 - assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + - chunk.usage.completion_tokens) + assert chunk.usage.total_tokens == ( + chunk.usage.prompt_tokens + chunk.usage.completion_tokens + ) if not finished: if chunk.choices[0].delta.content == "": diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index b2472658ca81..db2689188130 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -27,21 +27,16 @@ def server(): @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_single_input_classification(server: RemoteOpenAIServer, - model_name: str): +def test_single_input_classification(server: RemoteOpenAIServer, model_name: str): input_text = "This product was excellent and exceeded my expectations" classification_response = requests.post( server.url_for("classify"), - json={ - "model": model_name, - "input": input_text - }, + json={"model": model_name, "input": input_text}, ) classification_response.raise_for_status() - output = ClassificationResponse.model_validate( - classification_response.json()) + output = ClassificationResponse.model_validate(classification_response.json()) assert output.object == "list" assert output.model == MODEL_NAME @@ -51,8 +46,7 @@ def test_single_input_classification(server: RemoteOpenAIServer, @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_multiple_inputs_classification(server: RemoteOpenAIServer, - model_name: str): +def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str): input_texts = [ "The product arrived on time and works perfectly", "I'm very satisfied with my purchase, would buy again", @@ -64,13 +58,9 @@ def test_multiple_inputs_classification(server: RemoteOpenAIServer, classification_response = requests.post( server.url_for("classify"), - json={ - "model": model_name, - "input": input_texts - }, + json={"model": model_name, "input": input_texts}, ) - output = ClassificationResponse.model_validate( - classification_response.json()) + output = ClassificationResponse.model_validate(classification_response.json()) assert len(output.data) == len(input_texts) for i, item in enumerate(output.data): @@ -87,16 +77,11 @@ def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str): classification_response = requests.post( server.url_for("classify"), - json={ - "model": model_name, - "input": long_text, - "truncate_prompt_tokens": 5 - }, + json={"model": model_name, "input": long_text, "truncate_prompt_tokens": 5}, ) classification_response.raise_for_status() - output = ClassificationResponse.model_validate( - classification_response.json()) + output = ClassificationResponse.model_validate(classification_response.json()) assert len(output.data) == 1 assert output.data[0].index == 0 @@ -106,15 +91,12 @@ def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str): @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_invalid_truncate_prompt_tokens_error(server: RemoteOpenAIServer, - model_name: str): +def test_invalid_truncate_prompt_tokens_error( + server: RemoteOpenAIServer, model_name: str +): classification_response = requests.post( 
server.url_for("classify"), - json={ - "model": model_name, - "input": "test", - "truncate_prompt_tokens": 513 - }, + json={"model": model_name, "input": "test", "truncate_prompt_tokens": 513}, ) error = classification_response.json() @@ -127,10 +109,7 @@ def test_invalid_truncate_prompt_tokens_error(server: RemoteOpenAIServer, def test_empty_input_error(server: RemoteOpenAIServer, model_name: str): classification_response = requests.post( server.url_for("classify"), - json={ - "model": model_name, - "input": "" - }, + json={"model": model_name, "input": ""}, ) error = classification_response.json() @@ -139,18 +118,13 @@ def test_empty_input_error(server: RemoteOpenAIServer, model_name: str): @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_batch_classification_empty_list(server: RemoteOpenAIServer, - model_name: str): +def test_batch_classification_empty_list(server: RemoteOpenAIServer, model_name: str): classification_response = requests.post( server.url_for("classify"), - json={ - "model": model_name, - "input": [] - }, + json={"model": model_name, "input": []}, ) classification_response.raise_for_status() - output = ClassificationResponse.model_validate( - classification_response.json()) + output = ClassificationResponse.model_validate(classification_response.json()) assert output.object == "list" assert isinstance(output.data, list) @@ -161,15 +135,17 @@ def test_batch_classification_empty_list(server: RemoteOpenAIServer, async def test_invocations(server: RemoteOpenAIServer): request_args = { "model": MODEL_NAME, - "input": "This product was excellent and exceeded my expectations" + "input": "This product was excellent and exceeded my expectations", } - classification_response = requests.post(server.url_for("classify"), - json=request_args) + classification_response = requests.post( + server.url_for("classify"), json=request_args + ) classification_response.raise_for_status() - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) invocation_response.raise_for_status() classification_output = classification_response.json() @@ -177,7 +153,9 @@ async def test_invocations(server: RemoteOpenAIServer): assert classification_output.keys() == invocation_output.keys() for classification_data, invocation_data in zip( - classification_output["data"], invocation_output["data"]): + classification_output["data"], invocation_output["data"] + ): assert classification_data.keys() == invocation_data.keys() assert classification_data["probs"] == pytest.approx( - invocation_data["probs"], rel=0.01) + invocation_data["probs"], rel=0.01 + ) diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 504fd72aa4ae..c6e6bf0be1f8 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -5,8 +5,7 @@ import pytest -from vllm.entrypoints.openai.cli_args import (make_arg_parser, - validate_parsed_serve_args) +from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.utils import FlexibleArgumentParser @@ -15,7 +14,7 @@ LORA_MODULE = { "name": "module2", "path": "/path/to/module2", - "base_model_name": "llama" + "base_model_name": "llama", } CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja" assert CHATML_JINJA_PATH.exists() @@ -30,24 +29,26 @@ def 
serve_parser(): ### Tests for LoRA module parsing def test_valid_key_value_format(serve_parser): # Test old format: name=path - args = serve_parser.parse_args([ - '--lora-modules', - 'module1=/path/to/module1', - ]) - expected = [LoRAModulePath(name='module1', path='/path/to/module1')] + args = serve_parser.parse_args( + [ + "--lora-modules", + "module1=/path/to/module1", + ] + ) + expected = [LoRAModulePath(name="module1", path="/path/to/module1")] assert args.lora_modules == expected def test_valid_json_format(serve_parser): # Test valid JSON format input - args = serve_parser.parse_args([ - '--lora-modules', - json.dumps(LORA_MODULE), - ]) + args = serve_parser.parse_args( + [ + "--lora-modules", + json.dumps(LORA_MODULE), + ] + ) expected = [ - LoRAModulePath(name='module2', - path='/path/to/module2', - base_model_name='llama') + LoRAModulePath(name="module2", path="/path/to/module2", base_model_name="llama") ] assert args.lora_modules == expected @@ -55,47 +56,53 @@ def test_valid_json_format(serve_parser): def test_invalid_json_format(serve_parser): # Test invalid JSON format input, missing closing brace with pytest.raises(SystemExit): - serve_parser.parse_args([ - '--lora-modules', '{"name": "module3", "path": "/path/to/module3"' - ]) + serve_parser.parse_args( + ["--lora-modules", '{"name": "module3", "path": "/path/to/module3"'] + ) def test_invalid_type_error(serve_parser): # Test type error when values are not JSON or key=value with pytest.raises(SystemExit): - serve_parser.parse_args([ - '--lora-modules', - 'invalid_format' # This is not JSON or key=value format - ]) + serve_parser.parse_args( + [ + "--lora-modules", + "invalid_format", # This is not JSON or key=value format + ] + ) def test_invalid_json_field(serve_parser): # Test valid JSON format but missing required fields with pytest.raises(SystemExit): - serve_parser.parse_args([ - '--lora-modules', - '{"name": "module4"}' # Missing required 'path' field - ]) + serve_parser.parse_args( + [ + "--lora-modules", + '{"name": "module4"}', # Missing required 'path' field + ] + ) def test_empty_values(serve_parser): # Test when no LoRA modules are provided - args = serve_parser.parse_args(['--lora-modules', '']) + args = serve_parser.parse_args(["--lora-modules", ""]) assert args.lora_modules == [] def test_multiple_valid_inputs(serve_parser): # Test multiple valid inputs (both old and JSON format) - args = serve_parser.parse_args([ - '--lora-modules', - 'module1=/path/to/module1', - json.dumps(LORA_MODULE), - ]) + args = serve_parser.parse_args( + [ + "--lora-modules", + "module1=/path/to/module1", + json.dumps(LORA_MODULE), + ] + ) expected = [ - LoRAModulePath(name='module1', path='/path/to/module1'), - LoRAModulePath(name='module2', - path='/path/to/module2', - base_model_name='llama') + LoRAModulePath(name="module1", path="/path/to/module1"), + LoRAModulePath( + name="module2", path="/path/to/module2", base_model_name="llama" + ), ] assert args.lora_modules == expected @@ -111,40 +118,46 @@ def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser): def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser): """Ensure validation passes with tool choice enabled with a call parser""" - args = serve_parser.parse_args(args=[ - "--enable-auto-tool-choice", - "--tool-call-parser", - "mistral", - ]) + args = serve_parser.parse_args( + args=[ + "--enable-auto-tool-choice", + "--tool-call-parser", + "mistral", + ] + ) validate_parsed_serve_args(args) def 
test_enable_auto_choice_fails_with_enable_reasoning(serve_parser): """Ensure validation fails if reasoning is enabled with auto tool choice""" - args = serve_parser.parse_args(args=[ - "--enable-auto-tool-choice", - "--reasoning-parser", - "deepseek_r1", - ]) + args = serve_parser.parse_args( + args=[ + "--enable-auto-tool-choice", + "--reasoning-parser", + "deepseek_r1", + ] + ) with pytest.raises(TypeError): validate_parsed_serve_args(args) def test_passes_with_reasoning_parser(serve_parser): - """Ensure validation passes if reasoning is enabled + """Ensure validation passes if reasoning is enabled with a reasoning parser""" - args = serve_parser.parse_args(args=[ - "--reasoning-parser", - "deepseek_r1", - ]) + args = serve_parser.parse_args( + args=[ + "--reasoning-parser", + "deepseek_r1", + ] + ) validate_parsed_serve_args(args) def test_chat_template_validation_for_happy_paths(serve_parser): """Ensure validation passes if the chat template exists""" args = serve_parser.parse_args( - args=["--chat-template", - CHATML_JINJA_PATH.absolute().as_posix()]) + args=["--chat-template", CHATML_JINJA_PATH.absolute().as_posix()] + ) validate_parsed_serve_args(args) diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index df9586ee84de..23b032f67d8b 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -12,6 +12,7 @@ import pytest_asyncio import regex as re import requests + # downloading lora to test lora requests from huggingface_hub import snapshot_download from openai import BadRequestError @@ -47,8 +48,7 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files): tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # Copy tokenizer to adapter and add some unique tokens # 32000, 32001, 32002 - added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], - special_tokens=True) + added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], special_tokens=True) assert added == 3 tokenizer.save_pretrained(tmp_model_dir) yield tmp_model_dir @@ -61,8 +61,9 @@ def zephyr_pa_files(): @pytest.fixture(scope="module") -def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, - zephyr_pa_files): +def default_server_args( + zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files +): return [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -93,8 +94,7 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, ] -@pytest.fixture(scope="module", - params=["", "--disable-frontend-multiprocessing"]) +@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"]) def server(default_server_args, request): if request.param: default_server_args.append(request.param) @@ -112,16 +112,20 @@ async def client(server): @pytest.mark.parametrize( # first test base model, then test loras, then test prompt adapters "model_name,num_virtual_tokens", - [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0), - ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS), - ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)], + [ + (MODEL_NAME, 0), + ("zephyr-lora", 0), + ("zephyr-lora2", 0), + ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS), + ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS), + ], ) -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str, - num_virtual_tokens: int): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) +async def test_single_completion( + client: 
openai.AsyncOpenAI, model_name: str, num_virtual_tokens: int +): + completion = await client.completions.create( + model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0 + ) assert completion.id is not None assert completion.choices is not None and len(completion.choices) == 1 @@ -132,7 +136,8 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str, assert completion.usage == openai.types.CompletionUsage( completion_tokens=5, prompt_tokens=6 + num_virtual_tokens, - total_tokens=11 + num_virtual_tokens) + total_tokens=11 + num_virtual_tokens, + ) # test using token IDs completion = await client.completions.create( @@ -240,11 +245,12 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-pa"], ) -async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, - model_name: str): - +async def test_too_many_completion_logprobs( + client: openai.AsyncOpenAI, model_name: str +): with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs + (openai.BadRequestError, openai.APIError) + ): # test using token IDs await client.completions.create( model=model_name, prompt=[0, 0, 0, 0, 0], @@ -256,7 +262,8 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, ) ... with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs + (openai.BadRequestError, openai.APIError) + ): # test using token IDs stream = await client.completions.create( model=model_name, prompt=[0, 0, 0, 0, 0], @@ -281,13 +288,13 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1), - (MODEL_NAME, 0), - (MODEL_NAME, 1), - (MODEL_NAME, None)]) -async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, - model_name: str, - prompt_logprobs: Optional[int]): +@pytest.mark.parametrize( + "model_name, prompt_logprobs", + [(MODEL_NAME, -1), (MODEL_NAME, 0), (MODEL_NAME, 1), (MODEL_NAME, None)], +) +async def test_prompt_logprobs_completion( + client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int] +): params: dict = { "prompt": ["A robot may not injure another robot", "My name is"], "model": model_name, @@ -316,8 +323,7 @@ async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-pa"], ) -async def test_completion_streaming(client: openai.AsyncOpenAI, - model_name: str): +async def test_completion_streaming(client: openai.AsyncOpenAI, model_name: str): prompt = "What is an LLM?" 
single_completion = await client.completions.create( @@ -327,11 +333,9 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, temperature=0.0, ) single_output = single_completion.choices[0].text - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True) + stream = await client.completions.create( + model=model_name, prompt=prompt, max_tokens=5, temperature=0.0, stream=True + ) chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: @@ -360,11 +364,9 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): n = 3 max_tokens = 5 - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=max_tokens, - n=n, - stream=True) + stream = await client.completions.create( + model=model_name, prompt=prompt, max_tokens=max_tokens, n=n, stream=True + ) chunks: list[list[str]] = [[] for i in range(n)] finish_reason_count = 0 async for chunk in stream: @@ -384,53 +386,55 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-pa"], ) -async def test_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): +async def test_completion_stream_options(client: openai.AsyncOpenAI, model_name: str): prompt = "What is the capital of France?" # Test stream=True, stream_options= # {"include_usage": False, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - False, - }) + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": False, + "continuous_usage_stats": False, + }, + ) async for chunk in stream: assert chunk.usage is None # Test stream=True, stream_options= # {"include_usage": False, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - True, - }) + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": False, + "continuous_usage_stats": True, + }, + ) async for chunk in stream: assert chunk.usage is None # Test stream=True, stream_options= # {"include_usage": True, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - False, - }) + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": False, + }, + ) async for chunk in stream: if chunk.choices[0].finish_reason is None: assert chunk.usage is None @@ -441,57 +445,63 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI, assert final_chunk.usage.prompt_tokens > 0 assert final_chunk.usage.completion_tokens > 0 assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) + final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens + ) 
assert final_chunk.choices == [] # Test stream=True, stream_options= # {"include_usage": True, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - True, - }) + stream = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": True, + }, + ) async for chunk in stream: assert chunk.usage is not None assert chunk.usage.prompt_tokens > 0 assert chunk.usage.completion_tokens > 0 - assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + - chunk.usage.completion_tokens) + assert chunk.usage.total_tokens == ( + chunk.usage.prompt_tokens + chunk.usage.completion_tokens + ) if chunk.choices[0].finish_reason is not None: final_chunk = await stream.__anext__() assert final_chunk.usage is not None assert final_chunk.usage.prompt_tokens > 0 assert final_chunk.usage.completion_tokens > 0 assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) + final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens + ) assert final_chunk.choices == [] # Test stream=False, stream_options= # {"include_usage": None} with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}, + ) # Test stream=False, stream_options= # {"include_usage": True} with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}, + ) # Test stream=False, stream_options= # {"continuous_usage_stats": None} @@ -502,7 +512,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=False, - stream_options={"continuous_usage_stats": None}) + stream_options={"continuous_usage_stats": None}, + ) # Test stream=False, stream_options= # {"continuous_usage_stats": True} @@ -513,7 +524,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=False, - stream_options={"continuous_usage_stats": True}) + stream_options={"continuous_usage_stats": True}, + ) @pytest.mark.asyncio @@ -544,15 +556,19 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): extra_body=dict( # NOTE: this has to be true for n > 1 in vLLM, but # not necessary for official client. 
- use_beam_search=True), + use_beam_search=True + ), ) assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" + assert batch.choices[0].text != batch.choices[1].text, ( + "beam search should be different" + ) + assert batch.choices[0].text == batch.choices[2].text, ( + "two copies of the same prompt should be the same" + ) + assert batch.choices[1].text == batch.choices[3].text, ( + "two copies of the same prompt should be the same" + ) # test streaming batch = await client.completions.create( @@ -587,14 +603,18 @@ async def test_logits_bias(client: openai.AsyncOpenAI): seed=42, ) assert len(completion.choices[0].text) >= 5 - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), - add_special_tokens=False)["input_ids"] - assert all([ - response == expected - for response, expected in zip(response_tokens, expected_tokens) - ]) + response_tokens = tokenizer(completion.choices[0].text, add_special_tokens=False)[ + "input_ids" + ] + expected_tokens = tokenizer( + tokenizer.decode([token_id] * 5), add_special_tokens=False + )["input_ids"] + assert all( + [ + response == expected + for response, expected in zip(response_tokens, expected_tokens) + ] + ) # Test ban completion = await client.completions.create( @@ -603,16 +623,16 @@ async def test_logits_bias(client: openai.AsyncOpenAI): max_tokens=max_tokens, temperature=0.0, ) - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] + response_tokens = tokenizer(completion.choices[0].text, add_special_tokens=False)[ + "input_ids" + ] first_response = completion.choices[0].text completion = await client.completions.create( model=MODEL_NAME, prompt=prompt, max_tokens=max_tokens, temperature=0.0, - logit_bias={str(token): -100 - for token in response_tokens}, + logit_bias={str(token): -100 for token in response_tokens}, ) assert first_response != completion.choices[0].text @@ -641,9 +661,9 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI): @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_json_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema): +async def test_guided_json_completion( + client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_json_schema +): completion = await client.completions.create( model=MODEL_NAME, prompt=f"Give an example JSON for an employee profile " @@ -651,8 +671,11 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, n=3, temperature=1.0, max_tokens=500, - extra_body=dict(guided_json=sample_json_schema, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict( + guided_json=sample_json_schema, + guided_decoding_backend=guided_decoding_backend, + ), + ) assert completion.id is not None assert len(completion.choices) == 3 @@ -663,38 +686,42 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_regex_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_regex): +async 
def test_guided_regex_completion( + client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_regex +): completion = await client.completions.create( model=MODEL_NAME, prompt=f"Give an example IPv4 address with this regex: {sample_regex}", n=3, temperature=1.0, max_tokens=20, - extra_body=dict(guided_regex=sample_regex, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict( + guided_regex=sample_regex, guided_decoding_backend=guided_decoding_backend + ), + ) assert completion.id is not None assert len(completion.choices) == 3 for i in range(3): - assert re.fullmatch(sample_regex, - completion.choices[i].text) is not None + assert re.fullmatch(sample_regex, completion.choices[i].text) is not None @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_choice_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_guided_choice): +async def test_guided_choice_completion( + client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_guided_choice +): completion = await client.completions.create( model=MODEL_NAME, prompt="The best language for type-safe systems programming is ", n=2, temperature=1.0, max_tokens=10, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict( + guided_choice=sample_guided_choice, + guided_decoding_backend=guided_decoding_backend, + ), + ) assert completion.id is not None assert len(completion.choices) == 2 @@ -703,21 +730,23 @@ async def test_guided_choice_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_guided_grammar(client: openai.AsyncOpenAI, - sample_sql_statements): - +async def test_guided_grammar(client: openai.AsyncOpenAI, sample_sql_statements): completion = await client.completions.create( model=MODEL_NAME, - prompt=("Generate a sql state that select col_1 from " - "table_1 where it is equals to 1"), + prompt=( + "Generate a sql state that select col_1 from " + "table_1 where it is equals to 1" + ), temperature=1.0, max_tokens=500, - extra_body=dict(guided_grammar=sample_sql_statements)) + extra_body=dict(guided_grammar=sample_sql_statements), + ) content = completion.choices[0].text # use Lark to parse the output, and make sure it's a valid parse tree from lark import Lark + parser = Lark(sample_sql_statements) parser.parse(content) @@ -734,52 +763,56 @@ async def test_guided_grammar(client: openai.AsyncOpenAI, [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) @pytest.mark.parametrize("logprobs_arg", [1, 0]) -async def test_echo_logprob_completion(client: openai.AsyncOpenAI, - model_name: str, logprobs_arg: int): +async def test_echo_logprob_completion( + client: openai.AsyncOpenAI, model_name: str, logprobs_arg: int +): tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) # test using text and token IDs for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): - completion = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - echo=True, - logprobs=logprobs_arg) - - prompt_text = tokenizer.decode(prompt) if isinstance(prompt, - list) else prompt + completion = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + echo=True, + logprobs=logprobs_arg, + ) + + prompt_text = tokenizer.decode(prompt) if isinstance(prompt, list) else prompt assert re.search(r"^" + prompt_text, completion.choices[0].text) logprobs = completion.choices[0].logprobs assert 
logprobs is not None assert len(logprobs.text_offset) > 5 - assert (len(logprobs.token_logprobs) > 5 - and logprobs.token_logprobs[0] is None) - assert (len(logprobs.top_logprobs) > 5 - and logprobs.top_logprobs[0] is None) + assert len(logprobs.token_logprobs) > 5 and logprobs.token_logprobs[0] is None + assert len(logprobs.top_logprobs) > 5 and logprobs.top_logprobs[0] is None for top_logprobs in logprobs.top_logprobs[1:]: - assert max(logprobs_arg, - 1) <= len(top_logprobs) <= logprobs_arg + 1 + assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1 assert len(logprobs.tokens) > 5 @pytest.mark.asyncio @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema, sample_regex): +async def test_guided_decoding_type_error( + client: openai.AsyncOpenAI, + guided_decoding_backend: str, + sample_json_schema, + sample_regex, +): with pytest.raises(openai.BadRequestError): _ = await client.completions.create( model=MODEL_NAME, prompt="Give an example JSON that fits this schema: 42", - extra_body=dict(guided_json=42, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict( + guided_json=42, guided_decoding_backend=guided_decoding_backend + ), + ) with pytest.raises(openai.BadRequestError): _ = await client.completions.create( model=MODEL_NAME, prompt="Give an example string that fits this regex", - extra_body=dict(guided_regex=sample_regex, - guided_json=sample_json_schema)) + extra_body=dict(guided_regex=sample_regex, guided_json=sample_json_schema), + ) @pytest.mark.asyncio @@ -789,19 +822,21 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, (MODEL_NAME, False, False), (MODEL_NAME, False, True), (MODEL_NAME, True, False), - (MODEL_NAME, True, True) # should not raise BadRequestError error + (MODEL_NAME, True, True), # should not raise BadRequestError error ], ) -async def test_echo_stream_completion(client: openai.AsyncOpenAI, - model_name: str, stream: bool, - echo: bool): +async def test_echo_stream_completion( + client: openai.AsyncOpenAI, model_name: str, stream: bool, echo: bool +): saying: str = "Hello, my name is" - result = await client.completions.create(model=model_name, - prompt=saying, - max_tokens=10, - temperature=0.0, - echo=echo, - stream=stream) + result = await client.completions.create( + model=model_name, + prompt=saying, + max_tokens=10, + temperature=0.0, + echo=echo, + stream=stream, + ) stop_reason = "length" @@ -837,8 +872,7 @@ async def test_echo_stream_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_invocations(server: RemoteOpenAIServer, - client: openai.AsyncOpenAI): +async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI): request_args = { "model": MODEL_NAME, "prompt": "Hello, my name is", @@ -849,8 +883,9 @@ async def test_invocations(server: RemoteOpenAIServer, completion = await client.completions.create(**request_args) - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) invocation_response.raise_for_status() completion_output = completion.model_dump() diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index eca048d855b5..d6835db4c959 100644 --- 
a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -42,9 +42,13 @@ async def client(server): @pytest.mark.parametrize("stream", [True, False]) @pytest.mark.parametrize("tool_choice", ["auto", "required"]) @pytest.mark.parametrize("enable_thinking", [True, False]) -async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, - stream: bool, tool_choice: str, - enable_thinking: bool): +async def test_function_tool_use( + client: openai.AsyncOpenAI, + model_name: str, + stream: bool, + tool_choice: str, + enable_thinking: bool, +): tools = [ { "type": "function", @@ -56,26 +60,21 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, "properties": { "city": { "type": "string", - "description": - "The city to find the weather for, e.g. 'Vienna'", + "description": "The city to find the weather for, e.g. 'Vienna'", "default": "Vienna", }, "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", + "type": "string", + "description": "The country that the city is in, e.g. 'Austria'", }, "unit": { "type": "string", - "description": - "The unit to fetch the temperature in", + "description": "The unit to fetch the temperature in", "enum": ["celsius", "fahrenheit"], }, "options": { "$ref": "#/$defs/WeatherOptions", - "description": - "Optional parameters for weather query", + "description": "Optional parameters for weather query", }, }, "required": ["country", "unit"], @@ -95,8 +94,7 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, "include_forecast": { "type": "boolean", "default": False, - "description": - "Whether to include a 24-hour forecast", + "description": "Whether to include a 24-hour forecast", "title": "Include Forecast", }, "language": { @@ -122,26 +120,20 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, "properties": { "city": { "type": "string", - "description": - "The city to get the forecast for, e.g. 'Vienna'", + "description": "The city to get the forecast for, e.g. 'Vienna'", "default": "Vienna", }, "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", + "type": "string", + "description": "The country that the city is in, e.g. 'Austria'", }, "days": { - "type": - "integer", - "description": - "Number of days to get the forecast for (1-7)", + "type": "integer", + "description": "Number of days to get the forecast for (1-7)", }, "unit": { "type": "string", - "description": - "The unit to fetch the temperature in", + "description": "The unit to fetch the temperature in", "enum": ["celsius", "fahrenheit"], }, }, @@ -152,19 +144,11 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, ] messages = [ + {"role": "user", "content": "Hi! How are you doing today?"}, + {"role": "assistant", "content": "I'm doing well! How can I help you?"}, { "role": "user", - "content": "Hi! How are you doing today?" - }, - { - "role": "assistant", - "content": "I'm doing well! How can I help you?" 
- }, - { - "role": - "user", - "content": - "Can you tell me what the current weather is in Berlin and the "\ + "content": "Can you tell me what the current weather is in Berlin and the " "forecast for the next 5 days, in fahrenheit?", }, ] @@ -175,16 +159,11 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, model=model_name, tools=tools, tool_choice=tool_choice, - extra_body={ - "chat_template_kwargs": { - "enable_thinking": enable_thinking - } - }) + extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}}, + ) if enable_thinking: - assert chat_completion.choices[0].message.\ - reasoning_content is not None - assert chat_completion.choices[0].message.\ - reasoning_content != "" + assert chat_completion.choices[0].message.reasoning_content is not None + assert chat_completion.choices[0].message.reasoning_content != "" assert chat_completion.choices[0].message.tool_calls is not None assert len(chat_completion.choices[0].message.tool_calls) > 0 else: @@ -195,11 +174,8 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, tools=tools, tool_choice=tool_choice, stream=True, - extra_body={ - "chat_template_kwargs": { - "enable_thinking": enable_thinking - } - }) + extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}}, + ) output = [] async for chunk in output_stream: diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 00d3ffb61ee9..b2ae15cbf33b 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -10,6 +10,7 @@ import pytest import pytest_asyncio import torch + # downloading lora to test lora requests from huggingface_hub import snapshot_download from openai import BadRequestError @@ -37,8 +38,7 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files): tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # Copy tokenizer to adapter and add some unique tokens # 32000, 32001, 32002 - added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], - special_tokens=True) + added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], special_tokens=True) assert added == 3 tokenizer.save_pretrained(tmp_model_dir) yield tmp_model_dir @@ -65,8 +65,7 @@ def default_server_args( ] -@pytest.fixture(scope="module", - params=["", "--disable-frontend-multiprocessing"]) +@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"]) def server_with_prompt_embeds(default_server_args, request): if request.param: default_server_args.append(request.param) @@ -86,13 +85,14 @@ def create_dummy_embeds(num_tokens: int = 5) -> str: dummy_embeds = torch.randn(num_tokens, CONFIG.hidden_size) buffer = io.BytesIO() torch.save(dummy_embeds, buffer) - return base64.b64encode(buffer.getvalue()).decode('utf-8') + return base64.b64encode(buffer.getvalue()).decode("utf-8") @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_completions_with_prompt_embeds( - client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str): + client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str +): # Test case: Single prompt embeds input encoded_embeds = create_dummy_embeds() completion = await client_with_prompt_embeds.completions.create( @@ -100,7 +100,8 @@ async def test_completions_with_prompt_embeds( prompt="", # Add empty prompt as required parameter max_tokens=5, temperature=0.0, - 
extra_body={"prompt_embeds": encoded_embeds}) + extra_body={"prompt_embeds": encoded_embeds}, + ) assert len(completion.choices[0].text) >= 1 assert completion.choices[0].prompt_logprobs is None @@ -111,7 +112,8 @@ async def test_completions_with_prompt_embeds( prompt="", # Add empty prompt as required parameter max_tokens=5, temperature=0.0, - extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]}) + extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]}, + ) assert len(completion.choices) == 2 assert len(completion.choices[0].text) >= 1 assert len(completion.choices[1].text) >= 1 @@ -123,7 +125,8 @@ async def test_completions_with_prompt_embeds( prompt="", # Add empty prompt as required parameter max_tokens=5, temperature=0.0, - extra_body={"prompt_embeds": encoded_embeds}) + extra_body={"prompt_embeds": encoded_embeds}, + ) single_output = single_completion.choices[0].text stream = await client_with_prompt_embeds.completions.create( @@ -132,7 +135,8 @@ async def test_completions_with_prompt_embeds( max_tokens=5, temperature=0.0, stream=True, - extra_body={"prompt_embeds": encoded_embeds}) + extra_body={"prompt_embeds": encoded_embeds}, + ) chunks = [] finish_reason_count = 0 async for chunk in stream: @@ -152,12 +156,12 @@ async def test_completions_with_prompt_embeds( max_tokens=5, temperature=0.0, stream=True, - extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]}) + extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]}, + ) chunks_stream_embeds: list[list[str]] = [[], []] finish_reason_count = 0 async for chunk in stream: - chunks_stream_embeds[chunk.choices[0].index].append( - chunk.choices[0].text) + chunks_stream_embeds[chunk.choices[0].index].append(chunk.choices[0].text) if chunk.choices[0].finish_reason is not None: finish_reason_count += 1 assert finish_reason_count == 2 @@ -173,7 +177,8 @@ async def test_completions_with_prompt_embeds( prompt="This is a prompt", max_tokens=5, temperature=0.0, - extra_body={"prompt_embeds": encoded_embeds}) + extra_body={"prompt_embeds": encoded_embeds}, + ) assert len(completion.choices) == 2 completion_text_only = await client_with_prompt_embeds.completions.create( model=model_name, @@ -186,18 +191,18 @@ async def test_completions_with_prompt_embeds( prompt="", max_tokens=5, temperature=0.0, - extra_body={"prompt_embeds": encoded_embeds}) + extra_body={"prompt_embeds": encoded_embeds}, + ) # Embeddings responses should be handled first - assert completion_mixed.choices[0].text == completion_embeds_only.choices[ - 0].text - assert completion_mixed.choices[1].text == completion_text_only.choices[ - 0].text + assert completion_mixed.choices[0].text == completion_embeds_only.choices[0].text + assert completion_mixed.choices[1].text == completion_text_only.choices[0].text @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_completions_errors_with_prompt_embeds( - client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str): + client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str +): # Test error case: invalid prompt_embeds with pytest.raises(BadRequestError): await client_with_prompt_embeds.completions.create( @@ -205,15 +210,16 @@ async def test_completions_errors_with_prompt_embeds( model=model_name, max_tokens=5, temperature=0.0, - extra_body={"prompt_embeds": "invalid_base64"}) + extra_body={"prompt_embeds": "invalid_base64"}, + ) @pytest.mark.asyncio @pytest.mark.parametrize("logprobs_arg", [1, 0]) @pytest.mark.parametrize("model_name", [MODEL_NAME]) 
async def test_completions_with_logprobs_and_prompt_embeds( - client_with_prompt_embeds: openai.AsyncOpenAI, logprobs_arg: int, - model_name: str): + client_with_prompt_embeds: openai.AsyncOpenAI, logprobs_arg: int, model_name: str +): # Test case: Logprobs using prompt_embeds encoded_embeds = create_dummy_embeds() completion = await client_with_prompt_embeds.completions.create( @@ -223,7 +229,8 @@ async def test_completions_with_logprobs_and_prompt_embeds( temperature=0.0, echo=False, logprobs=logprobs_arg, - extra_body={"prompt_embeds": encoded_embeds}) + extra_body={"prompt_embeds": encoded_embeds}, + ) logprobs = completion.choices[0].logprobs assert logprobs is not None @@ -243,7 +250,8 @@ async def test_completions_with_logprobs_and_prompt_embeds( temperature=0.0, echo=False, logprobs=logprobs_arg, - extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]}) + extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]}, + ) assert len(completion.choices) == 2 for choice in completion.choices: @@ -253,6 +261,5 @@ async def test_completions_with_logprobs_and_prompt_embeds( assert len(logprobs.token_logprobs) == 5 assert len(logprobs.top_logprobs) == 5 for top_logprobs in logprobs.top_logprobs[1:]: - assert max(logprobs_arg, - 1) <= len(top_logprobs) <= logprobs_arg + 1 + assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1 assert len(logprobs.tokens) == 5 diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py index 1fc87c8b42a7..237e921016b6 100644 --- a/tests/entrypoints/openai/test_default_mm_loras.py +++ b/tests/entrypoints/openai/test_default_mm_loras.py @@ -16,8 +16,7 @@ # need a multimodal model for these tests. # Contains a modality specific lora alongside the base model -MULTIMODAL_MODEL_NAME = snapshot_download( - "microsoft/Phi-4-multimodal-instruct") +MULTIMODAL_MODEL_NAME = snapshot_download("microsoft/Phi-4-multimodal-instruct") AUDIO_LORA_PATH = os.path.join(MULTIMODAL_MODEL_NAME, "speech-lora") ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go." 
# noqa: E501 @@ -26,6 +25,7 @@ @pytest.fixture(scope="module") def monkeypatch_module(): from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() yield mpatch mpatch.undo() @@ -33,9 +33,8 @@ def monkeypatch_module(): @pytest.fixture(scope="module", params=[False, True]) def multimodal_server(request, monkeypatch_module): # noqa: F811 - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0") args = [ # use half precision for speed and memory savings in CI environment @@ -56,7 +55,7 @@ def multimodal_server(request, monkeypatch_module): # noqa: F811 "--gpu-memory-utilization", "0.8", "--default-mm-loras", - f"{{\"audio\": \"{AUDIO_LORA_PATH}\"}}", + f'{{"audio": "{AUDIO_LORA_PATH}"}}', ] with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, args) as remote_server: @@ -80,25 +79,25 @@ async def test_default_mm_lora_chat_completions( multi_modal_client: openai.AsyncOpenAI, audio_assets: AudioTestAssets, ): - messages = [{ - "role": - "user", - "content": [{ - "type": "text", - "text": "Can you transcribe this audio?", - }, { - "type": "audio_url", - "audio_url": { - "url": audio_assets[0].url - }, - }] - }] + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Can you transcribe this audio?", + }, + { + "type": "audio_url", + "audio_url": {"url": audio_assets[0].url}, + }, + ], + } + ] chat_completion = await multi_modal_client.chat.completions.create( - model=model_name, - messages=messages, - max_completion_tokens=128, - temperature=0.0) + model=model_name, messages=messages, max_completion_tokens=128, temperature=0.0 + ) assert len(chat_completion.choices) > 0 diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index f03c96b12179..eb9786b3677d 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -12,8 +12,7 @@ from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...models.language.pooling.embed_utils import ( - run_embedding_correctness_test) +from ...models.language.pooling.embed_utils import run_embedding_correctness_test from ...models.utils import check_embeddings_close from ...utils import RemoteOpenAIServer @@ -57,15 +56,13 @@ async def client(server): @pytest.fixture(scope="module") def hf_model(hf_runner): - with hf_runner(MODEL_NAME, dtype=DTYPE, - is_sentence_transformer=True) as hf_model: + with hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True) as hf_model: yield hf_model @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, - model_name: str): +async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str): input_texts = [ "The chef prepared a delicious meal.", ] @@ -77,7 +74,8 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, encoding_format="float", ) embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) + embedding_response.model_dump(mode="json") + ) assert embeddings.id is not None assert len(embeddings.data) == 1 @@ -97,7 +95,8 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, encoding_format="float", ) embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) + embedding_response.model_dump(mode="json") + ) assert 
embeddings.id is not None assert len(embeddings.data) == 1 @@ -109,12 +108,12 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, - model_name: str): +async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str): # test list[str] input_texts = [ - "The cat sat on the mat.", "A feline was resting on a rug.", - "Stars twinkle brightly in the night sky." + "The cat sat on the mat.", + "A feline was resting on a rug.", + "Stars twinkle brightly in the night sky.", ] embedding_response = await client.embeddings.create( model=model_name, @@ -122,7 +121,8 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, encoding_format="float", ) embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) + embedding_response.model_dump(mode="json") + ) assert embeddings.id is not None assert len(embeddings.data) == 3 @@ -135,15 +135,20 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, run_embedding_correctness_test(hf_model, input_texts, vllm_outputs) # test list[list[int]] - input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], - [25, 32, 64, 77]] + input_tokens = [ + [4, 5, 7, 9, 20], + [15, 29, 499], + [24, 24, 24, 24, 24], + [25, 32, 64, 77], + ] embedding_response = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", ) embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) + embedding_response.model_dump(mode="json") + ) assert embeddings.id is not None assert len(embeddings.data) == 4 @@ -155,19 +160,23 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_conversation_embedding(server: RemoteOpenAIServer, - client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "user", - "content": "The cat sat on the mat.", - }, { - "role": "assistant", - "content": "A feline was resting on a rug.", - }, { - "role": "user", - "content": "Stars twinkle brightly in the night sky.", - }] +async def test_conversation_embedding( + server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str +): + messages = [ + { + "role": "user", + "content": "The cat sat on the mat.", + }, + { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, + { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }, + ] chat_response = requests.post( server.url_for("v1/embeddings"), @@ -196,64 +205,66 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, extra_body={"add_special_tokens": False}, ) completion_embeddings = EmbeddingResponse.model_validate( - completion_response.model_dump(mode="json")) + completion_response.model_dump(mode="json") + ) assert chat_embeddings.id is not None assert completion_embeddings.id is not None assert chat_embeddings.created <= completion_embeddings.created - assert chat_embeddings.model_dump( - exclude={"id", "created"}) == (completion_embeddings.model_dump( - exclude={"id", "created"})) + assert chat_embeddings.model_dump(exclude={"id", "created"}) == ( + completion_embeddings.model_dump(exclude={"id", "created"}) + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_batch_base64_embedding(hf_model, client: 
openai.AsyncOpenAI, - model_name: str): +async def test_batch_base64_embedding( + hf_model, client: openai.AsyncOpenAI, model_name: str +): input_texts = [ "Hello my name is", - "The best thing about vLLM is that it supports many different models" + "The best thing about vLLM is that it supports many different models", ] - responses_float = await client.embeddings.create(input=input_texts, - model=model_name, - encoding_format="float") + responses_float = await client.embeddings.create( + input=input_texts, model=model_name, encoding_format="float" + ) float_data = [d.embedding for d in responses_float.data] run_embedding_correctness_test(hf_model, input_texts, float_data) - responses_base64 = await client.embeddings.create(input=input_texts, - model=model_name, - encoding_format="base64") + responses_base64 = await client.embeddings.create( + input=input_texts, model=model_name, encoding_format="base64" + ) base64_data = [] for data in responses_base64.data: base64_data.append( - np.frombuffer(base64.b64decode(data.embedding), - dtype="float32").tolist()) + np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist() + ) run_embedding_correctness_test(hf_model, input_texts, base64_data) # Default response is float32 decoded from base64 by OpenAI Client - responses_default = await client.embeddings.create(input=input_texts, - model=model_name) + responses_default = await client.embeddings.create( + input=input_texts, model=model_name + ) default_data = [d.embedding for d in responses_default.data] run_embedding_correctness_test(hf_model, input_texts, default_data) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_single_embedding_truncation(client: openai.AsyncOpenAI, - model_name: str): +async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] # test single embedding embedding_response = await client.embeddings.create( - model=model_name, - input=input_texts, - extra_body={"truncate_prompt_tokens": 10}) + model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10} + ) embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) + embedding_response.model_dump(mode="json") + ) assert embeddings.id is not None assert len(embeddings.data) == 1 @@ -263,15 +274,34 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 10 input_tokens = [ - 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, - 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 + 1, + 24428, + 289, + 18341, + 26165, + 285, + 19323, + 283, + 289, + 26789, + 3871, + 28728, + 9901, + 340, + 2229, + 385, + 340, + 315, + 28741, + 28804, + 2, ] embedding_response = await client.embeddings.create( - model=model_name, - input=input_tokens, - extra_body={"truncate_prompt_tokens": 10}) + model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10} + ) embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) + embedding_response.model_dump(mode="json") + ) assert embeddings.id is not None assert len(embeddings.data) == 1 @@ -283,8 +313,9 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, - model_name: str): +async def 
test_single_embedding_truncation_invalid( + client: openai.AsyncOpenAI, model_name: str +): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] @@ -293,15 +324,17 @@ async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, response = await client.embeddings.create( model=model_name, input=input_texts, - extra_body={"truncate_prompt_tokens": 8193}) + extra_body={"truncate_prompt_tokens": 8193}, + ) assert "error" in response.object - assert "truncate_prompt_tokens value is greater than max_model_len. "\ - "Please, select a smaller truncation size." in response.message + assert ( + "truncate_prompt_tokens value is greater than max_model_len. " + "Please, select a smaller truncation size." in response.message + ) @pytest.mark.asyncio -async def test_invocations(server: RemoteOpenAIServer, - client: openai.AsyncOpenAI): +async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI): input_texts = [ "The chef prepared a delicious meal.", ] @@ -314,35 +347,43 @@ async def test_invocations(server: RemoteOpenAIServer, completion_response = await client.embeddings.create(**request_args) - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) invocation_response.raise_for_status() completion_output = completion_response.model_dump() invocation_output = invocation_response.json() assert completion_output.keys() == invocation_output.keys() - for completion_data, invocation_data in zip(completion_output["data"], - invocation_output["data"]): + for completion_data, invocation_data in zip( + completion_output["data"], invocation_output["data"] + ): assert completion_data.keys() == invocation_data.keys() - check_embeddings_close(embeddings_0_lst=[completion_data["embedding"]], - embeddings_1_lst=[invocation_data["embedding"]], - name_0="completion", - name_1="invocation") + check_embeddings_close( + embeddings_0_lst=[completion_data["embedding"]], + embeddings_1_lst=[invocation_data["embedding"]], + name_0="completion", + name_1="invocation", + ) @pytest.mark.asyncio async def test_invocations_conversation(server: RemoteOpenAIServer): - messages = [{ - "role": "user", - "content": "The cat sat on the mat.", - }, { - "role": "assistant", - "content": "A feline was resting on a rug.", - }, { - "role": "user", - "content": "Stars twinkle brightly in the night sky.", - }] + messages = [ + { + "role": "user", + "content": "The cat sat on the mat.", + }, + { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, + { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }, + ] request_args = { "model": MODEL_NAME, @@ -350,22 +391,25 @@ async def test_invocations_conversation(server: RemoteOpenAIServer): "encoding_format": "float", } - chat_response = requests.post(server.url_for("v1/embeddings"), - json=request_args) + chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args) chat_response.raise_for_status() - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) invocation_response.raise_for_status() chat_output = chat_response.json() invocation_output = invocation_response.json() assert chat_output.keys() == invocation_output.keys() - for chat_data, invocation_data in zip(chat_output["data"], - 
invocation_output["data"]): + for chat_data, invocation_data in zip( + chat_output["data"], invocation_output["data"] + ): assert chat_data.keys() == invocation_data.keys() - check_embeddings_close(embeddings_0_lst=[chat_data["embedding"]], - embeddings_1_lst=[invocation_data["embedding"]], - name_0="chat", - name_1="invocation") + check_embeddings_close( + embeddings_0_lst=[chat_data["embedding"]], + embeddings_1_lst=[invocation_data["embedding"]], + name_0="chat", + name_1="invocation", + ) diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py index 08b797dc57ad..05c2b5dcc471 100644 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -12,16 +12,17 @@ from vllm.entrypoints.openai.protocol import EmbeddingResponse from ...conftest import HfRunner -from ...models.language.pooling.embed_utils import ( - run_embedding_correctness_test) +from ...models.language.pooling.embed_utils import run_embedding_correctness_test from ...models.utils import EmbedModelInfo from ...utils import RemoteOpenAIServer MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", - is_matryoshka=True, - matryoshka_dimensions=[256]), + EmbedModelInfo( + "Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + matryoshka_dimensions=[256], + ), ] input_texts = [ @@ -49,15 +50,14 @@ def server(model_info, dtype: str): dtype, "--enforce-eager", "--max-model-len", - "512" + "512", ] if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5": # Manually enable Matryoshka Embeddings - args.extend([ - "--trust_remote_code", "--hf_overrides", - '{"matryoshka_dimensions":[256]}' - ]) + args.extend( + ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}'] + ) with RemoteOpenAIServer(model_info.name, args) as remote_server: yield remote_server @@ -65,14 +65,16 @@ def server(model_info, dtype: str): @pytest.fixture(scope="module") def hf_model(hf_runner, model_info, dtype: str): - with hf_runner(model_info.name, dtype=dtype, - is_sentence_transformer=True) as hf_model: + with hf_runner( + model_info.name, dtype=dtype, is_sentence_transformer=True + ) as hf_model: yield hf_model @pytest.mark.asyncio -async def test_matryoshka(model_info: EmbedModelInfo, - server: RemoteOpenAIServer, hf_model: HfRunner): +async def test_matryoshka( + model_info: EmbedModelInfo, server: RemoteOpenAIServer, hf_model: HfRunner +): client = server.get_async_client() async def make_request_and_correctness_test(dimensions): @@ -85,7 +87,8 @@ async def make_request_and_correctness_test(dimensions): encoding_format="float", ) embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) + embedding_response.model_dump(mode="json") + ) assert embeddings.id is not None assert len(embeddings.data) == 3 @@ -98,8 +101,7 @@ async def make_request_and_correctness_test(dimensions): assert len(embeddings.data[0].embedding) == dimensions vllm_outputs = [d.embedding for d in embeddings.data] - run_embedding_correctness_test(hf_model, prompts, vllm_outputs, - dimensions) + run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions) if model_info.is_matryoshka: valid_dimensions: list[Optional[int]] = [None] diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py index 9c2aef23e877..c68226409550 100644 --- 
a/tests/entrypoints/openai/test_encoder_decoder.py +++ b/tests/entrypoints/openai/test_encoder_decoder.py @@ -31,10 +31,9 @@ async def client(server): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) + completion = await client.completions.create( + model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0 + ) assert completion.id is not None assert completion.choices is not None and len(completion.choices) == 1 @@ -43,7 +42,8 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): assert len(choice.text) >= 5 assert choice.finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=2, total_tokens=7) + completion_tokens=5, prompt_tokens=2, total_tokens=7 + ) # test using token IDs completion = await client.completions.create( diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index bcdeaaacedea..5f54a881387b 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -9,6 +9,7 @@ import openai # use the official client for correctness check import pytest import pytest_asyncio + # downloading lora to test lora requests from huggingface_hub import snapshot_download @@ -23,26 +24,18 @@ BADREQUEST_CASES = [ ( "test_rank", - { - "r": 1024 - }, + {"r": 1024}, "is greater than max_lora_rank", ), ( "test_bias", - { - "bias": "all" - }, + {"bias": "all"}, "Adapter bias cannot be used without bias_enabled", ), - ("test_dora", { - "use_dora": True - }, "does not yet support DoRA"), + ("test_dora", {"use_dora": True}, "does not yet support DoRA"), ( "test_modules_to_save", - { - "modules_to_save": ["lm_head"] - }, + {"modules_to_save": ["lm_head"]}, "only supports modules_to_save being None", ), ] @@ -56,29 +49,28 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def monkeypatch_module(): from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() yield mpatch mpatch.undo() @pytest.fixture(scope="module", params=[False, True]) -def server_with_lora_modules_json(request, monkeypatch_module, - zephyr_lora_files): - +def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files): use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0") # Define the json format LoRA module configurations lora_module_1 = { "name": "zephyr-lora", "path": zephyr_lora_files, - "base_model_name": MODEL_NAME + "base_model_name": MODEL_NAME, } lora_module_2 = { "name": "zephyr-lora2", "path": zephyr_lora_files, - "base_model_name": MODEL_NAME + "base_model_name": MODEL_NAME, } args = [ @@ -110,14 +102,12 @@ def server_with_lora_modules_json(request, monkeypatch_module, @pytest_asyncio.fixture async def client(server_with_lora_modules_json): - async with server_with_lora_modules_json.get_async_client( - ) as async_client: + async with server_with_lora_modules_json.get_async_client() as async_client: yield async_client @pytest.mark.asyncio -async def test_static_lora_lineage(client: openai.AsyncOpenAI, - zephyr_lora_files): +async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files): models = await client.models.list() models = 
models.data served_model = models[0] @@ -125,23 +115,19 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI, assert served_model.id == MODEL_NAME assert served_model.root == MODEL_NAME assert served_model.parent is None - assert all(lora_model.root == zephyr_lora_files - for lora_model in lora_models) + assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models) assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) assert lora_models[0].id == "zephyr-lora" assert lora_models[1].id == "zephyr-lora2" @pytest.mark.asyncio -async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, - zephyr_lora_files): - - response = await client.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": "zephyr-lora-3", - "lora_path": zephyr_lora_files - }) +async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files): + response = await client.post( + "load_lora_adapter", + cast_to=str, + body={"lora_name": "zephyr-lora-3", "lora_path": zephyr_lora_files}, + ) # Ensure adapter loads before querying /models assert "success" in response @@ -156,37 +142,37 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI): with pytest.raises(openai.NotFoundError): - await client.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": "notfound", - "lora_path": "/not/an/adapter" - }) + await client.post( + "load_lora_adapter", + cast_to=str, + body={"lora_name": "notfound", "lora_path": "/not/an/adapter"}, + ) @pytest.mark.asyncio -async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, - tmp_path): +async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path): invalid_files = tmp_path / "invalid_files" invalid_files.mkdir() (invalid_files / "adapter_config.json").write_text("this is not json") with pytest.raises(openai.BadRequestError): - await client.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": "invalid-json", - "lora_path": str(invalid_files) - }) + await client.post( + "load_lora_adapter", + cast_to=str, + body={"lora_name": "invalid-json", "lora_path": str(invalid_files)}, + ) @pytest.mark.asyncio -@pytest.mark.parametrize("test_name,config_change,expected_error", - BADREQUEST_CASES) -async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path, - zephyr_lora_files, test_name: str, - config_change: dict, - expected_error: str): +@pytest.mark.parametrize("test_name,config_change,expected_error", BADREQUEST_CASES) +async def test_dynamic_lora_badrequests( + client: openai.AsyncOpenAI, + tmp_path, + zephyr_lora_files, + test_name: str, + config_change: dict, + expected_error: str, +): # Create test directory test_dir = tmp_path / test_name @@ -206,29 +192,28 @@ async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path, # Test loading the adapter with pytest.raises(openai.BadRequestError, match=expected_error): - await client.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": test_name, - "lora_path": str(test_dir) - }) + await client.post( + "load_lora_adapter", + cast_to=str, + body={"lora_name": test_name, "lora_path": str(test_dir)}, + ) @pytest.mark.asyncio -async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path, - zephyr_lora_files): - """Validate that many loras can be dynamically registered and inferenced +async def test_multiple_lora_adapters( + client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files +): 
+ """Validate that many loras can be dynamically registered and inferenced with concurrently""" # This test file configures the server with --max-cpu-loras=2 and this test # will concurrently load 10 adapters, so it should flex the LRU cache async def load_and_run_adapter(adapter_name: str): - await client.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": adapter_name, - "lora_path": str(zephyr_lora_files) - }) + await client.post( + "load_lora_adapter", + cast_to=str, + body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)}, + ) for _ in range(3): await client.completions.create( model=adapter_name, @@ -238,8 +223,7 @@ async def load_and_run_adapter(adapter_name: str): lora_tasks = [] for i in range(10): - lora_tasks.append( - asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))) + lora_tasks.append(asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))) results, _ = await asyncio.wait(lora_tasks) @@ -249,8 +233,8 @@ async def load_and_run_adapter(adapter_name: str): @pytest.mark.asyncio async def test_loading_invalid_adapters_does_not_break_others( - client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files): - + client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files +): invalid_files = tmp_path / "invalid_files" invalid_files.mkdir() (invalid_files / "adapter_config.json").write_text("this is not json") @@ -281,20 +265,18 @@ async def run_good_requests(client): # Run a bunch of bad adapter loads for _ in range(25): with suppress(openai.NotFoundError): - await client.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": "notfound", - "lora_path": "/not/an/adapter" - }) + await client.post( + "load_lora_adapter", + cast_to=str, + body={"lora_name": "notfound", "lora_path": "/not/an/adapter"}, + ) for _ in range(25): with suppress(openai.BadRequestError): - await client.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": "invalid", - "lora_path": str(invalid_files) - }) + await client.post( + "load_lora_adapter", + cast_to=str, + body={"lora_name": "invalid", "lora_path": str(invalid_files)}, + ) # Ensure all the running requests with lora adapters succeeded stop_good_requests_event.set() @@ -303,12 +285,11 @@ async def run_good_requests(client): assert not isinstance(r, Exception), f"Got exception {r}" # Ensure we can load another adapter and run it - await client.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": "valid", - "lora_path": zephyr_lora_files - }) + await client.post( + "load_lora_adapter", + cast_to=str, + body={"lora_name": "valid", "lora_path": zephyr_lora_files}, + ) await client.completions.create( model="valid", prompt=["Hello there", "Foo bar bazz buzz"], @@ -325,12 +306,11 @@ async def test_beam_search_with_lora_adapters( """Validate that async beam search can be used with lora.""" async def load_and_run_adapter(adapter_name: str): - await client.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": adapter_name, - "lora_path": str(zephyr_lora_files) - }) + await client.post( + "load_lora_adapter", + cast_to=str, + body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)}, + ) for _ in range(3): await client.completions.create( model=adapter_name, @@ -341,8 +321,7 @@ async def load_and_run_adapter(adapter_name: str): lora_tasks = [] for i in range(3): - lora_tasks.append( - asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))) + lora_tasks.append(asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))) results, _ = await asyncio.wait(lora_tasks) diff --git 
a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index d4afdf7751c8..6b91552c4565 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -13,8 +13,7 @@ from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion -from vllm.entrypoints.openai.serving_models import (BaseModelPath, - OpenAIServingModels) +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.lora.request import LoRARequest from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry from vllm.transformers_utils.tokenizer import get_tokenizer @@ -33,14 +32,14 @@ class MockHFConfig: @dataclass class MockModelConfig: """Minimal mock ModelConfig for testing.""" + model: str = MODEL_NAME tokenizer: str = MODEL_NAME trust_remote_code: bool = False tokenizer_mode: str = "auto" max_model_len: int = 100 tokenizer_revision: Optional[str] = None - multimodal_config: MultiModalConfig = field( - default_factory=MultiModalConfig) + multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig) logits_processor_pattern: Optional[str] = None diff_sampling_param: Optional[dict] = None @@ -53,17 +52,21 @@ def get_diff_sampling_param(self): class MockLoRAResolver(LoRAResolver): - - async def resolve_lora(self, base_model_name: str, - lora_name: str) -> Optional[LoRARequest]: + async def resolve_lora( + self, base_model_name: str, lora_name: str + ) -> Optional[LoRARequest]: if lora_name == "test-lora": - return LoRARequest(lora_name="test-lora", - lora_int_id=1, - lora_local_path="/fake/path/test-lora") + return LoRARequest( + lora_name="test-lora", + lora_int_id=1, + lora_local_path="/fake/path/test-lora", + ) elif lora_name == "invalid-lora": - return LoRARequest(lora_name="invalid-lora", - lora_int_id=2, - lora_local_path="/fake/path/invalid-lora") + return LoRARequest( + lora_name="invalid-lora", + lora_int_id=2, + lora_local_path="/fake/path/invalid-lora", + ) return None @@ -92,29 +95,28 @@ def mock_add_lora_side_effect(lora_request: LoRARequest): return elif lora_request.lora_name == "invalid-lora": # Simulate failure during addition (e.g. 
invalid format) - raise ValueError(f"Simulated failure adding LoRA: " - f"{lora_request.lora_name}") + raise ValueError(f"Simulated failure adding LoRA: {lora_request.lora_name}") mock_engine.add_lora.side_effect = mock_add_lora_side_effect mock_engine.generate.reset_mock() mock_engine.add_lora.reset_mock() mock_model_config = MockModelConfig() - models = OpenAIServingModels(engine_client=mock_engine, - base_model_paths=BASE_MODEL_PATHS, - model_config=mock_model_config) + models = OpenAIServingModels( + engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config, + ) - serving_completion = OpenAIServingCompletion(mock_engine, - mock_model_config, - models, - request_logger=None) + serving_completion = OpenAIServingCompletion( + mock_engine, mock_model_config, models, request_logger=None + ) return mock_engine, serving_completion @pytest.mark.asyncio -async def test_serving_completion_with_lora_resolver(mock_serving_setup, - monkeypatch): +async def test_serving_completion_with_lora_resolver(mock_serving_setup, monkeypatch): monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") mock_engine, serving_completion = mock_serving_setup @@ -136,14 +138,13 @@ async def test_serving_completion_with_lora_resolver(mock_serving_setup, assert called_lora_request.lora_name == lora_model_name mock_engine.generate.assert_called_once() - called_lora_request = mock_engine.generate.call_args[1]['lora_request'] + called_lora_request = mock_engine.generate.call_args[1]["lora_request"] assert isinstance(called_lora_request, LoRARequest) assert called_lora_request.lora_name == lora_model_name @pytest.mark.asyncio -async def test_serving_completion_resolver_not_found(mock_serving_setup, - monkeypatch): +async def test_serving_completion_resolver_not_found(mock_serving_setup, monkeypatch): monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") mock_engine, serving_completion = mock_serving_setup @@ -166,7 +167,8 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup, @pytest.mark.asyncio async def test_serving_completion_resolver_add_lora_fails( - mock_serving_setup, monkeypatch): + mock_serving_setup, monkeypatch +): monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") mock_engine, serving_completion = mock_serving_setup diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 2d7b845736b8..c47065df4c34 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -54,19 +54,22 @@ def default_server_args(): ] -@pytest.fixture(scope="module", - params=[ - "", - "--enable-chunked-prefill", - "--disable-frontend-multiprocessing", - f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}", - ]) +@pytest.fixture( + scope="module", + params=[ + "", + "--enable-chunked-prefill", + "--disable-frontend-multiprocessing", + f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}", + ], +) def server(use_v1, default_server_args, request): if request.param: default_server_args.append(request.param) - env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0') - with RemoteOpenAIServer(MODEL_NAME, default_server_args, - env_dict=env_dict) as remote_server: + env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0") + with RemoteOpenAIServer( + MODEL_NAME, default_server_args, env_dict=env_dict + ) as remote_server: yield remote_server @@ -87,30 +90,36 @@ async def client(server): # {metric_family: [(suffix, expected_value)]} EXPECTED_VALUES = { 
"vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)], - "vllm:time_per_output_token_seconds": - [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))], + "vllm:time_per_output_token_seconds": [ + ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1)) + ], "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)], "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)], "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)], "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)], "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)], - "vllm:request_prompt_tokens": - [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST), - ("_count", _NUM_REQUESTS)], - "vllm:request_generation_tokens": - [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), - ("_count", _NUM_REQUESTS)], + "vllm:request_prompt_tokens": [ + ("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST), + ("_count", _NUM_REQUESTS), + ], + "vllm:request_generation_tokens": [ + ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), + ("_count", _NUM_REQUESTS), + ], "vllm:request_params_n": [("_count", _NUM_REQUESTS)], "vllm:request_params_max_tokens": [ ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), - ("_count", _NUM_REQUESTS) + ("_count", _NUM_REQUESTS), + ], + "vllm:iteration_tokens_total": [ + ( + "_sum", + _NUM_REQUESTS + * (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST), + ), + ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), ], - "vllm:iteration_tokens_total": - [("_sum", _NUM_REQUESTS * - (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST)), - ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)], - "vllm:prompt_tokens": [("_total", - _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], + "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], "vllm:generation_tokens": [ ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST) ], @@ -119,14 +128,16 @@ async def client(server): @pytest.mark.asyncio -async def test_metrics_counts(server: RemoteOpenAIServer, - client: openai.AsyncClient, use_v1: bool): +async def test_metrics_counts( + server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool +): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. 
await client.completions.create( model=MODEL_NAME, prompt=_TOKENIZED_PROMPT, - max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST) + max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST, + ) response = requests.get(server.url_for("metrics")) print(response.text) @@ -134,9 +145,10 @@ async def test_metrics_counts(server: RemoteOpenAIServer, # Loop over all expected metric_families for metric_family, suffix_values_list in EXPECTED_VALUES.items(): - if ((use_v1 and metric_family not in EXPECTED_METRICS_V1) - or (not server.show_hidden_metrics - and metric_family in HIDDEN_DEPRECATED_METRICS)): + if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or ( + not server.show_hidden_metrics + and metric_family in HIDDEN_DEPRECATED_METRICS + ): continue found_metric = False @@ -160,14 +172,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer, assert sample.value == expected_value, ( f"{metric_name_w_suffix} expected value of " f"{expected_value} did not match found value " - f"{sample.value}") + f"{sample.value}" + ) break assert found_suffix, ( f"Did not find {metric_name_w_suffix} in prom endpoint" ) break - assert found_metric, (f"Did not find {metric_family} in prom endpoint") + assert found_metric, f"Did not find {metric_family} in prom endpoint" EXPECTED_METRICS = [ @@ -277,20 +290,19 @@ async def test_metrics_counts(server: RemoteOpenAIServer, @pytest.mark.asyncio -async def test_metrics_exist(server: RemoteOpenAIServer, - client: openai.AsyncClient, use_v1: bool): +async def test_metrics_exist( + server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool +): # sending a request triggers the metrics to be logged. - await client.completions.create(model=MODEL_NAME, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) + await client.completions.create( + model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0 + ) response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK - for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): - if (not server.show_hidden_metrics - and metric not in HIDDEN_DEPRECATED_METRICS): + for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS: + if not server.show_hidden_metrics and metric not in HIDDEN_DEPRECATED_METRICS: assert metric in response.text @@ -303,27 +315,30 @@ def test_metrics_exist_run_batch(use_v1: bool): port = "8001" server_url = f"http://{base_url}:{port}" - with tempfile.NamedTemporaryFile( - "w") as input_file, tempfile.NamedTemporaryFile( - "r") as output_file: + with ( + tempfile.NamedTemporaryFile("w") as input_file, + tempfile.NamedTemporaryFile("r") as output_file, + ): input_file.write(input_batch) input_file.flush() - proc = subprocess.Popen([ - sys.executable, - "-m", - "vllm.entrypoints.openai.run_batch", - "-i", - input_file.name, - "-o", - output_file.name, - "--model", - "intfloat/multilingual-e5-small", - "--enable-metrics", - "--url", - base_url, - "--port", - port, - ], ) + proc = subprocess.Popen( + [ + sys.executable, + "-m", + "vllm.entrypoints.openai.run_batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "intfloat/multilingual-e5-small", + "--enable-metrics", + "--url", + base_url, + "--port", + port, + ], + ) def is_server_up(url): try: diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index 1980daa80db9..9444a8a677b0 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -4,6 +4,7 @@ import openai # use the 
official client for correctness check import pytest import pytest_asyncio + # downloading lora to test lora requests from huggingface_hub import snapshot_download @@ -61,7 +62,6 @@ async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files): lora_models = models[1:] assert served_model.id == MODEL_NAME assert served_model.root == MODEL_NAME - assert all(lora_model.root == zephyr_lora_files - for lora_model in lora_models) + assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models) assert lora_models[0].id == "zephyr-lora" assert lora_models[1].id == "zephyr-lora2" diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py index f0ce50debe49..ba463be1d5cd 100644 --- a/tests/entrypoints/openai/test_oot_registration.py +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -25,13 +25,10 @@ def run_and_test_dummy_opt_api_server(model, tp=1): client = server.get_client() completion = client.chat.completions.create( model=model, - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "Hello!" - }], + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"}, + ], temperature=0, ) generated_text = completion.choices[0].message.content diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 580bf34f20c4..c715e9246792 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -70,11 +70,15 @@ def no_file_type(case: schemathesis.models.Case): -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ http://localhost:8000/tokenize """ # noqa: E501 - if (op.method.lower() == "post" and op.path == "/tokenize" - and hasattr(case, "body") and isinstance(case.body, dict) - and "messages" in case.body - and isinstance(case.body["messages"], list) - and len(case.body["messages"]) > 0): + if ( + op.method.lower() == "post" + and op.path == "/tokenize" + and hasattr(case, "body") + and isinstance(case.body, dict) + and "messages" in case.body + and isinstance(case.body["messages"], list) + and len(case.body["messages"]) > 0 + ): for message in case.body["messages"]: if not isinstance(message, dict): continue @@ -102,9 +106,8 @@ def test_openapi_stateless(case: schemathesis.Case): timeout = { # requires a longer timeout - ("POST", "/v1/chat/completions"): - LONG_TIMEOUT_SECONDS, + ("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS, }.get(key, DEFAULT_TIMEOUT_SECONDS) - #No need to verify SSL certificate for localhost + # No need to verify SSL certificate for localhost case.call_and_validate(verify=False, timeout=timeout) diff --git a/tests/entrypoints/openai/test_optional_middleware.py b/tests/entrypoints/openai/test_optional_middleware.py index 882fa0886ce3..0361cd182f27 100644 --- a/tests/entrypoints/openai/test_optional_middleware.py +++ b/tests/entrypoints/openai/test_optional_middleware.py @@ -37,7 +37,7 @@ def server(request: pytest.FixtureRequest): "--enforce-eager", "--max-num-seqs", "2", - *passed_params + *passed_params, ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @@ -73,8 +73,9 @@ async def test_missing_api_token(server: RemoteOpenAIServer): ) @pytest.mark.asyncio async def test_passed_api_token(server: RemoteOpenAIServer): - response = requests.get(server.url_for("v1/models"), - headers={"Authorization": 
"Bearer test"}) + response = requests.get( + server.url_for("v1/models"), headers={"Authorization": "Bearer test"} + ) assert response.status_code == HTTPStatus.OK @@ -110,7 +111,8 @@ async def test_enable_request_id_header(server: RemoteOpenAIServer): ) @pytest.mark.asyncio async def test_custom_request_id_header(server: RemoteOpenAIServer): - response = requests.get(server.url_for("health"), - headers={"X-Request-Id": "Custom"}) + response = requests.get( + server.url_for("health"), headers={"X-Request-Id": "Custom"} + ) assert "X-Request-Id" in response.headers assert response.headers.get("X-Request-Id") == "Custom" diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py index 02165ee6d58e..2a10be084b92 100644 --- a/tests/entrypoints/openai/test_pooling.py +++ b/tests/entrypoints/openai/test_pooling.py @@ -47,11 +47,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str): # test single pooling response = requests.post( server.url_for("pooling"), - json={ - "model": model_name, - "input": input_texts, - "encoding_format": "float" - }, + json={"model": model_name, "input": input_texts, "encoding_format": "float"}, ) response.raise_for_status() poolings = PoolingResponse.model_validate(response.json()) @@ -67,11 +63,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str): input_tokens = [1, 1, 1, 1, 1] response = requests.post( server.url_for("pooling"), - json={ - "model": model_name, - "input": input_tokens, - "encoding_format": "float" - }, + json={"model": model_name, "input": input_tokens, "encoding_format": "float"}, ) response.raise_for_status() poolings = PoolingResponse.model_validate(response.json()) @@ -89,16 +81,13 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str): async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): # test list[str] input_texts = [ - "The cat sat on the mat.", "A feline was resting on a rug.", - "Stars twinkle brightly in the night sky." 
+ "The cat sat on the mat.", + "A feline was resting on a rug.", + "Stars twinkle brightly in the night sky.", ] response = requests.post( server.url_for("pooling"), - json={ - "model": model_name, - "input": input_texts, - "encoding_format": "float" - }, + json={"model": model_name, "input": input_texts, "encoding_format": "float"}, ) response.raise_for_status() poolings = PoolingResponse.model_validate(response.json()) @@ -111,15 +100,15 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): assert poolings.usage.total_tokens == 29 # test list[list[int]] - input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], - [25, 32, 64, 77]] + input_tokens = [ + [4, 5, 7, 9, 20], + [15, 29, 499], + [24, 24, 24, 24, 24], + [25, 32, 64, 77], + ] response = requests.post( server.url_for("pooling"), - json={ - "model": model_name, - "input": input_tokens, - "encoding_format": "float" - }, + json={"model": model_name, "input": input_tokens, "encoding_format": "float"}, ) response.raise_for_status() poolings = PoolingResponse.model_validate(response.json()) @@ -134,18 +123,21 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_conversation_pooling(server: RemoteOpenAIServer, - model_name: str): - messages = [{ - "role": "user", - "content": "The cat sat on the mat.", - }, { - "role": "assistant", - "content": "A feline was resting on a rug.", - }, { - "role": "user", - "content": "Stars twinkle brightly in the night sky.", - }] +async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str): + messages = [ + { + "role": "user", + "content": "The cat sat on the mat.", + }, + { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, + { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }, + ] chat_response = requests.post( server.url_for("pooling"), @@ -181,24 +173,22 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, }, ) completions_response.raise_for_status() - completion_poolings = PoolingResponse.model_validate( - completions_response.json()) + completion_poolings = PoolingResponse.model_validate(completions_response.json()) assert chat_poolings.id is not None assert completion_poolings.id is not None assert chat_poolings.created <= completion_poolings.created - assert chat_poolings.model_dump( - exclude={"id", "created"}) == (completion_poolings.model_dump( - exclude={"id", "created"})) + assert chat_poolings.model_dump(exclude={"id", "created"}) == ( + completion_poolings.model_dump(exclude={"id", "created"}) + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_batch_base64_pooling(server: RemoteOpenAIServer, - model_name: str): +async def test_batch_base64_pooling(server: RemoteOpenAIServer, model_name: str): input_texts = [ "Hello my name is", - "The best thing about vLLM is that it supports many different models" + "The best thing about vLLM is that it supports many different models", ] float_response = requests.post( @@ -211,9 +201,7 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, ) float_response.raise_for_status() responses_float = PoolingResponse.model_validate(float_response.json()) - float_data = [ - np.array(d.data).squeeze(-1).tolist() for d in responses_float.data - ] + float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data] base64_response = requests.post( 
server.url_for("pooling"), @@ -229,13 +217,15 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, decoded_responses_base64_data = [] for data in responses_base64.data: decoded_responses_base64_data.append( - np.frombuffer(base64.b64decode(data.data), - dtype="float32").tolist()) - - check_embeddings_close(embeddings_0_lst=float_data, - embeddings_1_lst=decoded_responses_base64_data, - name_0="float32", - name_1="base64") + np.frombuffer(base64.b64decode(data.data), dtype="float32").tolist() + ) + + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=decoded_responses_base64_data, + name_0="float32", + name_1="base64", + ) # Default response is float32 decoded from base64 by OpenAI Client default_response = requests.post( @@ -251,10 +241,12 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, np.array(d.data).squeeze(-1).tolist() for d in responses_default.data ] - check_embeddings_close(embeddings_0_lst=float_data, - embeddings_1_lst=default_data, - name_0="float32", - name_1="default") + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=default_data, + name_0="float32", + name_1="default", + ) @pytest.mark.asyncio @@ -269,39 +261,46 @@ async def test_invocations(server: RemoteOpenAIServer): "encoding_format": "float", } - completion_response = requests.post(server.url_for("pooling"), - json=request_args) + completion_response = requests.post(server.url_for("pooling"), json=request_args) completion_response.raise_for_status() - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) invocation_response.raise_for_status() completion_output = completion_response.json() invocation_output = invocation_response.json() assert completion_output.keys() == invocation_output.keys() - for completion_data, invocation_data in zip(completion_output["data"], - invocation_output["data"]): + for completion_data, invocation_data in zip( + completion_output["data"], invocation_output["data"] + ): assert completion_data.keys() == invocation_data.keys() - check_embeddings_close(embeddings_0_lst=completion_data["data"], - embeddings_1_lst=invocation_data["data"], - name_0="completion", - name_1="invocation") + check_embeddings_close( + embeddings_0_lst=completion_data["data"], + embeddings_1_lst=invocation_data["data"], + name_0="completion", + name_1="invocation", + ) @pytest.mark.asyncio async def test_invocations_conversation(server: RemoteOpenAIServer): - messages = [{ - "role": "user", - "content": "The cat sat on the mat.", - }, { - "role": "assistant", - "content": "A feline was resting on a rug.", - }, { - "role": "user", - "content": "Stars twinkle brightly in the night sky.", - }] + messages = [ + { + "role": "user", + "content": "The cat sat on the mat.", + }, + { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, + { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }, + ] request_args = { "model": MODEL_NAME, @@ -312,18 +311,22 @@ async def test_invocations_conversation(server: RemoteOpenAIServer): chat_response = requests.post(server.url_for("pooling"), json=request_args) chat_response.raise_for_status() - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) invocation_response.raise_for_status() chat_output = 
chat_response.json() invocation_output = invocation_response.json() assert chat_output.keys() == invocation_output.keys() - for chat_data, invocation_data in zip(chat_output["data"], - invocation_output["data"]): + for chat_data, invocation_data in zip( + chat_output["data"], invocation_output["data"] + ): assert chat_data.keys() == invocation_data.keys() - check_embeddings_close(embeddings_0_lst=chat_data["data"], - embeddings_1_lst=invocation_data["data"], - name_0="chat", - name_1="invocation") + check_embeddings_close( + embeddings_0_lst=chat_data["data"], + embeddings_1_lst=invocation_data["data"], + name_0="chat", + name_1="invocation", + ) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index ff0730c77032..dcea9571cf7d 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -16,12 +16,12 @@ async def test_empty_prompt(): with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() - with pytest.raises(openai.BadRequestError, - match="decoder prompt cannot be empty"): - await client.completions.create(model=model_name, - prompt="", - max_tokens=5, - temperature=0.0) + with pytest.raises( + openai.BadRequestError, match="decoder prompt cannot be empty" + ): + await client.completions.create( + model=model_name, prompt="", max_tokens=5, temperature=0.0 + ) @pytest.mark.asyncio @@ -31,12 +31,12 @@ async def test_out_of_vocab_token_ids(): with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() - with pytest.raises(openai.BadRequestError, - match=re.compile('.*out of vocabulary.*').pattern): - await client.completions.create(model=model_name, - prompt=[999999], - max_tokens=5, - temperature=0.0) + with pytest.raises( + openai.BadRequestError, match=re.compile(".*out of vocabulary.*").pattern + ): + await client.completions.create( + model=model_name, prompt=[999999], max_tokens=5, temperature=0.0 + ) @pytest.mark.asyncio @@ -47,14 +47,13 @@ async def test_reject_multistep_with_guided_decoding(): client = remote_server.get_async_client() with pytest.raises( - openai.BadRequestError, - match=re.compile( - '.*Guided decoding .* multi-step decoding.*').pattern): + openai.BadRequestError, + match=re.compile(".*Guided decoding .* multi-step decoding.*").pattern, + ): await client.completions.create( model=model_name, prompt="Hello", max_tokens=5, temperature=0.0, - extra_body={"response_format": { - "type": "json_object" - }}) + extra_body={"response_format": {"type": "json_object"}}, + ) diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index 4da97fe13691..b7d03bbc2634 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -32,15 +32,18 @@ def server(): def test_rerank_texts(server: RemoteOpenAIServer, model_name: str): query = "What is the capital of France?" documents = [ - "The capital of Brazil is Brasilia.", "The capital of France is Paris." 
+ "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", ] - rerank_response = requests.post(server.url_for("rerank"), - json={ - "model": model_name, - "query": query, - "documents": documents, - }) + rerank_response = requests.post( + server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + }, + ) rerank_response.raise_for_status() rerank = RerankResponse.model_validate(rerank_response.json()) @@ -56,16 +59,14 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str): query = "What is the capital of France?" documents = [ "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", "Cross-encoder models are neat" + "The capital of France is Paris.", + "Cross-encoder models are neat", ] - rerank_response = requests.post(server.url_for("rerank"), - json={ - "model": model_name, - "query": query, - "documents": documents, - "top_n": 2 - }) + rerank_response = requests.post( + server.url_for("rerank"), + json={"model": model_name, "query": query, "documents": documents, "top_n": 2}, + ) rerank_response.raise_for_status() rerank = RerankResponse.model_validate(rerank_response.json()) @@ -78,28 +79,26 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str): @pytest.mark.parametrize("model_name", [MODEL_NAME]) def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str): - query = "What is the capital of France?" * 100 documents = [ - "The capital of Brazil is Brasilia.", "The capital of France is Paris." + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", ] - rerank_response = requests.post(server.url_for("rerank"), - json={ - "model": model_name, - "query": query, - "documents": documents - }) + rerank_response = requests.post( + server.url_for("rerank"), + json={"model": model_name, "query": query, "documents": documents}, + ) assert rerank_response.status_code == 400 # Assert just a small fragments of the response - assert "Please reduce the length of the input." in \ - rerank_response.text + assert "Please reduce the length of the input." in rerank_response.text def test_invocations(server: RemoteOpenAIServer): query = "What is the capital of France?" documents = [ - "The capital of Brazil is Brasilia.", "The capital of France is Paris." 
+ "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", ] request_args = { @@ -108,20 +107,22 @@ def test_invocations(server: RemoteOpenAIServer): "documents": documents, } - rerank_response = requests.post(server.url_for("rerank"), - json=request_args) + rerank_response = requests.post(server.url_for("rerank"), json=request_args) rerank_response.raise_for_status() - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) invocation_response.raise_for_status() rerank_output = rerank_response.json() invocation_output = invocation_response.json() assert rerank_output.keys() == invocation_output.keys() - for rerank_result, invocations_result in zip(rerank_output["results"], - invocation_output["results"]): + for rerank_result, invocations_result in zip( + rerank_output["results"], invocation_output["results"] + ): assert rerank_result.keys() == invocations_result.keys() assert rerank_result["relevance_score"] == pytest.approx( - invocations_result["relevance_score"], rel=0.01) + invocations_result["relevance_score"], rel=0.01 + ) diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 099062e55c72..8f5a3104e6e0 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -10,11 +10,13 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -from .test_completion import default_server_args # noqa: F401 -from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 -from .test_completion import zephyr_lora_files # noqa: F401 -from .test_completion import zephyr_pa_files # noqa: F401 -from .test_completion import MODEL_NAME +from .test_completion import ( + MODEL_NAME, + default_server_args, # noqa: F401 + zephyr_lora_added_tokens_files, # noqa: F401 + zephyr_lora_files, # noqa: F401 + zephyr_pa_files, # noqa: F401 +) @pytest.fixture(scope="module") @@ -25,22 +27,19 @@ def server_fixture(request, default_server_args): # noqa: F811 with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server: yield (remote_server, True) else: - with RemoteOpenAIServer(MODEL_NAME, - default_server_args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: yield (remote_server, False) @pytest.mark.asyncio @pytest.mark.parametrize("server_fixture", [True, False], indirect=True) -async def test_completion_return_tokens_as_token_ids_completion( - server_fixture): +async def test_completion_return_tokens_as_token_ids_completion(server_fixture): server, use_server_flag = server_fixture request_args = {} if not use_server_flag: request_args["return_tokens_as_token_ids"] = True async with server.get_async_client() as client: - completion = await client.completions.create( model=MODEL_NAME, # Include Unicode characters to test for dividing a single @@ -51,7 +50,8 @@ async def test_completion_return_tokens_as_token_ids_completion( temperature=0, max_tokens=10, logprobs=1, - extra_body=request_args) + extra_body=request_args, + ) text = completion.choices[0].text token_strs = completion.choices[0].logprobs.tokens @@ -85,22 +85,22 @@ async def test_chat_return_tokens_as_token_ids_completion(server_fixture): # Include Unicode characters to test for dividing a single # character across multiple tokens: 🎉 is [28705, 31862] for the # Zephyr 
tokenizer - messages=[{ - "role": "system", - "content": "You like to respond in only emojis, like 🎉" - }, { - "role": "user", - "content": "Please write some emojis: 🐱🐶🎉" - }], + messages=[ + { + "role": "system", + "content": "You like to respond in only emojis, like 🎉", + }, + {"role": "user", "content": "Please write some emojis: 🐱🐶🎉"}, + ], temperature=0, max_tokens=8, logprobs=True, - extra_body=request_args) + extra_body=request_args, + ) text = response.choices[0].message.content tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) token_ids = [] for logprob_content in response.choices[0].logprobs.content: - token_ids.append( - int(logprob_content.token.removeprefix("token_id:"))) + token_ids.append(int(logprob_content.token.removeprefix("token_id:"))) assert tokenizer.decode(token_ids, skip_special_tokens=True) == text diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py index 7b4966848b9d..6bcb80878f07 100644 --- a/tests/entrypoints/openai/test_root_path.py +++ b/tests/entrypoints/openai/test_root_path.py @@ -51,26 +51,31 @@ class TestCase(NamedTuple): model_name=MODEL_NAME, base_url=["v1"], # http://localhost:8000/v1 api_key=ERROR_API_KEY, - expected_error=openai.AuthenticationError), + expected_error=openai.AuthenticationError, + ), TestCase( model_name=MODEL_NAME, base_url=[ROOT_PATH, "v1"], # http://localhost:8000/llm/v1 api_key=ERROR_API_KEY, - expected_error=openai.AuthenticationError), + expected_error=openai.AuthenticationError, + ), TestCase( model_name=MODEL_NAME, base_url=["v1"], # http://localhost:8000/v1 api_key=API_KEY, - expected_error=None), + expected_error=None, + ), TestCase( model_name=MODEL_NAME, base_url=[ROOT_PATH, "v1"], # http://localhost:8000/llm/v1 api_key=API_KEY, - expected_error=None), + expected_error=None, + ), ], ) -async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer, - test_case: TestCase): +async def test_chat_session_root_path_with_api_key( + server: RemoteOpenAIServer, test_case: TestCase +): saying: str = "Here is a common saying about apple. 
An apple a day, keeps" ctx = contextlib.nullcontext() if test_case.expected_error is not None: @@ -79,20 +84,16 @@ async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer, client = openai.AsyncOpenAI( api_key=test_case.api_key, base_url=server.url_for(*test_case.base_url), - max_retries=0) + max_retries=0, + ) chat_completion = await client.chat.completions.create( model=test_case.model_name, - messages=[{ - "role": "user", - "content": "tell me a common saying" - }, { - "role": "assistant", - "content": saying - }], - extra_body={ - "continue_final_message": True, - "add_generation_prompt": False - }) + messages=[ + {"role": "user", "content": "tell me a common saying"}, + {"role": "assistant", "content": saying}, + ], + extra_body={"continue_final_message": True, "add_generation_prompt": False}, + ) assert chat_completion.id is not None assert len(chat_completion.choices) == 1 diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index e23f41e983b0..d31dadf90679 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -35,15 +35,24 @@ def test_empty_file(): - with tempfile.NamedTemporaryFile( - "w") as input_file, tempfile.NamedTemporaryFile( - "r") as output_file: + with ( + tempfile.NamedTemporaryFile("w") as input_file, + tempfile.NamedTemporaryFile("r") as output_file, + ): input_file.write("") input_file.flush() - proc = subprocess.Popen([ - "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, - "--model", "intfloat/multilingual-e5-small" - ], ) + proc = subprocess.Popen( + [ + "vllm", + "run-batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "intfloat/multilingual-e5-small", + ], + ) proc.communicate() proc.wait() assert proc.returncode == 0, f"{proc=}" @@ -53,15 +62,24 @@ def test_empty_file(): def test_completions(): - with tempfile.NamedTemporaryFile( - "w") as input_file, tempfile.NamedTemporaryFile( - "r") as output_file: + with ( + tempfile.NamedTemporaryFile("w") as input_file, + tempfile.NamedTemporaryFile("r") as output_file, + ): input_file.write(INPUT_BATCH) input_file.flush() - proc = subprocess.Popen([ - "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, - "--model", "NousResearch/Meta-Llama-3-8B-Instruct" - ], ) + proc = subprocess.Popen( + [ + "vllm", + "run-batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "NousResearch/Meta-Llama-3-8B-Instruct", + ], + ) proc.communicate() proc.wait() assert proc.returncode == 0, f"{proc=}" @@ -77,30 +95,48 @@ def test_completions_invalid_input(): """ Ensure that we fail when the input doesn't conform to the openai api. 
""" - with tempfile.NamedTemporaryFile( - "w") as input_file, tempfile.NamedTemporaryFile( - "r") as output_file: + with ( + tempfile.NamedTemporaryFile("w") as input_file, + tempfile.NamedTemporaryFile("r") as output_file, + ): input_file.write(INVALID_INPUT_BATCH) input_file.flush() - proc = subprocess.Popen([ - "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, - "--model", "NousResearch/Meta-Llama-3-8B-Instruct" - ], ) + proc = subprocess.Popen( + [ + "vllm", + "run-batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "NousResearch/Meta-Llama-3-8B-Instruct", + ], + ) proc.communicate() proc.wait() assert proc.returncode != 0, f"{proc=}" def test_embeddings(): - with tempfile.NamedTemporaryFile( - "w") as input_file, tempfile.NamedTemporaryFile( - "r") as output_file: + with ( + tempfile.NamedTemporaryFile("w") as input_file, + tempfile.NamedTemporaryFile("r") as output_file, + ): input_file.write(INPUT_EMBEDDING_BATCH) input_file.flush() - proc = subprocess.Popen([ - "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, - "--model", "intfloat/multilingual-e5-small" - ], ) + proc = subprocess.Popen( + [ + "vllm", + "run-batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "intfloat/multilingual-e5-small", + ], + ) proc.communicate() proc.wait() assert proc.returncode == 0, f"{proc=}" @@ -112,24 +148,26 @@ def test_embeddings(): BatchRequestOutput.model_validate_json(line) -@pytest.mark.parametrize("input_batch", - [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH]) +@pytest.mark.parametrize("input_batch", [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH]) def test_score(input_batch): - with tempfile.NamedTemporaryFile( - "w") as input_file, tempfile.NamedTemporaryFile( - "r") as output_file: + with ( + tempfile.NamedTemporaryFile("w") as input_file, + tempfile.NamedTemporaryFile("r") as output_file, + ): input_file.write(input_batch) input_file.flush() - proc = subprocess.Popen([ - "vllm", - "run-batch", - "-i", - input_file.name, - "-o", - output_file.name, - "--model", - "BAAI/bge-reranker-v2-m3", - ], ) + proc = subprocess.Popen( + [ + "vllm", + "run-batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "BAAI/bge-reranker-v2-m3", + ], + ) proc.communicate() proc.wait() assert proc.returncode == 0, f"{proc=}" diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 187542b7bafc..a5381ec1f87b 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -21,14 +21,8 @@ def v1(run_with_both_engines): MODELS = [ - { - "name": "BAAI/bge-reranker-v2-m3", - "is_cross_encoder": True - }, - { - "name": "BAAI/bge-base-en-v1.5", - "is_cross_encoder": False - }, + {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True}, + {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False}, ] DTYPE = "half" @@ -37,9 +31,7 @@ def run_transformers(hf_model, model, text_pairs): if model["is_cross_encoder"]: return hf_model.predict(text_pairs).tolist() else: - hf_embeddings = [ - hf_model.encode(text_pair) for text_pair in text_pairs - ] + hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs] return [ F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0) for pair in hf_embeddings @@ -63,8 +55,9 @@ def server(model: dict[str, Any]): def runner(model: dict[str, Any], hf_runner): kwargs = { "dtype": DTYPE, - "is_cross_encoder" if model["is_cross_encoder"]\ - else "is_sentence_transformer": True + "is_cross_encoder" + 
if model["is_cross_encoder"] + else "is_sentence_transformer": True, } with hf_runner(model["name"], **kwargs) as hf_model: @@ -72,21 +65,23 @@ def runner(model: dict[str, Any], hf_runner): class TestModel: - - def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer, - model: dict[str, Any], runner): + def test_text_1_str_text_2_list( + self, server: RemoteOpenAIServer, model: dict[str, Any], runner + ): text_1 = "What is the capital of France?" text_2 = [ "The capital of Brazil is Brasilia.", - "The capital of France is Paris." + "The capital of France is Paris.", ] - score_response = requests.post(server.url_for("score"), - json={ - "model": model["name"], - "text_1": text_1, - "text_2": text_2, - }) + score_response = requests.post( + server.url_for("score"), + json={ + "model": model["name"], + "text_1": text_1, + "text_2": text_2, + }, + ) score_response.raise_for_status() score = ScoreResponse.model_validate(score_response.json()) @@ -102,23 +97,26 @@ def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer, for i in range(len(vllm_outputs)): assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer, - model: dict[str, Any], runner): + def test_text_1_list_text_2_list( + self, server: RemoteOpenAIServer, model: dict[str, Any], runner + ): text_1 = [ "What is the capital of the United States?", - "What is the capital of France?" + "What is the capital of France?", ] text_2 = [ "The capital of Brazil is Brasilia.", - "The capital of France is Paris." + "The capital of France is Paris.", ] - score_response = requests.post(server.url_for("score"), - json={ - "model": model["name"], - "text_1": text_1, - "text_2": text_2, - }) + score_response = requests.post( + server.url_for("score"), + json={ + "model": model["name"], + "text_1": text_1, + "text_2": text_2, + }, + ) score_response.raise_for_status() score = ScoreResponse.model_validate(score_response.json()) @@ -134,17 +132,20 @@ def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer, for i in range(len(vllm_outputs)): assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer, - model: dict[str, Any], runner): + def test_text_1_str_text_2_str( + self, server: RemoteOpenAIServer, model: dict[str, Any], runner + ): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." - score_response = requests.post(server.url_for("score"), - json={ - "model": model["name"], - "text_1": text_1, - "text_2": text_2, - }) + score_response = requests.post( + server.url_for("score"), + json={ + "model": model["name"], + "text_1": text_1, + "text_2": text_2, + }, + ) score_response.raise_for_status() score = ScoreResponse.model_validate(score_response.json()) @@ -160,40 +161,41 @@ def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer, for i in range(len(vllm_outputs)): assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - def test_score_max_model_len(self, server: RemoteOpenAIServer, - model: dict[str, Any]): - + def test_score_max_model_len( + self, server: RemoteOpenAIServer, model: dict[str, Any] + ): text_1 = "What is the capital of France?" * 20 text_2 = [ "The capital of Brazil is Brasilia.", - "The capital of France is Paris." 
+ "The capital of France is Paris.", ] - score_response = requests.post(server.url_for("score"), - json={ - "model": model["name"], - "text_1": text_1, - "text_2": text_2, - }) + score_response = requests.post( + server.url_for("score"), + json={ + "model": model["name"], + "text_1": text_1, + "text_2": text_2, + }, + ) assert score_response.status_code == 400 # Assert just a small fragments of the response - assert "Please reduce the length of the input." in \ - score_response.text + assert "Please reduce the length of the input." in score_response.text # Test truncation - score_response = requests.post(server.url_for("score"), - json={ - "model": model["name"], - "text_1": text_1, - "text_2": text_2, - "truncate_prompt_tokens": 101 - }) + score_response = requests.post( + server.url_for("score"), + json={ + "model": model["name"], + "text_1": text_1, + "text_2": text_2, + "truncate_prompt_tokens": 101, + }, + ) assert score_response.status_code == 400 - assert "Please, select a smaller truncation size." in \ - score_response.text + assert "Please, select a smaller truncation size." in score_response.text - def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, - Any]): + def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." @@ -203,20 +205,22 @@ def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, "text_2": text_2, } - score_response = requests.post(server.url_for("score"), - json=request_args) + score_response = requests.post(server.url_for("score"), json=request_args) score_response.raise_for_status() - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) invocation_response.raise_for_status() score_output = score_response.json() invocation_output = invocation_response.json() assert score_output.keys() == invocation_output.keys() - for score_data, invocation_data in zip(score_output["data"], - invocation_output["data"]): + for score_data, invocation_data in zip( + score_output["data"], invocation_output["data"] + ): assert score_data.keys() == invocation_data.keys() assert score_data["score"] == pytest.approx( - invocation_data["score"], rel=0.01) + invocation_data["score"], rel=0.01 + ) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 8a7892cf6d6a..ade16ad35781 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -13,8 +13,7 @@ from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_models import (BaseModelPath, - OpenAIServingModels) +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.transformers_utils.tokenizer import get_tokenizer MODEL_NAME = "openai-community/gpt2" @@ -50,7 +49,6 @@ def get_diff_sampling_param(self): @dataclass class MockEngine: - async def get_model_config(self): return MockModelConfig() @@ -60,13 +58,15 @@ async def _async_serving_chat_init(): model_config = await engine.get_model_config() models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS) - serving_completion = OpenAIServingChat(engine, - model_config, - models, - 
response_role="assistant", - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", - request_logger=None) + serving_completion = OpenAIServingChat( + engine, + model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None, + ) return serving_completion @@ -81,23 +81,24 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False - models = OpenAIServingModels(engine_client=mock_engine, - base_model_paths=BASE_MODEL_PATHS, - model_config=MockModelConfig()) - serving_chat = OpenAIServingChat(mock_engine, - MockModelConfig(), - models, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", - request_logger=None) + models = OpenAIServingModels( + engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=MockModelConfig(), + ) + serving_chat = OpenAIServingChat( + mock_engine, + MockModelConfig(), + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None, + ) req = ChatCompletionRequest( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": "what is 1+1?" - }], + messages=[{"role": "user", "content": "what is 1+1?"}], guided_decoding_backend="outlines", ) @@ -125,24 +126,25 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.errored = False # Initialize the serving chat - models = OpenAIServingModels(engine_client=mock_engine, - base_model_paths=BASE_MODEL_PATHS, - model_config=mock_model_config) - serving_chat = OpenAIServingChat(mock_engine, - mock_model_config, - models, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", - request_logger=None) + models = OpenAIServingModels( + engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config, + ) + serving_chat = OpenAIServingChat( + mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None, + ) # Test Case 1: No max_tokens specified in request req = ChatCompletionRequest( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": "what is 1+1?" - }], + messages=[{"role": "user", "content": "what is 1+1?"}], guided_decoding_backend="outlines", ) @@ -180,24 +182,25 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.errored = False # Initialize the serving chat - models = OpenAIServingModels(engine_client=mock_engine, - base_model_paths=BASE_MODEL_PATHS, - model_config=mock_model_config) - serving_chat = OpenAIServingChat(mock_engine, - mock_model_config, - models, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", - request_logger=None) + models = OpenAIServingModels( + engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config, + ) + serving_chat = OpenAIServingChat( + mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None, + ) # Test case 1: No max_tokens specified, defaults to context_window req = ChatCompletionRequest( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": "what is 1+1?" 
- }], + messages=[{"role": "user", "content": "what is 1+1?"}], guided_decoding_backend="outlines", ) @@ -225,11 +228,10 @@ async def test_serving_chat_should_set_correct_max_tokens(): @pytest.mark.asyncio async def test_serving_chat_could_load_correct_generation_config(): - mock_model_config = MockModelConfig() mock_model_config.diff_sampling_param = { "temperature": 0.5, - "repetition_penalty": 1.05 + "repetition_penalty": 1.05, } mock_engine = MagicMock(spec=MQLLMEngineClient) @@ -237,23 +239,24 @@ async def test_serving_chat_could_load_correct_generation_config(): mock_engine.errored = False # Initialize the serving chat - models = OpenAIServingModels(engine_client=mock_engine, - base_model_paths=BASE_MODEL_PATHS, - model_config=mock_model_config) - serving_chat = OpenAIServingChat(mock_engine, - mock_model_config, - models, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", - request_logger=None) + models = OpenAIServingModels( + engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config, + ) + serving_chat = OpenAIServingChat( + mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None, + ) req = ChatCompletionRequest( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": "what is 1+1?" - }], + messages=[{"role": "user", "content": "what is 1+1?"}], guided_decoding_backend="outlines", ) @@ -291,24 +294,25 @@ async def test_serving_chat_did_set_correct_cache_salt(): mock_engine.errored = False # Initialize the serving chat - models = OpenAIServingModels(engine_client=mock_engine, - base_model_paths=BASE_MODEL_PATHS, - model_config=mock_model_config) - serving_chat = OpenAIServingChat(mock_engine, - mock_model_config, - models, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - chat_template_content_format="auto", - request_logger=None) + models = OpenAIServingModels( + engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config, + ) + serving_chat = OpenAIServingChat( + mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None, + ) # Test cache_salt req = ChatCompletionRequest( model=MODEL_NAME, - messages=[{ - "role": "user", - "content": "what is 1+1?" 
- }], + messages=[{"role": "user", "content": "what is 1+1?"}], ) # By default cache_salt in the engine prompt is not set diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index 5f334c754a3f..3d21919489f3 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -8,19 +8,20 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.openai.protocol import (ErrorResponse, - LoadLoRAAdapterRequest, - UnloadLoRAAdapterRequest) -from vllm.entrypoints.openai.serving_models import (BaseModelPath, - OpenAIServingModels) +from vllm.entrypoints.openai.protocol import ( + ErrorResponse, + LoadLoRAAdapterRequest, + UnloadLoRAAdapterRequest, +) +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.lora.request import LoRARequest MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] -LORA_LOADING_SUCCESS_MESSAGE = ( - "Success: LoRA adapter '{lora_name}' added successfully.") +LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully." LORA_UNLOADING_SUCCESS_MESSAGE = ( - "Success: LoRA adapter '{lora_name}' removed successfully.") + "Success: LoRA adapter '{lora_name}' removed successfully." +) async def _async_serving_models_init() -> OpenAIServingModels: @@ -29,11 +30,13 @@ async def _async_serving_models_init() -> OpenAIServingModels: # Set the max_model_len attribute to avoid missing attribute mock_model_config.max_model_len = 2048 - serving_models = OpenAIServingModels(engine_client=mock_engine_client, - base_model_paths=BASE_MODEL_PATHS, - model_config=mock_model_config, - lora_modules=None, - prompt_adapters=None) + serving_models = OpenAIServingModels( + engine_client=mock_engine_client, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config, + lora_modules=None, + prompt_adapters=None, + ) await serving_models.init_static_loras() return serving_models @@ -43,19 +46,18 @@ async def _async_serving_models_init() -> OpenAIServingModels: async def test_serving_model_name(): serving_models = await _async_serving_models_init() assert serving_models.model_name(None) == MODEL_NAME - request = LoRARequest(lora_name="adapter", - lora_path="/path/to/adapter2", - lora_int_id=1) + request = LoRARequest( + lora_name="adapter", lora_path="/path/to/adapter2", lora_int_id=1 + ) assert serving_models.model_name(request) == request.lora_name @pytest.mark.asyncio async def test_load_lora_adapter_success(): serving_models = await _async_serving_models_init() - request = LoadLoRAAdapterRequest(lora_name="adapter", - lora_path="/path/to/adapter2") + request = LoadLoRAAdapterRequest(lora_name="adapter", lora_path="/path/to/adapter2") response = await serving_models.load_lora_adapter(request) - assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter') + assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter") assert len(serving_models.lora_requests) == 1 assert "adapter" in serving_models.lora_requests assert serving_models.lora_requests["adapter"].lora_name == "adapter" @@ -74,15 +76,16 @@ async def test_load_lora_adapter_missing_fields(): @pytest.mark.asyncio async def test_load_lora_adapter_duplicate(): serving_models = await _async_serving_models_init() - request = LoadLoRAAdapterRequest(lora_name="adapter1", - lora_path="/path/to/adapter1") + request = 
LoadLoRAAdapterRequest( + lora_name="adapter1", lora_path="/path/to/adapter1" + ) response = await serving_models.load_lora_adapter(request) - assert response == LORA_LOADING_SUCCESS_MESSAGE.format( - lora_name='adapter1') + assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter1") assert len(serving_models.lora_requests) == 1 - request = LoadLoRAAdapterRequest(lora_name="adapter1", - lora_path="/path/to/adapter1") + request = LoadLoRAAdapterRequest( + lora_name="adapter1", lora_path="/path/to/adapter1" + ) response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" @@ -93,15 +96,15 @@ async def test_load_lora_adapter_duplicate(): @pytest.mark.asyncio async def test_unload_lora_adapter_success(): serving_models = await _async_serving_models_init() - request = LoadLoRAAdapterRequest(lora_name="adapter1", - lora_path="/path/to/adapter1") + request = LoadLoRAAdapterRequest( + lora_name="adapter1", lora_path="/path/to/adapter1" + ) response = await serving_models.load_lora_adapter(request) assert len(serving_models.lora_requests) == 1 request = UnloadLoRAAdapterRequest(lora_name="adapter1") response = await serving_models.unload_lora_adapter(request) - assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format( - lora_name='adapter1') + assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(lora_name="adapter1") assert len(serving_models.lora_requests) == 0 diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 29a94c852bba..ff46df81d0ff 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -24,16 +24,13 @@ async def test_shutdown_on_engine_failure(): with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: async with remote_server.get_async_client() as client: - - with pytest.raises( - (openai.APIConnectionError, openai.InternalServerError)): + with pytest.raises((openai.APIConnectionError, openai.InternalServerError)): # Asking for lots of prompt logprobs will currently crash the # engine. 
This may change in the future when that bug is fixed prompt = "Hello " * 4000 await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - extra_body={"prompt_logprobs": 10}) + model=MODEL_NAME, prompt=prompt, extra_body={"prompt_logprobs": 10} + ) # Now the server should shut down return_code = remote_server.proc.wait(timeout=8) diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py index 0dd6af17ef22..e07436f89d2d 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/openai/test_sleep.py @@ -20,14 +20,12 @@ def test_sleep_mode(): "--enable-sleep-mode", ] - with RemoteOpenAIServer(MODEL_NAME, - args, - env_dict={ - "VLLM_SERVER_DEV_MODE": "1", - "CUDA_VISIBLE_DEVICES": "0" - }) as remote_server: - response = requests.post(remote_server.url_for("sleep"), - params={"level": "1"}) + with RemoteOpenAIServer( + MODEL_NAME, + args, + env_dict={"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"}, + ) as remote_server: + response = requests.post(remote_server.url_for("sleep"), params={"level": "1"}) assert response.status_code == 200 response = requests.get(remote_server.url_for("is_sleeping")) assert response.status_code == 200 @@ -40,12 +38,12 @@ def test_sleep_mode(): assert response.json().get("is_sleeping") is False # test wake up with tags - response = requests.post(remote_server.url_for("sleep"), - params={"level": "1"}) + response = requests.post(remote_server.url_for("sleep"), params={"level": "1"}) assert response.status_code == 200 - response = requests.post(remote_server.url_for("wake_up"), - params={"tags": ["weights"]}) + response = requests.post( + remote_server.url_for("wake_up"), params={"tags": ["weights"]} + ) assert response.status_code == 200 # is sleeping should be false after waking up any part of the engine @@ -53,8 +51,9 @@ def test_sleep_mode(): assert response.status_code == 200 assert response.json().get("is_sleeping") is True - response = requests.post(remote_server.url_for("wake_up"), - params={"tags": ["kv_cache"]}) + response = requests.post( + remote_server.url_for("wake_up"), params={"tags": ["kv_cache"]} + ) assert response.status_code == 200 response = requests.get(remote_server.url_for("is_sleeping")) diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py index 4bf379850365..9b24fdfa5c91 100644 --- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py +++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -11,7 +11,10 @@ from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.model_loader.tensorizer import ( - TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model) + TensorizerConfig, + tensorize_lora_adapter, + tensorize_vllm_model, +) from ...utils import RemoteOpenAIServer @@ -29,21 +32,20 @@ def cleanup(): _cleanup() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def tmp_dir(): with tempfile.TemporaryDirectory() as path: yield path -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def model_uri(tmp_dir): yield f"{tmp_dir}/model.tensors" @pytest.fixture(scope="module") def tensorize_model_and_lora(tmp_dir, model_uri): - tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, - lora_dir=tmp_dir) + tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir) args = EngineArgs(model=MODEL_NAME, device="cuda") tensorize_lora_adapter(LORA_PATH, tensorizer_config) @@ -66,8 +68,11 @@ def server(model_uri, 
tensorize_model_and_lora): ## Start OpenAI API server args = [ - "--load-format", "tensorizer", "--served-model-name", MODEL_NAME, - "--enable-lora" + "--load-format", + "tensorizer", + "--served-model-name", + MODEL_NAME, + "--enable-lora", ] model_dir = os.path.dirname(model_uri) @@ -85,10 +90,9 @@ async def client(server): @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): _cleanup() - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) + completion = await client.completions.create( + model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0 + ) assert completion.id is not None assert completion.choices is not None and len(completion.choices) == 1 @@ -97,4 +101,5 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): assert len(completion.choices[0].text) >= 5 assert completion.choices[0].finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) + completion_tokens=5, prompt_tokens=6, total_tokens=11 + ) diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 0dbbdfbfd24a..307dc0cacfe9 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -8,8 +8,10 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 -from .test_completion import zephyr_lora_files # noqa: F401 +from .test_completion import ( + zephyr_lora_added_tokens_files, # noqa: F401 + zephyr_lora_files, # noqa: F401 +) # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -40,10 +42,10 @@ def server(zephyr_lora_added_tokens_files: str): # noqa: F811 @pytest.fixture(scope="module") -def tokenizer_name(model_name: str, - zephyr_lora_added_tokens_files: str): # noqa: F811 - return zephyr_lora_added_tokens_files if ( - model_name == "zephyr-lora2") else model_name +def tokenizer_name(model_name: str, zephyr_lora_added_tokens_files: str): # noqa: F811 + return ( + zephyr_lora_added_tokens_files if (model_name == "zephyr-lora2") else model_name + ) @pytest_asyncio.fixture @@ -63,19 +65,20 @@ async def test_tokenize_completions( model_name: str, tokenizer_name: str, ): - tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") for add_special in [False, True]: prompt = "vllm1 This is a test prompt." 
tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(server.url_for("tokenize"), - json={ - "add_special_tokens": add_special, - "model": model_name, - "prompt": prompt - }) + response = requests.post( + server.url_for("tokenize"), + json={ + "add_special_tokens": add_special, + "model": model_name, + "prompt": prompt, + }, + ) response.raise_for_status() result = response.json() @@ -96,48 +99,39 @@ async def test_tokenize_chat( model_name: str, tokenizer_name: str, ): - tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") for add_generation in [False, True]: for add_special in [False, True]: - conversation = [{ - "role": "user", - "content": "Hi there!" - }, { - "role": "assistant", - "content": "Nice to meet you!" - }, { - "role": "user", - "content": "Can I ask a question? vllm1" - }] + conversation = [ + {"role": "user", "content": "Hi there!"}, + {"role": "assistant", "content": "Nice to meet you!"}, + {"role": "user", "content": "Can I ask a question? vllm1"}, + ] for continue_final in [False, True]: if add_generation and continue_final: continue if continue_final: - conversation.append({ - "role": "assistant", - "content": "Sure," - }) + conversation.append({"role": "assistant", "content": "Sure,"}) prompt = tokenizer.apply_chat_template( add_generation_prompt=add_generation, continue_final_message=continue_final, conversation=conversation, - tokenize=False) - tokens = tokenizer.encode(prompt, - add_special_tokens=add_special) - - response = requests.post(server.url_for("tokenize"), - json={ - "add_generation_prompt": - add_generation, - "continue_final_message": - continue_final, - "add_special_tokens": add_special, - "messages": conversation, - "model": model_name - }) + tokenize=False, + ) + tokens = tokenizer.encode(prompt, add_special_tokens=add_special) + + response = requests.post( + server.url_for("tokenize"), + json={ + "add_generation_prompt": add_generation, + "continue_final_message": continue_final, + "add_special_tokens": add_special, + "messages": conversation, + "model": model_name, + }, + ) response.raise_for_status() result = response.json() @@ -158,41 +152,35 @@ async def test_tokenize_chat_with_tools( model_name: str, tokenizer_name: str, ): - tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") for add_generation in [False, True]: for add_special in [False, True]: - conversation = [{ - "role": - "user", - "content": - "What's the weather like in Paris today?", - }] - - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string" - } + conversation = [ + { + "role": "user", + "content": "What's the weather like in Paris today?", + } + ] + + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string"}}, }, }, - }, - }] + } + ] for continue_final in [False, True]: if add_generation and continue_final: continue if continue_final: - conversation.append({ - "role": "assistant", - "content": "Sure," - }) + conversation.append({"role": "assistant", "content": "Sure,"}) prompt = tokenizer.apply_chat_template( add_generation_prompt=add_generation, @@ -201,8 +189,7 @@ async def test_tokenize_chat_with_tools( 
tools=tools, tokenize=False, ) - tokens = tokenizer.encode(prompt, - add_special_tokens=add_special) + tokens = tokenizer.encode(prompt, add_special_tokens=add_special) response = requests.post( server.url_for("tokenize"), @@ -235,17 +222,12 @@ async def test_tokenize_with_return_token_strs( model_name: str, tokenizer_name: str, ): - tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") prompt = "This is a token_strs test prompt! vllm1" response = requests.post( server.url_for("tokenize"), - json={ - "prompt": prompt, - "model": model_name, - "return_token_strs": True - }, + json={"prompt": prompt, "model": model_name, "return_token_strs": True}, ) response.raise_for_status() @@ -270,17 +252,14 @@ async def test_detokenize( model_name: str, tokenizer_name: str, ): - tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") prompt = "This is a test prompt. vllm1" tokens = tokenizer.encode(prompt, add_special_tokens=False) - response = requests.post(server.url_for("detokenize"), - json={ - "model": model_name, - "tokens": tokens - }) + response = requests.post( + server.url_for("detokenize"), json={"model": model_name, "tokens": tokens} + ) response.raise_for_status() assert response.json() == {"prompt": prompt} @@ -329,14 +308,15 @@ async def test_tokenizer_info_schema(server: RemoteOpenAIServer): } for field, expected_type in field_types.items(): if field in result and result[field] is not None: - assert isinstance( - result[field], - expected_type), (f"{field} should be {expected_type.__name__}") + assert isinstance(result[field], expected_type), ( + f"{field} should be {expected_type.__name__}" + ) @pytest.mark.asyncio async def test_tokenizer_info_added_tokens_structure( - server: RemoteOpenAIServer, ): + server: RemoteOpenAIServer, +): """Test added_tokens_decoder structure if present.""" response = requests.get(server.url_for("tokenizer_info")) response.raise_for_status() @@ -347,25 +327,23 @@ async def test_tokenizer_info_added_tokens_structure( assert isinstance(token_id, str), "Token IDs should be strings" assert isinstance(token_info, dict), "Token info should be a dict" assert "content" in token_info, "Token info should have content" - assert "special" in token_info, ( - "Token info should have special flag") - assert isinstance(token_info["special"], - bool), ("Special flag should be boolean") + assert "special" in token_info, "Token info should have special flag" + assert isinstance(token_info["special"], bool), ( + "Special flag should be boolean" + ) @pytest.mark.asyncio async def test_tokenizer_info_consistency_with_tokenize( - server: RemoteOpenAIServer, ): + server: RemoteOpenAIServer, +): """Test that tokenizer info is consistent with tokenization endpoint.""" info_response = requests.get(server.url_for("tokenizer_info")) info_response.raise_for_status() info = info_response.json() tokenize_response = requests.post( server.url_for("tokenize"), - json={ - "model": MODEL_NAME, - "prompt": "Hello world!" 
- }, + json={"model": MODEL_NAME, "prompt": "Hello world!"}, ) tokenize_response.raise_for_status() tokenize_result = tokenize_response.json() @@ -373,7 +351,8 @@ async def test_tokenizer_info_consistency_with_tokenize( tokenize_max_len = tokenize_result.get("max_model_len") if info_max_len and tokenize_max_len: assert info_max_len >= tokenize_max_len, ( - "Info max length should be >= tokenize max length") + "Info max length should be >= tokenize max length" + ) @pytest.mark.asyncio @@ -384,6 +363,5 @@ async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer): result = response.json() chat_template = result.get("chat_template") if chat_template: - assert isinstance(chat_template, - str), ("Chat template should be a string") - assert chat_template.strip(), "Chat template should not be empty" \ No newline at end of file + assert isinstance(chat_template, str), "Chat template should be a string" + assert chat_template.strip(), "Chat template should not be empty" diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index a8e2eb40b157..b5fc3b0f471c 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -18,29 +18,33 @@ from ...utils import RemoteOpenAIServer MISTRAL_FORMAT_ARGS = [ - "--tokenizer_mode", "mistral", "--config_format", "mistral", - "--load_format", "mistral" + "--tokenizer_mode", + "mistral", + "--config_format", + "mistral", + "--load_format", + "mistral", ] @pytest.fixture def mary_had_lamb(): - path = AudioAsset('mary_had_lamb').get_local_path() + path = AudioAsset("mary_had_lamb").get_local_path() with open(str(path), "rb") as f: yield f @pytest.fixture def winning_call(): - path = AudioAsset('winning_call').get_local_path() + path = AudioAsset("winning_call").get_local_path() with open(str(path), "rb") as f: yield f @pytest.mark.asyncio @pytest.mark.parametrize( - "model_name", - ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"]) + "model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"] +) async def test_basic_audio(mary_had_lamb, model_name): server_args = ["--enforce-eager"] @@ -55,8 +59,9 @@ async def test_basic_audio(mary_had_lamb, model_name): file=mary_had_lamb, language="en", response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] + temperature=0.0, + ) + out = json.loads(transcription)["text"] assert "Mary had a little lamb," in out @@ -69,10 +74,9 @@ async def test_bad_requests(mary_had_lamb): # invalid language with pytest.raises(openai.BadRequestError): - await client.audio.transcriptions.create(model=model_name, - file=mary_had_lamb, - language="hh", - temperature=0.0) + await client.audio.transcriptions.create( + model=model_name, file=mary_had_lamb, language="hh", temperature=0.0 + ) @pytest.mark.asyncio @@ -90,7 +94,7 @@ async def test_long_audio_request(mary_had_lamb, model_name): repeated_audio = np.tile(audio, 10) # Repeated audio to buffer buffer = io.BytesIO() - sf.write(buffer, repeated_audio, sr, format='WAV') + sf.write(buffer, repeated_audio, sr, format="WAV") buffer.seek(0) with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() @@ -99,8 +103,9 @@ async def test_long_audio_request(mary_had_lamb, model_name): file=buffer, language="en", response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] + temperature=0.0, + ) + out = 
json.loads(transcription)["text"] counts = out.count("Mary had a little lamb") assert counts == 10, counts @@ -112,10 +117,9 @@ async def test_non_asr_model(winning_call): server_args = ["--enforce-eager"] with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() - res = await client.audio.transcriptions.create(model=model_name, - file=winning_call, - language="en", - temperature=0.0) + res = await client.audio.transcriptions.create( + model=model_name, file=winning_call, language="en", temperature=0.0 + ) assert res.code == 400 and not res.text assert res.message == "The model does not support Transcriptions API" @@ -129,10 +133,8 @@ async def test_completion_endpoints(): client = remote_server.get_async_client() res = await client.chat.completions.create( model=model_name, - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }]) + messages=[{"role": "system", "content": "You are a helpful assistant."}], + ) assert res.code == 400 assert res.message == "The model does not support Chat Completions API" @@ -153,13 +155,14 @@ async def test_streaming_response(winning_call): file=winning_call, response_format="json", language="en", - temperature=0.0) + temperature=0.0, + ) # Unfortunately this only works when the openai client is patched # to use streaming mode, not exposed in the transcription api. original_post = AsyncAPIClient.post async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True + kwargs["stream"] = True return await original_post(*args, **kwargs) with patch.object(AsyncAPIClient, "post", new=post_with_stream): @@ -170,11 +173,12 @@ async def post_with_stream(*args, **kwargs): language="en", temperature=0.0, extra_body=dict(stream=True), - timeout=30) + timeout=30, + ) # Reconstruct from chunks and validate async for chunk in res: # just a chunk - text = chunk.choices[0]['delta']['content'] + text = chunk.choices[0]["delta"]["content"] transcription += text assert transcription == res_no_stream.text @@ -188,7 +192,7 @@ async def test_stream_options(winning_call): original_post = AsyncAPIClient.post async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True + kwargs["stream"] = True return await original_post(*args, **kwargs) with patch.object(AsyncAPIClient, "post", new=post_with_stream): @@ -198,10 +202,13 @@ async def post_with_stream(*args, **kwargs): file=winning_call, language="en", temperature=0.0, - extra_body=dict(stream=True, - stream_include_usage=True, - stream_continuous_usage_stats=True), - timeout=30) + extra_body=dict( + stream=True, + stream_include_usage=True, + stream_continuous_usage_stats=True, + ), + timeout=30, + ) final = False continuous = True async for chunk in res: @@ -209,7 +216,7 @@ async def post_with_stream(*args, **kwargs): # final usage sent final = True else: - continuous = continuous and hasattr(chunk, 'usage') + continuous = continuous and hasattr(chunk, "usage") assert final and continuous @@ -217,7 +224,7 @@ async def post_with_stream(*args, **kwargs): async def test_sampling_params(mary_had_lamb): """ Compare sampling with params and greedy sampling to assert results - are different when extreme sampling parameters values are picked. + are different when extreme sampling parameters values are picked. 
""" model_name = "openai/whisper-small" server_args = ["--enforce-eager"] @@ -228,20 +235,24 @@ async def test_sampling_params(mary_had_lamb): file=mary_had_lamb, language="en", temperature=0.8, - extra_body=dict(seed=42, - repetition_penalty=1.9, - top_k=12, - top_p=0.4, - min_p=0.5, - frequency_penalty=1.8, - presence_penalty=2.0)) + extra_body=dict( + seed=42, + repetition_penalty=1.9, + top_k=12, + top_p=0.4, + min_p=0.5, + frequency_penalty=1.8, + presence_penalty=2.0, + ), + ) greedy_transcription = await client.audio.transcriptions.create( model=model_name, file=mary_had_lamb, language="en", temperature=0.0, - extra_body=dict(seed=42)) + extra_body=dict(seed=42), + ) assert greedy_transcription.text != transcription.text @@ -252,7 +263,7 @@ async def test_audio_prompt(mary_had_lamb): server_args = ["--enforce-eager"] prompt = "This is a speech, recorded in a phonograph." with RemoteOpenAIServer(model_name, server_args) as remote_server: - #Prompts should not omit the part of original prompt while transcribing. + # Prompts should not omit the part of original prompt while transcribing. prefix = "The first words I spoke in the original phonograph" client = remote_server.get_async_client() transcription = await client.audio.transcriptions.create( @@ -260,8 +271,9 @@ async def test_audio_prompt(mary_had_lamb): file=mary_had_lamb, language="en", response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] + temperature=0.0, + ) + out = json.loads(transcription)["text"] assert prefix in out transcription_wprompt = await client.audio.transcriptions.create( model=model_name, @@ -269,6 +281,7 @@ async def test_audio_prompt(mary_had_lamb): language="en", response_format="text", prompt=prompt, - temperature=0.0) - out_prompt = json.loads(transcription_wprompt)['text'] + temperature=0.0, + ) + out_prompt = json.loads(transcription_wprompt)["text"] assert prefix in out_prompt diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index 79e769e3a1aa..8f15d9d43e92 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import io + # imports for guided decoding tests import json from unittest.mock import patch @@ -20,7 +21,7 @@ @pytest.fixture def foscolo(): # Test translation it->en - path = AudioAsset('azacinto_foscolo').get_local_path() + path = AudioAsset("azacinto_foscolo").get_local_path() with open(str(path), "rb") as f: yield f @@ -38,8 +39,9 @@ async def test_basic_audio(foscolo): response_format="text", # TODO remove once language detection is implemented extra_body=dict(language="it"), - temperature=0.0) - out = json.loads(translation)['text'].strip().lower() + temperature=0.0, + ) + out = json.loads(translation)["text"].strip().lower() assert "greek sea" in out @@ -57,8 +59,9 @@ async def test_audio_prompt(foscolo): prompt=prompt, extra_body=dict(language="it"), response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] + temperature=0.0, + ) + out = json.loads(transcription)["text"] assert "Nor will I ever touch the sacred" not in out assert prompt not in out @@ -70,9 +73,9 @@ async def test_non_asr_model(foscolo): server_args = ["--enforce-eager"] with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() - res = await 
client.audio.translations.create(model=model_name, - file=foscolo, - temperature=0.0) + res = await client.audio.translations.create( + model=model_name, file=foscolo, temperature=0.0 + ) assert res.code == 400 and not res.text assert res.message == "The model does not support Translations API" @@ -89,27 +92,28 @@ async def test_streaming_response(foscolo): file=foscolo, response_format="json", extra_body=dict(language="it"), - temperature=0.0) + temperature=0.0, + ) # Unfortunately this only works when the openai client is patched # to use streaming mode, not exposed in the translation api. original_post = AsyncAPIClient.post async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True + kwargs["stream"] = True return await original_post(*args, **kwargs) with patch.object(AsyncAPIClient, "post", new=post_with_stream): client = remote_server.get_async_client() - res = await client.audio.translations.create(model=model_name, - file=foscolo, - temperature=0.0, - extra_body=dict( - stream=True, - language="it")) + res = await client.audio.translations.create( + model=model_name, + file=foscolo, + temperature=0.0, + extra_body=dict(stream=True, language="it"), + ) # Reconstruct from chunks and validate async for chunk in res: # just a chunk - text = chunk.choices[0]['delta']['content'] + text = chunk.choices[0]["delta"]["content"] translation += text assert translation == res_no_stream.text @@ -123,7 +127,7 @@ async def test_stream_options(foscolo): original_post = AsyncAPIClient.post async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True + kwargs["stream"] = True return await original_post(*args, **kwargs) with patch.object(AsyncAPIClient, "post", new=post_with_stream): @@ -132,10 +136,13 @@ async def post_with_stream(*args, **kwargs): model=model_name, file=foscolo, temperature=0.0, - extra_body=dict(language="it", - stream=True, - stream_include_usage=True, - stream_continuous_usage_stats=True)) + extra_body=dict( + language="it", + stream=True, + stream_include_usage=True, + stream_continuous_usage_stats=True, + ), + ) final = False continuous = True async for chunk in res: @@ -143,7 +150,7 @@ async def post_with_stream(*args, **kwargs): # final usage sent final = True else: - continuous = continuous and hasattr(chunk, 'usage') + continuous = continuous and hasattr(chunk, "usage") assert final and continuous @@ -157,7 +164,7 @@ async def test_long_audio_request(foscolo): repeated_audio = np.tile(audio, 2) # Repeated audio to buffer buffer = io.BytesIO() - sf.write(buffer, repeated_audio, sr, format='WAV') + sf.write(buffer, repeated_audio, sr, format="WAV") buffer.seek(0) with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() @@ -166,6 +173,7 @@ async def test_long_audio_request(foscolo): file=buffer, extra_body=dict(language="it"), response_format="text", - temperature=0.0) - out = json.loads(translation)['text'].strip().lower() + temperature=0.0, + ) + out = json.loads(translation)["text"].strip().lower() assert out.count("greek sea") == 2 diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/openai/test_truncation.py index b33a26af65b3..774315041d07 100644 --- a/tests/entrypoints/openai/test_truncation.py +++ b/tests/entrypoints/openai/test_truncation.py @@ -54,12 +54,10 @@ async def test_smaller_truncation_size(client: openai.AsyncOpenAI): kwargs: dict[str, Any] = { "model": MODEL_NAME, "input": input, - "truncate_prompt_tokens": truncation_size + "truncate_prompt_tokens": 
truncation_size, } - response = await client.post(path="embeddings", - cast_to=object, - body={**kwargs}) + response = await client.post(path="embeddings", cast_to=object, body={**kwargs}) assert response["usage"]["prompt_tokens"] == truncation_size @@ -70,15 +68,15 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI): kwargs: dict[str, Any] = { "model": MODEL_NAME, "input": input, - "truncate_prompt_tokens": truncation_size + "truncate_prompt_tokens": truncation_size, } with pytest.raises(openai.BadRequestError) as err: - err = await client.post(path="embeddings", - cast_to=object, - body={**kwargs}) + err = await client.post(path="embeddings", cast_to=object, body={**kwargs}) - assert str(err) == f"""openai.BadRequestError: + assert ( + str(err) + == f"""openai.BadRequestError: Error code: 400 - {{'object': 'error', 'message': 'truncate_prompt_tokens value ({truncation_size}) @@ -86,6 +84,7 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI): Please, select a smaller truncation size.', 'type': 'BadRequestError', 'param': None, 'code': 400}}""" + ) @pytest.mark.asyncio @@ -94,11 +93,9 @@ async def test_max_truncation_size(client: openai.AsyncOpenAI): kwargs: dict[str, Any] = { "model": MODEL_NAME, "input": input, - "truncate_prompt_tokens": truncation_size + "truncate_prompt_tokens": truncation_size, } - response = await client.post(path="embeddings", - cast_to=object, - body={**kwargs}) + response = await client.post(path="embeddings", cast_to=object, body={**kwargs}) assert response["usage"]["prompt_tokens"] == max_model_len diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index b68e08556ee9..825dbc7d2e48 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -58,24 +58,18 @@ def base64_encoded_video() -> dict[str, str]: @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) -async def test_single_chat_session_video(client: openai.AsyncOpenAI, - model_name: str, video_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": video_url - } - }, - { - "type": "text", - "text": "What's in this video?" 
- }, - ], - }] +async def test_single_chat_session_video( + client: openai.AsyncOpenAI, model_name: str, video_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "video_url", "video_url": {"url": video_url}}, + {"type": "text", "text": "What's in this video?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -84,13 +78,15 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=6287, total_tokens=6297) + completion_tokens=10, prompt_tokens=6287, total_tokens=6297 + ) message = choice.message message = chat_completion.choices[0].message @@ -112,54 +108,44 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) -async def test_error_on_invalid_video_url_type(client: openai.AsyncOpenAI, - model_name: str, - video_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "video_url", - "video_url": video_url - }, - { - "type": "text", - "text": "What's in this video?" - }, - ], - }] +async def test_error_on_invalid_video_url_type( + client: openai.AsyncOpenAI, model_name: str, video_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "video_url", "video_url": video_url}, + {"type": "text", "text": "What's in this video?"}, + ], + } + ] # video_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - temperature=0.0) + _ = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) -async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI, - model_name: str, - video_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": video_url - } - }, - { - "type": "text", - "text": "What's in this video?" 
- }, - ], - }] +async def test_single_chat_session_video_beamsearch( + client: openai.AsyncOpenAI, model_name: str, video_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "video_url", "video_url": {"url": video_url}}, + {"type": "text", "text": "What's in this video?"}, + ], + } + ] chat_completion = await client.chat.completions.create( model=model_name, @@ -168,36 +154,38 @@ async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(use_beam_search=True)) + extra_body=dict(use_beam_search=True), + ) assert len(chat_completion.choices) == 2 - assert chat_completion.choices[ - 0].message.content != chat_completion.choices[1].message.content + assert ( + chat_completion.choices[0].message.content + != chat_completion.choices[1].message.content + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) async def test_single_chat_session_video_base64encoded( - client: openai.AsyncOpenAI, model_name: str, video_url: str, - base64_encoded_video: dict[str, str]): - - messages = [{ - "role": - "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": - f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" - } - }, - { - "type": "text", - "text": "What's in this video?" - }, - ], - }] + client: openai.AsyncOpenAI, + model_name: str, + video_url: str, + base64_encoded_video: dict[str, str], +): + messages = [ + { + "role": "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + }, + }, + {"type": "text", "text": "What's in this video?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -206,13 +194,15 @@ async def test_single_chat_session_video_base64encoded( max_completion_tokens=10, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=6287, total_tokens=6297) + completion_tokens=10, prompt_tokens=6287, total_tokens=6297 + ) message = choice.message message = chat_completion.choices[0].message @@ -236,58 +226,54 @@ async def test_single_chat_session_video_base64encoded( @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) async def test_single_chat_session_video_base64encoded_beamsearch( - client: openai.AsyncOpenAI, model_name: str, video_url: str, - base64_encoded_video: dict[str, str]): - - messages = [{ - "role": - "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": - f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" - } - }, - { - "type": "text", - "text": "What's in this video?" 
- }, - ], - }] + client: openai.AsyncOpenAI, + model_name: str, + video_url: str, + base64_encoded_video: dict[str, str], +): + messages = [ + { + "role": "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + }, + }, + {"type": "text", "text": "What's in this video?"}, + ], + } + ] chat_completion = await client.chat.completions.create( model=model_name, messages=messages, n=2, max_completion_tokens=10, - extra_body=dict(use_beam_search=True)) + extra_body=dict(use_beam_search=True), + ) assert len(chat_completion.choices) == 2 - assert chat_completion.choices[ - 0].message.content != chat_completion.choices[1].message.content + assert ( + chat_completion.choices[0].message.content + != chat_completion.choices[1].message.content + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) -async def test_chat_streaming_video(client: openai.AsyncOpenAI, - model_name: str, video_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": video_url - } - }, - { - "type": "text", - "text": "What's in this video?" - }, - ], - }] +async def test_chat_streaming_video( + client: openai.AsyncOpenAI, model_name: str, video_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "video_url", "video_url": {"url": video_url}}, + {"type": "text", "text": "What's in this video?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -327,27 +313,23 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( - "video_urls", - [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]) -async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str, - video_urls: list[str]): - - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "video_url", - "video_url": { - "url": video_url - } - } for video_url in video_urls), - { - "type": "text", - "text": "What's in this video?" 
- }, - ], - }] + "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))] +) +async def test_multi_video_input( + client: openai.AsyncOpenAI, model_name: str, video_urls: list[str] +): + messages = [ + { + "role": "user", + "content": [ + *( + {"type": "video_url", "video_url": {"url": video_url}} + for video_url in video_urls + ), + {"type": "text", "text": "What's in this video?"}, + ], + } + ] if len(video_urls) > MAXIMUM_VIDEOS: with pytest.raises(openai.BadRequestError): # test multi-video input diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index b6f1d64803e5..e9984a38d068 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -79,19 +79,22 @@ def base64_encoded_image() -> dict[str, str]: def get_hf_prompt_tokens(model_name, content, image_url): - processor = AutoProcessor.from_pretrained(model_name, - trust_remote_code=True, - num_crops=4) + processor = AutoProcessor.from_pretrained( + model_name, trust_remote_code=True, num_crops=4 + ) placeholder = "<|image_1|>\n" - messages = [{ - "role": "user", - "content": f"{placeholder}{content}", - }] + messages = [ + { + "role": "user", + "content": f"{placeholder}{content}", + } + ] images = [Image.open(requests.get(image_url, stream=True).raw)] prompt = processor.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True) + messages, tokenize=False, add_generation_prompt=True + ) inputs = processor(prompt, images, return_tensors="pt") return inputs.input_ids.shape[1] @@ -100,25 +103,19 @@ def get_hf_prompt_tokens(model_name, content, image_url): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_single_chat_session_image(client: openai.AsyncOpenAI, - model_name: str, image_url: str): +async def test_single_chat_session_image( + client: openai.AsyncOpenAI, model_name: str, image_url: str +): content_text = "What's in this image?" 
- messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": content_text - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": content_text}, + ], + } + ] max_completion_tokens = 10 # test single completion @@ -128,17 +125,18 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" - hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, - image_url) + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url) assert chat_completion.usage == openai.types.CompletionUsage( completion_tokens=max_completion_tokens, prompt_tokens=hf_prompt_tokens, - total_tokens=hf_prompt_tokens + max_completion_tokens) + total_tokens=hf_prompt_tokens + max_completion_tokens, + ) message = choice.message message = chat_completion.choices[0].message @@ -160,55 +158,45 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, - model_name: str, - image_url: str): +async def test_error_on_invalid_image_url_type( + client: openai.AsyncOpenAI, model_name: str, image_url: str +): content_text = "What's in this image?" - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": image_url - }, - { - "type": "text", - "text": content_text - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": image_url}, + {"type": "text", "text": content_text}, + ], + } + ] # image_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - temperature=0.0) + _ = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, - model_name: str, - image_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "What's in this image?" 
- }, - ], - }] +async def test_single_chat_session_image_beamsearch( + client: openai.AsyncOpenAI, model_name: str, image_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What's in this image?"}, + ], + } + ] chat_completion = await client.chat.completions.create( model=model_name, @@ -217,37 +205,39 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(use_beam_search=True)) + extra_body=dict(use_beam_search=True), + ) assert len(chat_completion.choices) == 2 - assert chat_completion.choices[ - 0].message.content != chat_completion.choices[1].message.content + assert ( + chat_completion.choices[0].message.content + != chat_completion.choices[1].message.content + ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_single_chat_session_image_base64encoded( - client: openai.AsyncOpenAI, model_name: str, image_url: str, - base64_encoded_image: dict[str, str]): - + client: openai.AsyncOpenAI, + model_name: str, + image_url: str, + base64_encoded_image: dict[str, str], +): content_text = "What's in this image?" - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" - } - }, - { - "type": "text", - "text": content_text - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + }, + }, + {"type": "text", "text": content_text}, + ], + } + ] max_completion_tokens = 10 # test single completion @@ -257,17 +247,18 @@ async def test_single_chat_session_image_base64encoded( max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, - top_logprobs=5) + top_logprobs=5, + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] assert choice.finish_reason == "length" - hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, - image_url) + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url) assert chat_completion.usage == openai.types.CompletionUsage( completion_tokens=max_completion_tokens, prompt_tokens=hf_prompt_tokens, - total_tokens=hf_prompt_tokens + max_completion_tokens) + total_tokens=hf_prompt_tokens + max_completion_tokens, + ) message = choice.message message = chat_completion.choices[0].message @@ -291,36 +282,37 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_URLS)))) async def test_single_chat_session_image_base64encoded_beamsearch( - client: openai.AsyncOpenAI, model_name: str, image_idx: int, - base64_encoded_image: dict[str, str]): + client: openai.AsyncOpenAI, + model_name: str, + image_idx: int, + base64_encoded_image: dict[str, str], +): # NOTE: This test also validates that we pass MM data through beam search image_url = TEST_IMAGE_URLS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" - } - }, - { - "type": "text", - "text": "What's in this image?" 
- }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + }, + }, + {"type": "text", "text": "What's in this image?"}, + ], + } + ] chat_completion = await client.chat.completions.create( model=model_name, messages=messages, n=2, max_completion_tokens=10, temperature=0.0, - extra_body=dict(use_beam_search=True)) + extra_body=dict(use_beam_search=True), + ) assert len(chat_completion.choices) == 2 for actual, expected_str in zip(chat_completion.choices, expected_res): assert actual.message.content == expected_str @@ -329,24 +321,18 @@ async def test_single_chat_session_image_base64encoded_beamsearch( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_chat_streaming_image(client: openai.AsyncOpenAI, - model_name: str, image_url: str): - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "What's in this image?" - }, - ], - }] +async def test_chat_streaming_image( + client: openai.AsyncOpenAI, model_name: str, image_url: str +): + messages = [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What's in this image?"}, + ], + } + ] # test single completion chat_completion = await client.chat.completions.create( @@ -386,27 +372,23 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( - "image_urls", - [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) -async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, - image_urls: list[str]): - - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "image_url", - "image_url": { - "url": image_url - } - } for image_url in image_urls), - { - "type": "text", - "text": "What's in this image?" 
- }, - ], - }] + "image_urls", [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))] +) +async def test_multi_image_input( + client: openai.AsyncOpenAI, model_name: str, image_urls: list[str] +): + messages = [ + { + "role": "user", + "content": [ + *( + {"type": "image_url", "image_url": {"url": image_url}} + for image_url in image_urls + ), + {"type": "text", "text": "What's in this image?"}, + ], + } + ] if len(image_urls) > MAXIMUM_IMAGES: with pytest.raises(openai.BadRequestError): # test multi-image input diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index fe982e286ae4..86c9ae6c6b93 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -58,9 +58,9 @@ def base64_encoded_image() -> dict[str, str]: def get_hf_prompt_tokens(model_name, content, image_url): - processor = AutoProcessor.from_pretrained(model_name, - trust_remote_code=True, - num_crops=4) + processor = AutoProcessor.from_pretrained( + model_name, trust_remote_code=True, num_crops=4 + ) placeholder = "<|image_1|> " prompt = f"{placeholder}{content}" @@ -72,39 +72,28 @@ def get_hf_prompt_tokens(model_name, content, image_url): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) -async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, - image_url: str): +async def test_image_embedding( + server: RemoteOpenAIServer, model_name: str, image_url: str +): content_text = "Represent the given image." - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": content_text - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": content_text}, + ], + } + ] response = requests.post( server.url_for("v1/embeddings"), - json={ - "model": model_name, - "messages": messages, - "encoding_format": "float" - }, + json={"model": model_name, "messages": messages, "encoding_format": "float"}, ) response.raise_for_status() embeddings = EmbeddingResponse.model_validate(response.json()) - hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, - image_url) + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url) assert embeddings.id is not None assert len(embeddings.data) == 1 diff --git a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py index bd8e06513e13..bdd5344652c4 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py @@ -8,15 +8,18 @@ import pytest from tests.entrypoints.openai.tool_parsers.utils import ( - run_tool_extraction, run_tool_extraction_streaming) + run_tool_extraction, + run_tool_extraction_streaming, +) from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager def make_tool_call(name, arguments): - return ToolCall(type="function", - function=FunctionCall(name=name, - arguments=json.dumps(arguments))) + return ToolCall( + type="function", + function=FunctionCall(name=name, arguments=json.dumps(arguments)), + ) # TODO: add reason prefix and suffix. 
@@ -29,70 +32,68 @@ def make_tool_call(name, arguments): ("How can I help you today?", [], "How can I help you today?"), # Single tool call, no content ( - "[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"San Francisco\", \"metric\": \"celsius\"}}]", #noqa: E501 + '[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}]', # noqa: E501 [ - make_tool_call("get_weather", { - "city": "San Francisco", - "metric": "celsius" - }) + make_tool_call( + "get_weather", {"city": "San Francisco", "metric": "celsius"} + ) ], - None), + None, + ), # Multiple tool calls ( - "[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"San Francisco\", \"metric\": \"celsius\"}}, {\"name\": \"register_user\", \"arguments\": {\"name\": \"John Doe\", \"age\": 37, \"address\": {\"city\": \"San Francisco\", \"state\": \"CA\"}, \"role\": null, \"passed_test\": true, \"aliases\": [\"John\", \"Johnny\"]}}]", #noqa: E501 + '[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}, {"name": "register_user", "arguments": {"name": "John Doe", "age": 37, "address": {"city": "San Francisco", "state": "CA"}, "role": null, "passed_test": true, "aliases": ["John", "Johnny"]}}]', # noqa: E501 [ - make_tool_call("get_weather", { - "city": "San Francisco", - "metric": "celsius" - }), make_tool_call( - "register_user", { + "get_weather", {"city": "San Francisco", "metric": "celsius"} + ), + make_tool_call( + "register_user", + { "name": "John Doe", "age": 37, - "address": { - "city": "San Francisco", - "state": "CA" - }, + "address": {"city": "San Francisco", "state": "CA"}, "role": None, "passed_test": True, - "aliases": ["John", "Johnny"] - }) + "aliases": ["John", "Johnny"], + }, + ), ], - None), + None, + ), # Content before tool call ( - "I will call the tool now. [{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Boston\"}}]", #noqa: E501 + 'I will call the tool now. [{"name": "get_weather", "arguments": {"city": "Boston"}}]', # noqa: E501 [make_tool_call("get_weather", {"city": "Boston"})], - "I will call the tool now. "), + "I will call the tool now. ", + ), # Content after tool call (should be stripped) ( - "[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Seattle\"}}]\nThank you!", #noqa: E501 + '[{"name": "get_weather", "arguments": {"city": "Seattle"}}]\nThank you!', # noqa: E501 [make_tool_call("get_weather", {"city": "Seattle"})], - None), + None, + ), ( - "[{\"name\": \"complex_tool\", \"arguments\": {\"level1\": {\"level2\": {\"level3\": {\"value\": 123}}}}}]", + '[{"name": "complex_tool", "arguments": {"level1": {"level2": {"level3": {"value": 123}}}}}]', [ make_tool_call( - "complex_tool", - {"level1": { - "level2": { - "level3": { - "value": 123 - } - } - }}) + "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}} + ) ], None, ), - ]) -def test_hunyuan_a13b_tool_parser_extract(model_output, expected_tool_calls, - expected_content): + ], +) +def test_hunyuan_a13b_tool_parser_extract( + model_output, expected_tool_calls, expected_content +): mock_tokenizer = MagicMock() - tool_parser: ToolParser = ToolParserManager.get_tool_parser( - "hunyuan_a13b")(mock_tokenizer) - content, tool_calls = run_tool_extraction(tool_parser, - model_output, - streaming=False) + tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")( + mock_tokenizer + ) + content, tool_calls = run_tool_extraction( + tool_parser, model_output, streaming=False + ) # align the random id. 
for idx in range(len(tool_calls)): @@ -102,49 +103,74 @@ def test_hunyuan_a13b_tool_parser_extract(model_output, expected_tool_calls, # Streaming test: simulate incremental output -@pytest.mark.parametrize("model_deltas,expected_tool_calls", [ - ([ - "[{\"name\": \"get_weather\", ", - "\"arguments\": {\"city\": \"San Francisco\", ", - "\"metric\": \"celsius\"}}]", "" - ], [ - make_tool_call("get_weather", { - "city": "San Francisco", - "metric": "celsius" - }) - ]), - ([ - "[{\"name\":", " \"get_weather\",", " \"arguments\":", - " {\"city\": \"Boston\"}", "}]", "" - ], [make_tool_call("get_weather", {"city": "Boston"})]), - ([ - "", "[{\"name\":", " \"get_weather\",", " \"arguments\":", - " {\"city\": \"Boston\"}", "}]", "", "\n" - ], [make_tool_call("get_weather", {"city": "Boston"})]), - pytest.param([ - "[{\"name\": \"complex_tool\",", " \"arguments\": ", - " {\"level1\": {\"level2\": ", "{\"level3\": {\"value\": 123}}}}}", - "]" - ], [ - make_tool_call("complex_tool", - {"level1": { - "level2": { - "level3": { - "value": 123 - } - } - }}) +@pytest.mark.parametrize( + "model_deltas,expected_tool_calls", + [ + ( + [ + '[{"name": "get_weather", ', + '"arguments": {"city": "San Francisco", ', + '"metric": "celsius"}}]', + "", + ], + [ + make_tool_call( + "get_weather", {"city": "San Francisco", "metric": "celsius"} + ) + ], + ), + ( + [ + '[{"name":', + ' "get_weather",', + ' "arguments":', + ' {"city": "Boston"}', + "}]", + "", + ], + [make_tool_call("get_weather", {"city": "Boston"})], + ), + ( + [ + "", + '[{"name":', + ' "get_weather",', + ' "arguments":', + ' {"city": "Boston"}', + "}]", + "", + "\n", + ], + [make_tool_call("get_weather", {"city": "Boston"})], + ), + pytest.param( + [ + '[{"name": "complex_tool",', + ' "arguments": ', + ' {"level1": {"level2": ', + '{"level3": {"value": 123}}}}}', + "]", + ], + [ + make_tool_call( + "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}} + ) + ], + marks=pytest.mark.xfail( + reason="stream parsing not support nested json yet." + ), + ), ], - marks=pytest.mark.xfail( - reason="stream parsing not support nested json yet.")), -]) +) def test_hunyuan_a13b_tool_parser_streaming(model_deltas, expected_tool_calls): mock_tokenizer = MagicMock() - tool_parser: ToolParser = ToolParserManager.get_tool_parser( - "hunyuan_a13b")(mock_tokenizer) + tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")( + mock_tokenizer + ) reconstructor = run_tool_extraction_streaming( - tool_parser, model_deltas, assert_one_tool_per_delta=False) + tool_parser, model_deltas, assert_one_tool_per_delta=False + ) # align the random id. 
for idx in range(len(reconstructor.tool_calls)): diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py index 8c86b4889e15..94277980f229 100644 --- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py @@ -6,7 +6,9 @@ import pytest from tests.entrypoints.openai.tool_parsers.utils import ( - run_tool_extraction, run_tool_extraction_streaming) + run_tool_extraction, + run_tool_extraction_streaming, +) from vllm.entrypoints.openai.protocol import FunctionCall from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager @@ -16,12 +18,14 @@ name="get_weather", arguments='{"city": "LA", "metric": "C"}', ) -MORE_TYPES_FUNCTION_OUTPUT = ("[register_user(name='Doe', " - "age=9, " - "address={'city': 'LA', 'state': 'CA'}, " - "role=None, " - "passed_test=True, " - "aliases=['John', 'Johnny'])]") +MORE_TYPES_FUNCTION_OUTPUT = ( + "[register_user(name='Doe', " + "age=9, " + "address={'city': 'LA', 'state': 'CA'}, " + "role=None, " + "passed_test=True, " + "aliases=['John', 'Johnny'])]" +) MORE_TYPES_FUNCTION_CALL = FunctionCall( name="register_user", arguments='{"name": "Doe", ' @@ -34,7 +38,7 @@ PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]" PARAMETERLESS_FUNCTION_CALL = FunctionCall( name="get_weather", - arguments='{}', + arguments="{}", ) EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]" EMPTY_DICT_FUNCTION_CALL = FunctionCall( @@ -47,25 +51,28 @@ arguments='{"steps": []}', ) ESCAPED_STRING_FUNCTION_OUTPUT = ( - r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]") + r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]" +) ESCAPED_STRING_FUNCTION_CALL = FunctionCall( name="get_weather", arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}', ) PYTHON_TAG_FUNCTION_OUTPUT = ( - "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>") + "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>" +) @pytest.mark.parametrize("streaming", [True, False]) def test_no_tool_call(streaming: bool): mock_tokenizer = MagicMock() - tool_parser: ToolParser = ToolParserManager.get_tool_parser( - "llama4_pythonic")(mock_tokenizer) + tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")( + mock_tokenizer + ) model_output = "How can I help you today?" 
- content, tool_calls = run_tool_extraction(tool_parser, - model_output, - streaming=streaming) + content, tool_calls = run_tool_extraction( + tool_parser, model_output, streaming=streaming + ) assert content == model_output assert len(tool_calls) == 0 @@ -75,98 +82,139 @@ def test_no_tool_call(streaming: bool): test_str += "[get_weather(city='LA', metric='C')," test_str += "register_user(name='Doe', age=9)]" TEST_CASES = [ - pytest.param(True, - ESCAPED_STRING_FUNCTION_OUTPUT, - [ESCAPED_STRING_FUNCTION_CALL], - id="simple_streaming"), - pytest.param(False, - SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], - id="simple_nonstreaming"), - pytest.param(True, - MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL], - id="more_types_streaming"), - pytest.param(False, - MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL], - id="more_types_nonstreaming"), - pytest.param(True, - PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL], - id="parameterless_streaming"), - pytest.param(False, - PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL], - id="parameterless_nonstreaming"), - pytest.param(True, - EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL], - id="empty_dict_streaming"), - pytest.param(False, - EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL], - id="empty_dict_nonstreaming"), - pytest.param(True, - EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL], - id="empty_list_streaming"), - pytest.param(False, - EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL], - id="empty_list_nonstreaming"), - pytest.param(True, - ESCAPED_STRING_FUNCTION_OUTPUT, - [ESCAPED_STRING_FUNCTION_CALL], - id="escaped_string_streaming"), - pytest.param(False, - ESCAPED_STRING_FUNCTION_OUTPUT, - [ESCAPED_STRING_FUNCTION_CALL], - id="escaped_string_nonstreaming"), + pytest.param( + True, + ESCAPED_STRING_FUNCTION_OUTPUT, + [ESCAPED_STRING_FUNCTION_CALL], + id="simple_streaming", + ), + pytest.param( + False, SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], id="simple_nonstreaming" + ), + pytest.param( + True, + MORE_TYPES_FUNCTION_OUTPUT, + [MORE_TYPES_FUNCTION_CALL], + id="more_types_streaming", + ), + pytest.param( + False, + MORE_TYPES_FUNCTION_OUTPUT, + [MORE_TYPES_FUNCTION_CALL], + id="more_types_nonstreaming", + ), + pytest.param( + True, + PARAMETERLESS_FUNCTION_OUTPUT, + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_streaming", + ), + pytest.param( + False, + PARAMETERLESS_FUNCTION_OUTPUT, + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_nonstreaming", + ), + pytest.param( + True, + EMPTY_DICT_FUNCTION_OUTPUT, + [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_streaming", + ), + pytest.param( + False, + EMPTY_DICT_FUNCTION_OUTPUT, + [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_nonstreaming", + ), + pytest.param( + True, + EMPTY_LIST_FUNCTION_OUTPUT, + [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_streaming", + ), + pytest.param( + False, + EMPTY_LIST_FUNCTION_OUTPUT, + [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_nonstreaming", + ), + pytest.param( + True, + ESCAPED_STRING_FUNCTION_OUTPUT, + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_streaming", + ), + pytest.param( + False, + ESCAPED_STRING_FUNCTION_OUTPUT, + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_nonstreaming", + ), pytest.param( True, "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]", [ SIMPLE_FUNCTION_CALL, - FunctionCall(name="register_user", - arguments='{"name": "Doe", "age": 9}') + FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'), ], - 
id="parallel_calls_streaming"), + id="parallel_calls_streaming", + ), pytest.param( False, "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]", [ SIMPLE_FUNCTION_CALL, - FunctionCall(name="register_user", - arguments='{"name": "Doe", "age": 9}') + FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'), + ], + id="parallel_calls_nonstreaming", + ), + pytest.param( + True, + PYTHON_TAG_FUNCTION_OUTPUT, + [SIMPLE_FUNCTION_CALL], + id="python_tag_streaming", + ), + pytest.param( + False, + PYTHON_TAG_FUNCTION_OUTPUT, + [SIMPLE_FUNCTION_CALL], + id="python_tag_nonstreaming", + ), + pytest.param( + True, + test_str, + [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'), + ], + id="parallel_calls_streaming", + ), + pytest.param( + False, + "<|python_start|>[get_weather(city='LA', metric='C'), " + + "register_user(name='Doe', age=9)]", + [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'), ], - id="parallel_calls_nonstreaming"), - pytest.param(True, - PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], - id="python_tag_streaming"), - pytest.param(False, - PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], - id="python_tag_nonstreaming"), - pytest.param(True, - test_str, [ - SIMPLE_FUNCTION_CALL, - FunctionCall(name="register_user", - arguments='{"name": "Doe", "age": 9}') - ], - id="parallel_calls_streaming"), - pytest.param(False, - "<|python_start|>[get_weather(city='LA', metric='C'), " + - "register_user(name='Doe', age=9)]", [ - SIMPLE_FUNCTION_CALL, - FunctionCall(name="register_user", - arguments='{"name": "Doe", "age": 9}') - ], - id="parallel_calls_nonstreaming"), + id="parallel_calls_nonstreaming", + ), ] -@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", - TEST_CASES) -def test_tool_call(streaming: bool, model_output: str, - expected_tool_calls: list[FunctionCall]): +@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES) +def test_tool_call( + streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall] +): mock_tokenizer = MagicMock() - tool_parser: ToolParser = ToolParserManager.get_tool_parser( - "llama4_pythonic")(mock_tokenizer) + tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")( + mock_tokenizer + ) - content, tool_calls = run_tool_extraction(tool_parser, - model_output, - streaming=streaming) + content, tool_calls = run_tool_extraction( + tool_parser, model_output, streaming=streaming + ) assert len(tool_calls) == len(expected_tool_calls) for actual, expected in zip(tool_calls, expected_tool_calls): @@ -176,8 +224,9 @@ def test_tool_call(streaming: bool, model_output: str, def test_streaming_tool_call_with_large_steps(): mock_tokenizer = MagicMock() - tool_parser: ToolParser = ToolParserManager.get_tool_parser( - "llama4_pythonic")(mock_tokenizer) + tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")( + mock_tokenizer + ) model_output_deltas = [ "<|python_start|>[get_weather(city='LA', metric='C'), " "get_weather(), " @@ -185,7 +234,8 @@ def test_streaming_tool_call_with_large_steps(): ] reconstructor = run_tool_extraction_streaming( - tool_parser, model_output_deltas, assert_one_tool_per_delta=False) + tool_parser, model_output_deltas, assert_one_tool_per_delta=False + ) assert reconstructor.other_content == "" assert len(reconstructor.tool_calls) == 3 @@ -198,8 +248,9 @@ def test_streaming_tool_call_with_large_steps(): 
def test_regex_timeout_handling(streaming: bool): """test regex timeout is handled gracefully""" mock_tokenizer = MagicMock() - tool_parser: ToolParser = ToolParserManager.get_tool_parser( - "llama4_pythonic")(mock_tokenizer) + tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")( + mock_tokenizer + ) fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2 @@ -207,10 +258,10 @@ def test_regex_timeout_handling(streaming: bool): mock_regex = MagicMock() mock_regex.match.side_effect = TimeoutError("Regex timeout") - with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex): - content, tool_calls = run_tool_extraction(tool_parser, - fake_problematic_input, - streaming=streaming) + with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex): + content, tool_calls = run_tool_extraction( + tool_parser, fake_problematic_input, streaming=streaming + ) # should treat as regular text when regex times out assert content == fake_problematic_input diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index d83137472598..ccd6abbac4c9 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -6,7 +6,9 @@ import pytest from tests.entrypoints.openai.tool_parsers.utils import ( - run_tool_extraction, run_tool_extraction_streaming) + run_tool_extraction, + run_tool_extraction_streaming, +) from vllm.entrypoints.openai.protocol import FunctionCall from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager @@ -22,7 +24,8 @@ "address={'city': 'San Francisco', 'state': 'CA'}, " "role=None, " "passed_test=True, " - "aliases=['John', 'Johnny'])") + "aliases=['John', 'Johnny'])" +) MORE_TYPES_FUNCTION_CALL = FunctionCall( name="register_user", arguments='{"name": "John Doe", ' @@ -35,7 +38,7 @@ PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()" PARAMETERLESS_FUNCTION_CALL = FunctionCall( name="get_weather", - arguments='{}', + arguments="{}", ) EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})" EMPTY_DICT_FUNCTION_CALL = FunctionCall( @@ -48,7 +51,8 @@ arguments='{"steps": []}', ) ESCAPED_STRING_FUNCTION_OUTPUT = ( - r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')") + r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')" +) ESCAPED_STRING_FUNCTION_CALL = FunctionCall( name="get_weather", arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}', @@ -59,80 +63,118 @@ def test_no_tool_call(streaming: bool): mock_tokenizer = MagicMock() tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( - mock_tokenizer) + mock_tokenizer + ) model_output = "How can I help you today?" 
- content, tool_calls = run_tool_extraction(tool_parser, - model_output, - streaming=streaming) + content, tool_calls = run_tool_extraction( + tool_parser, model_output, streaming=streaming + ) assert content == model_output assert len(tool_calls) == 0 TEST_CASES = [ - pytest.param(True, - f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL], - id="simple_streaming"), - pytest.param(False, - f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL], - id="simple_nonstreaming"), - pytest.param(True, - f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL], - id="more_types_streaming"), - pytest.param(False, - f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL], - id="more_types_nonstreaming"), - pytest.param(True, - f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", - [PARAMETERLESS_FUNCTION_CALL], - id="parameterless_streaming"), - pytest.param(False, - f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", - [PARAMETERLESS_FUNCTION_CALL], - id="parameterless_nonstreaming"), - pytest.param(True, - f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL], - id="empty_dict_streaming"), - pytest.param(False, - f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL], - id="empty_dict_nonstreaming"), - pytest.param(True, - f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL], - id="empty_list_streaming"), - pytest.param(False, - f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL], - id="empty_list_nonstreaming"), - pytest.param(True, - f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", - [ESCAPED_STRING_FUNCTION_CALL], - id="escaped_string_streaming"), - pytest.param(False, - f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", - [ESCAPED_STRING_FUNCTION_CALL], - id="escaped_string_nonstreaming"), - pytest.param(True, - f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", - [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], - id="parallel_calls_streaming"), - pytest.param(False, - f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", - [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], - id="parallel_calls_nonstreaming"), + pytest.param( + True, + f"[{SIMPLE_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL], + id="simple_streaming", + ), + pytest.param( + False, + f"[{SIMPLE_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL], + id="simple_nonstreaming", + ), + pytest.param( + True, + f"[{MORE_TYPES_FUNCTION_OUTPUT}]", + [MORE_TYPES_FUNCTION_CALL], + id="more_types_streaming", + ), + pytest.param( + False, + f"[{MORE_TYPES_FUNCTION_OUTPUT}]", + [MORE_TYPES_FUNCTION_CALL], + id="more_types_nonstreaming", + ), + pytest.param( + True, + f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_streaming", + ), + pytest.param( + False, + f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_nonstreaming", + ), + pytest.param( + True, + f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", + [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_streaming", + ), + pytest.param( + False, + f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", + [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_nonstreaming", + ), + pytest.param( + True, + f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", + [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_streaming", + ), + pytest.param( + False, + f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", + [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_nonstreaming", + ), + pytest.param( + True, + f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_streaming", + ), + pytest.param( + False, + f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", + [ESCAPED_STRING_FUNCTION_CALL], 
+ id="escaped_string_nonstreaming", + ), + pytest.param( + True, + f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_streaming", + ), + pytest.param( + False, + f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_nonstreaming", + ), ] -@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", - TEST_CASES) -def test_tool_call(streaming: bool, model_output: str, - expected_tool_calls: list[FunctionCall]): +@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES) +def test_tool_call( + streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall] +): mock_tokenizer = MagicMock() tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( - mock_tokenizer) + mock_tokenizer + ) - content, tool_calls = run_tool_extraction(tool_parser, - model_output, - streaming=streaming) + content, tool_calls = run_tool_extraction( + tool_parser, model_output, streaming=streaming + ) assert content is None assert len(tool_calls) == len(expected_tool_calls) @@ -144,7 +186,8 @@ def test_tool_call(streaming: bool, model_output: str, def test_streaming_tool_call_with_large_steps(): mock_tokenizer = MagicMock() tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( - mock_tokenizer) + mock_tokenizer + ) model_output_deltas = [ "[get_weather(city='San", " Francisco', metric='celsius'), " @@ -153,7 +196,8 @@ def test_streaming_tool_call_with_large_steps(): ] reconstructor = run_tool_extraction_streaming( - tool_parser, model_output_deltas, assert_one_tool_per_delta=False) + tool_parser, model_output_deltas, assert_one_tool_per_delta=False + ) assert reconstructor.other_content == "" assert len(reconstructor.tool_calls) == 3 @@ -166,8 +210,9 @@ def test_streaming_tool_call_with_large_steps(): def test_regex_timeout_handling(streaming: bool): """test regex timeout is handled gracefully""" mock_tokenizer = MagicMock() - tool_parser: ToolParser = ToolParserManager.get_tool_parser( - "llama4_pythonic")(mock_tokenizer) + tool_parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")( + mock_tokenizer + ) fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2 @@ -175,10 +220,10 @@ def test_regex_timeout_handling(streaming: bool): mock_regex = MagicMock() mock_regex.match.side_effect = TimeoutError("Regex timeout") - with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex): - content, tool_calls = run_tool_extraction(tool_parser, - fake_problematic_input, - streaming=streaming) + with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex): + content, tool_calls = run_tool_extraction( + tool_parser, fake_problematic_input, streaming=streaming + ) # should treat as regular text when regex times out assert content == fake_problematic_input diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index e1b41f45f554..cfa4d3584e70 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -4,15 +4,17 @@ from collections.abc import Iterable from typing import Union -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage, - ExtractedToolCallInformation, - FunctionCall, ToolCall) +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaMessage, + ExtractedToolCallInformation, + FunctionCall, + 
ToolCall, +) from vllm.entrypoints.openai.tool_parsers import ToolParser class StreamingToolReconstructor: - def __init__(self, assert_one_tool_per_delta: bool = True): self.tool_calls: list[ToolCall] = [] self.other_content: str = "" @@ -23,49 +25,60 @@ def append_delta(self, delta: DeltaMessage): self.other_content += delta.content else: assert delta.tool_calls, ( - "Streaming results should have either content or tool calls " - "(or both)") + "Streaming results should have either content or tool calls (or both)" + ) if self._assert_one_tool_per_delta: # Note: This isn't strictly required by the API and may not be # possible to adhere to depending on the token space and number of # tokens per streamed response from the model, but it is required # by tool_use tests, so we enforce it here by default also. assert len(delta.tool_calls) < 2, ( - "Streaming should include only one tool call per update.") + "Streaming should include only one tool call per update." + ) for call_delta in delta.tool_calls: assert call_delta.type is None or call_delta.type == "function", ( "Streaming tool calls should only emit function calls. Got " - f"{call_delta.type}") - current_tool_call = self.tool_calls[ - call_delta.index] if call_delta.index < len( - self.tool_calls) else None + f"{call_delta.type}" + ) + current_tool_call = ( + self.tool_calls[call_delta.index] + if call_delta.index < len(self.tool_calls) + else None + ) if current_tool_call: - assert (not call_delta.function.name), ( + assert not call_delta.function.name, ( "Streaming tool calls should emit the full function name " - f"exactly once. Got {call_delta.function.name}") - assert (not call_delta.id), ( + f"exactly once. Got {call_delta.function.name}" + ) + assert not call_delta.id, ( "Streaming tool calls must emit function id only once. Got " - f"{call_delta.id}") - assert (call_delta.index == len(self.tool_calls) - 1), ( + f"{call_delta.id}" + ) + assert call_delta.index == len(self.tool_calls) - 1, ( f"Incorrect index for tool delta. Got {call_delta.index}, " - f"expected {len(self.tool_calls) - 1}") - current_tool_call.function.arguments += ( - call_delta.function.arguments) + f"expected {len(self.tool_calls) - 1}" + ) + current_tool_call.function.arguments += call_delta.function.arguments else: assert call_delta.id is not None, ( - "Streaming tool calls must have an id on first appearance") + "Streaming tool calls must have an id on first appearance" + ) assert call_delta.function.name is not None, ( - "Streaming tool calls must have a function name on first " - "appearance") + "Streaming tool calls must have a function name on first appearance" + ) assert call_delta.index == len(self.tool_calls), ( f"Incorrect index for tool delta. 
Got {call_delta.index}, " - f"expected {len(self.tool_calls)}") + f"expected {len(self.tool_calls)}" + ) self.tool_calls.append( - ToolCall(id=call_delta.id, - function=FunctionCall( - name=call_delta.function.name, - arguments=call_delta.function.arguments - or ""))) + ToolCall( + id=call_delta.id, + function=FunctionCall( + name=call_delta.function.name, + arguments=call_delta.function.arguments or "", + ), + ) + ) def run_tool_extraction( @@ -80,11 +93,11 @@ def run_tool_extraction( tool_parser, model_output, request, - assert_one_tool_per_delta=assert_one_tool_per_delta) + assert_one_tool_per_delta=assert_one_tool_per_delta, + ) return reconstructor.other_content or None, reconstructor.tool_calls else: - extracted = run_tool_extraction_nonstreaming(tool_parser, model_output, - request) + extracted = run_tool_extraction_nonstreaming(tool_parser, model_output, request) assert extracted.tools_called == bool(extracted.tool_calls) return extracted.content, extracted.tool_calls @@ -92,7 +105,7 @@ def run_tool_extraction( def run_tool_extraction_nonstreaming( tool_parser: ToolParser, model_output: str, - request: Union[ChatCompletionRequest, None] = None + request: Union[ChatCompletionRequest, None] = None, ) -> ExtractedToolCallInformation: request = request or ChatCompletionRequest(messages=[], model="test-model") return tool_parser.extract_tool_calls(model_output, request) @@ -106,7 +119,8 @@ def run_tool_extraction_streaming( ) -> StreamingToolReconstructor: request = request or ChatCompletionRequest(messages=[], model="test-model") reconstructor = StreamingToolReconstructor( - assert_one_tool_per_delta=assert_one_tool_per_delta) + assert_one_tool_per_delta=assert_one_tool_per_delta + ) previous_text = "" previous_tokens: list[int] = [] for delta in model_deltas: @@ -118,8 +132,14 @@ def run_tool_extraction_streaming( current_text = previous_text + delta current_tokens = previous_tokens + token_delta delta_message = tool_parser.extract_tool_calls_streaming( - previous_text, current_text, delta, previous_tokens, - current_tokens, token_delta, request) + previous_text, + current_text, + delta, + previous_tokens, + current_tokens, + token_delta, + request, + ) if delta_message is not None: reconstructor.append_delta(delta_message) previous_text = current_text diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index e4af60a78265..921bfb563f02 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -10,8 +10,7 @@ import pytest -from vllm.v1.utils import (APIServerProcessManager, - wait_for_completion_or_failure) +from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure # Global variables to control worker behavior WORKER_RUNTIME_SECONDS = 0.5 @@ -30,26 +29,22 @@ def api_server_args(): """Fixture to provide arguments for APIServerProcessManager.""" sock = socket.socket() return { - "target_server_fn": - mock_run_api_server_worker, - "listen_address": - "localhost:8000", - "sock": - sock, - "args": - "test_args", # Simple string to avoid pickling issues - "num_servers": - 3, + "target_server_fn": mock_run_api_server_worker, + "listen_address": "localhost:8000", + "sock": sock, + "args": "test_args", # Simple string to avoid pickling issues + "num_servers": 3, "input_addresses": [ - "tcp://127.0.0.1:5001", "tcp://127.0.0.1:5002", - "tcp://127.0.0.1:5003" + "tcp://127.0.0.1:5001", + "tcp://127.0.0.1:5002", + "tcp://127.0.0.1:5003", 
], "output_addresses": [ - "tcp://127.0.0.1:6001", "tcp://127.0.0.1:6002", - "tcp://127.0.0.1:6003" + "tcp://127.0.0.1:6001", + "tcp://127.0.0.1:6002", + "tcp://127.0.0.1:6003", ], - "stats_update_address": - "tcp://127.0.0.1:7000", + "stats_update_address": "tcp://127.0.0.1:7000", } @@ -95,8 +90,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update): assert not proc.is_alive() -@patch("vllm.entrypoints.cli.serve.run_api_server_worker", - mock_run_api_server_worker) +@patch("vllm.entrypoints.cli.serve.run_api_server_worker", mock_run_api_server_worker) def test_wait_for_completion_or_failure(api_server_args): """Test that wait_for_completion_or_failure works with failures.""" global WORKER_RUNTIME_SECONDS @@ -118,8 +112,7 @@ def run_with_exception_capture(): result["exception"] = e # Start a thread to run wait_for_completion_or_failure - wait_thread = threading.Thread(target=run_with_exception_capture, - daemon=True) + wait_thread = threading.Thread(target=run_with_exception_capture, daemon=True) wait_thread.start() # Let all processes run for a short time @@ -174,8 +167,7 @@ def test_normal_completion(api_server_args): # Verify all processes have terminated for i, proc in enumerate(manager.processes): - assert not proc.is_alive( - ), f"Process {i} still alive after terminate()" + assert not proc.is_alive(), f"Process {i} still alive after terminate()" # Now call wait_for_completion_or_failure # since all processes have already @@ -198,13 +190,13 @@ def test_external_process_monitoring(api_server_args): # Create and start the external process # (simulates local_engine_manager or coordinator) spawn_context = multiprocessing.get_context("spawn") - external_proc = spawn_context.Process(target=mock_run_api_server_worker, - name="MockExternalProcess") + external_proc = spawn_context.Process( + target=mock_run_api_server_worker, name="MockExternalProcess" + ) external_proc.start() # Create the class to simulate a coordinator class MockCoordinator: - def __init__(self, proc): self.proc = proc @@ -228,14 +220,14 @@ def close(self): def run_with_exception_capture(): try: - wait_for_completion_or_failure(api_server_manager=manager, - coordinator=mock_coordinator) + wait_for_completion_or_failure( + api_server_manager=manager, coordinator=mock_coordinator + ) except Exception as e: result["exception"] = e # Start a thread to run wait_for_completion_or_failure - wait_thread = threading.Thread(target=run_with_exception_capture, - daemon=True) + wait_thread = threading.Thread(target=run_with_exception_capture, daemon=True) wait_thread.start() # Terminate the external process to trigger a failure @@ -246,21 +238,23 @@ def run_with_exception_capture(): wait_thread.join(timeout=1.0) # The wait thread should have completed - assert not wait_thread.is_alive( - ), "wait_for_completion_or_failure thread still running" + assert not wait_thread.is_alive(), ( + "wait_for_completion_or_failure thread still running" + ) # Verify that an exception was raised with appropriate error message assert result["exception"] is not None, "No exception was raised" error_message = str(result["exception"]) - assert "died with exit code" in error_message, \ + assert "died with exit code" in error_message, ( f"Unexpected error message: {error_message}" - assert "MockExternalProcess" in error_message, \ + ) + assert "MockExternalProcess" in error_message, ( f"Error doesn't mention external process: {error_message}" + ) # Verify that all API server processes were terminated as a result for i, proc in 
enumerate(manager.processes): - assert not proc.is_alive( - ), f"API server process {i} was not terminated" + assert not proc.is_alive(), f"API server process {i} was not terminated" finally: # Clean up diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index e321ca70001d..e189d9585f6c 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -11,15 +11,21 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, - parse_chat_messages, - parse_chat_messages_futures, - resolve_chat_template_content_format, - resolve_hf_chat_template) +from vllm.entrypoints.chat_utils import ( + _try_extract_ast, + load_chat_template, + parse_chat_messages, + parse_chat_messages_futures, + resolve_chat_template_content_format, + resolve_hf_chat_template, +) from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict -from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64, - encode_video_base64) +from vllm.multimodal.utils import ( + encode_audio_base64, + encode_image_base64, + encode_video_base64, +) from vllm.transformers_utils.tokenizer_group import TokenizerGroup from ..models.registry import HF_EXAMPLE_MODELS @@ -41,31 +47,35 @@ @pytest.fixture(scope="function") def phi3v_model_config(): - return ModelConfig(PHI3V_MODEL_ID, - task="generate", - tokenizer=PHI3V_MODEL_ID, - tokenizer_mode="auto", - trust_remote_code=True, - dtype="auto", - seed=0, - limit_mm_per_prompt={ - "image": 2, - }) + return ModelConfig( + PHI3V_MODEL_ID, + task="generate", + tokenizer=PHI3V_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="auto", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }, + ) @pytest.fixture(scope="function") def phi3v_model_config_mm_interleaved(): - return ModelConfig(PHI3V_MODEL_ID, - task="generate", - tokenizer=PHI3V_MODEL_ID, - tokenizer_mode="auto", - trust_remote_code=True, - dtype="auto", - seed=0, - interleave_mm_strings=True, - limit_mm_per_prompt={ - "image": 2, - }) + return ModelConfig( + PHI3V_MODEL_ID, + task="generate", + tokenizer=PHI3V_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="auto", + seed=0, + interleave_mm_strings=True, + limit_mm_per_prompt={ + "image": 2, + }, + ) @pytest.fixture(scope="module") @@ -80,18 +90,20 @@ def phi3v_tokenizer(): @pytest.fixture(scope="function") def qwen25omni_model_config_mm_interleaved(): - return ModelConfig(QWEN25OMNI_MODEL_ID, - task="generate", - tokenizer=QWEN25OMNI_MODEL_ID, - tokenizer_mode="auto", - dtype="auto", - seed=0, - interleave_mm_strings=True, - limit_mm_per_prompt={ - "image": 2, - "audio": 1, - "video": 1, - }) + return ModelConfig( + QWEN25OMNI_MODEL_ID, + task="generate", + tokenizer=QWEN25OMNI_MODEL_ID, + tokenizer_mode="auto", + dtype="auto", + seed=0, + interleave_mm_strings=True, + limit_mm_per_prompt={ + "image": 2, + "audio": 1, + "video": 1, + }, + ) @pytest.fixture(scope="module") @@ -106,16 +118,18 @@ def qwen25omni_tokenizer(): @pytest.fixture(scope="module") def mllama_model_config(): - return ModelConfig(MLLAMA_MODEL_ID, - task="generate", - tokenizer=MLLAMA_MODEL_ID, - tokenizer_mode="auto", - trust_remote_code=True, - dtype="auto", - seed=0, - limit_mm_per_prompt={ - "image": 2, - }) + return ModelConfig( + MLLAMA_MODEL_ID, + task="generate", + tokenizer=MLLAMA_MODEL_ID, + 
tokenizer_mode="auto", + trust_remote_code=True, + dtype="auto", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }, + ) @pytest.fixture(scope="module") @@ -130,16 +144,18 @@ def mllama_tokenizer(): @pytest.fixture(scope="function") def mistral_model_config(): - return ModelConfig(MISTRAL_MODEL_ID, - task="generate", - tokenizer=MISTRAL_MODEL_ID, - tokenizer_mode="auto", - trust_remote_code=True, - dtype="auto", - seed=0, - limit_mm_per_prompt={ - "image": 2, - }) + return ModelConfig( + MISTRAL_MODEL_ID, + task="generate", + tokenizer=MISTRAL_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="auto", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }, + ) @pytest.fixture(scope="module") @@ -154,21 +170,21 @@ def mistral_tokenizer(): @pytest.fixture(scope="module") def image_url(): - image = ImageAsset('cherry_blossom') + image = ImageAsset("cherry_blossom") base64 = encode_image_base64(image.pil_image) return f"data:image/jpeg;base64,{base64}" @pytest.fixture(scope="module") def video_url(): - video = VideoAsset('baby_reading', 1) + video = VideoAsset("baby_reading", 1) base64 = encode_video_base64(video.np_ndarrays) return f"data:video/jpeg;base64,{base64}" @pytest.fixture(scope="module") def audio_url(): - audio = AudioAsset('mary_had_lamb') + audio = AudioAsset("mary_had_lamb") base64 = encode_audio_base64(*audio.audio_and_sample_rate) return f"data:audio/ogg;base64,{base64}" @@ -209,28 +225,23 @@ def test_parse_chat_messages_single_image( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What's in the image?"}, + ], + } + ], phi3v_model_config, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": "user", - "content": "<|image_1|>\nWhat's in the image?" - }] + assert conversation == [ + {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} + ] _assert_mm_data_is_image_input(mm_data, 1) @@ -240,58 +251,33 @@ def test_parse_chat_messages_empty_system( ): # Test string format conversation, _ = parse_chat_messages( - [{ - "role": "system", - "content": "" - }, { - "role": "user", - "content": [{ - "type": "text", - "text": "Who are you?" - }] - }], + [ + {"role": "system", "content": ""}, + {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]}, + ], mistral_model_config, mistral_tokenizer, content_format="string", ) - assert conversation == [{ - "role": "system", - "content": "" - }, { - "role": "user", - "content": "Who are you?" - }] + assert conversation == [ + {"role": "system", "content": ""}, + {"role": "user", "content": "Who are you?"}, + ] # Test openai format conversation, _ = parse_chat_messages( - [{ - "role": "system", - "content": "" - }, { - "role": "user", - "content": [{ - "type": "text", - "text": "Who are you?" - }] - }], + [ + {"role": "system", "content": ""}, + {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]}, + ], mistral_model_config, mistral_tokenizer, content_format="openai", ) - assert conversation == [{ - "role": "system", - "content": [{ - "type": "text", - "text": "" - }] - }, { - "role": - "user", - "content": [{ - "type": "text", - "text": "Who are you?" 
- }] - }] + assert conversation == [ + {"role": "system", "content": [{"type": "text", "text": ""}]}, + {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]}, + ] @pytest.mark.asyncio @@ -301,28 +287,23 @@ async def test_parse_chat_messages_single_image_async( image_url, ): conversation, mm_future = parse_chat_messages_futures( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What's in the image?"}, + ], + } + ], phi3v_model_config, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": "user", - "content": "<|image_1|>\nWhat's in the image?" - }] + assert conversation == [ + {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} + ] _assert_mm_data_is_image_input(await mm_future, 1) @@ -332,33 +313,27 @@ def test_parse_chat_messages_multiple_images( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_pil", - "image_pil": ImageAsset('cherry_blossom').pil_image - }, { - "type": "text", - "text": "What's in these images?" - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + }, + {"type": "text", "text": "What's in these images?"}, + ], + } + ], phi3v_model_config, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "<|image_1|>\n<|image_2|>\nWhat's in these images?" - }] + assert conversation == [ + {"role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?"} + ] _assert_mm_data_is_image_input(mm_data, 2) @@ -369,33 +344,27 @@ async def test_parse_chat_messages_multiple_images_async( image_url, ): conversation, mm_future = parse_chat_messages_futures( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_pil", - "image_pil": ImageAsset('cherry_blossom').pil_image - }, { - "type": "text", - "text": "What's in these images?" - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + }, + {"type": "text", "text": "What's in these images?"}, + ], + } + ], phi3v_model_config, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "<|image_1|>\n<|image_2|>\nWhat's in these images?" - }] + assert conversation == [ + {"role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?"} + ] _assert_mm_data_is_image_input(await mm_future, 2) @@ -405,36 +374,29 @@ def test_parse_chat_messages_placeholder_already_in_prompt( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to <|image_2|>?" 
- }] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "text", + "text": "What's in <|image_1|> and how does it compare to <|image_2|>?", + }, + ], + } + ], phi3v_model_config, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "What's in <|image_1|> and how does it compare to <|image_2|>?" - }] + assert conversation == [ + { + "role": "user", + "content": "What's in <|image_1|> and how does it compare to <|image_2|>?", + } + ] _assert_mm_data_is_image_input(mm_data, 2) @@ -444,42 +406,31 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to the other one?" # noqa: E501 - } - ] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "text", + "text": "What's in <|image_1|> and how does it compare to the other one?", # noqa: E501 + }, + ], + } + ], phi3v_model_config, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the " - "other one?" - }] + assert conversation == [ + { + "role": "user", + "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the " + "other one?", + } + ] _assert_mm_data_is_image_input(mm_data, 2) @@ -489,52 +440,32 @@ def test_parse_chat_messages_multiple_images_across_messages( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What about this one?" - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What's in this image?"}, + ], + }, + {"role": "assistant", "content": "Some stuff."}, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What about this one?"}, + ], + }, + ], phi3v_model_config, phi3v_tokenizer, content_format="string", ) assert conversation == [ - { - "role": "user", - "content": "<|image_1|>\nWhat's in this image?" - }, - { - "role": "assistant", - "content": "Some stuff." - }, - { - "role": "user", - "content": "<|image_2|>\nWhat about this one?" - }, + {"role": "user", "content": "<|image_1|>\nWhat's in this image?"}, + {"role": "assistant", "content": "Some stuff."}, + {"role": "user", "content": "<|image_2|>\nWhat about this one?"}, ] _assert_mm_data_is_image_input(mm_data, 2) @@ -544,46 +475,23 @@ def test_parse_chat_messages_context_text_format( phi3v_tokenizer, ): conversation, mm_data = parse_chat_messages( - [{ - "role": "user", - "content": [{ - "type": "text", - "text": "What's in this text?" 
- }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": "user", - "content": "What about this one?" - }], + [ + { + "role": "user", + "content": [{"type": "text", "text": "What's in this text?"}], + }, + {"role": "assistant", "content": "Some stuff."}, + {"role": "user", "content": "What about this one?"}, + ], phi3v_model_config, phi3v_tokenizer, content_format="openai", ) assert conversation == [ - { - "role": "user", - "content": [{ - "type": "text", - "text": "What's in this text?" - }] - }, - { - "role": "assistant", - "content": [{ - "type": "text", - "text": "Some stuff." - }] - }, - { - "role": "user", - "content": [{ - "type": "text", - "text": "What about this one?" - }] - }, + {"role": "user", "content": [{"type": "text", "text": "What's in this text?"}]}, + {"role": "assistant", "content": [{"type": "text", "text": "Some stuff."}]}, + {"role": "user", "content": [{"type": "text", "text": "What about this one?"}]}, ] @@ -594,36 +502,23 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ): with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", - message="coroutine 'async_get_and_parse_image' was never awaited") + "ignore", message="coroutine 'async_get_and_parse_image' was never awaited" + ) with pytest.raises( - ValueError, - match="At most 2 image\\(s\\) may be provided in one request\\." + ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): parse_chat_messages( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What's in these images?"}, + ], + } + ], phi3v_model_config, phi3v_tokenizer, content_format="string", @@ -637,46 +532,30 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ): with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", - message="coroutine 'async_get_and_parse_image' was never awaited") + "ignore", message="coroutine 'async_get_and_parse_image' was never awaited" + ) with pytest.raises( - ValueError, - match="At most 2 image\\(s\\) may be provided in one request\\." + ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): parse_chat_messages( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What about these two?" 
- }] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What's in this image?"}, + ], + }, + {"role": "assistant", "content": "Some stuff."}, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What about these two?"}, + ], + }, + ], phi3v_model_config, phi3v_tokenizer, content_format="string", @@ -689,28 +568,24 @@ def test_parse_chat_messages_multiple_images_uncommon_input( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [ - "What's in these images?", { - "image_url": image_url - }, { - "image_url": image_url - } - ] - }], + [ + { + "role": "user", + "content": [ + "What's in these images?", + {"image_url": image_url}, + {"image_url": image_url}, + ], + } + ], phi3v_model_config, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "<|image_1|>\n<|image_2|>\nWhat's in these images?" - }] + assert conversation == [ + {"role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?"} + ] _assert_mm_data_is_image_input(mm_data, 2) @@ -720,42 +595,30 @@ def test_parse_chat_messages_multiple_images_interleave( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [{ - "type": "text", - "text": "I need you to compare this image" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "and this one" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "Do they have differences?" - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "I need you to compare this image"}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "and this one"}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Do they have differences?"}, + ], + } + ], phi3v_model_config_mm_interleaved, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 - "Do they have differences?" - }] + assert conversation == [ + { + "role": "user", + "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 + "Do they have differences?", + } + ] _assert_mm_data_is_image_input(mm_data, 2) @@ -766,42 +629,30 @@ async def test_parse_chat_messages_multiple_images_interleave_async( image_url, ): conversation, mm_data = parse_chat_messages_futures( - [{ - "role": - "user", - "content": [{ - "type": "text", - "text": "I need you to compare this image" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "and this one" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "Do they have differences?" 
- }] - }], + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "I need you to compare this image"}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "and this one"}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Do they have differences?"}, + ], + } + ], phi3v_model_config_mm_interleaved, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 - "Do they have differences?" - }] + assert conversation == [ + { + "role": "user", + "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 + "Do they have differences?", + } + ] _assert_mm_data_is_image_input(await mm_data, 2) @@ -811,135 +662,84 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's on this image?" - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "Be accurate." - }, - ] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "text", - "text": "What's on this image?" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's on this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Be accurate."}, + ], + }, + {"role": "assistant", "content": "Some stuff."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What's on this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }, + ], phi3v_model_config_mm_interleaved, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "What's on this image?\n<|image_1|>\nBe accurate." - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": "user", - "content": "What's on this image?\n<|image_2|>" - }] + assert conversation == [ + {"role": "user", "content": "What's on this image?\n<|image_1|>\nBe accurate."}, + {"role": "assistant", "content": "Some stuff."}, + {"role": "user", "content": "What's on this image?\n<|image_2|>"}, + ] _assert_mm_data_is_image_input(mm_data, 2) def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( - qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer, - image_url, video_url, audio_url): + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's on this image?" - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "Now listen to this audio" - }, - { - "type": "audio_url", - "audio_url": { - "url": audio_url - } - }, - ] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "text", - "text": "What's on this image?" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "And what's in the video?" 
- }, { - "type": "video_url", - "video_url": { - "url": video_url - } - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's on this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Now listen to this audio"}, + {"type": "audio_url", "audio_url": {"url": audio_url}}, + ], + }, + {"role": "assistant", "content": "Some stuff."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "What's on this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "And what's in the video?"}, + {"type": "video_url", "video_url": {"url": video_url}}, + ], + }, + ], qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" - "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>" - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": - "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" - "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>" - }] + assert conversation == [ + { + "role": "user", + "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", + }, + {"role": "assistant", "content": "Some stuff."}, + { + "role": "user", + "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) @@ -950,35 +750,25 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( image_url, ): with pytest.raises( - ValueError, - match=r"Found more '<|image_1|>' placeholders in input prompt " - "than actual multimodal data items."): + ValueError, + match=r"Found more '<|image_1|>' placeholders in input prompt " + "than actual multimodal data items.", + ): parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": - "text", - "text": - "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 - "Do they have differences?" 
- }, - ] - }], + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "text", + "text": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 + "Do they have differences?", + }, + ], + } + ], phi3v_model_config_mm_interleaved, phi3v_tokenizer, content_format="string", @@ -993,31 +783,29 @@ def test_mllama_single_image( ): """Ensures that a single image is parsed correctly mllama.""" conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [{ - 'type': 'text', - 'text': 'The content of this image is:' - }, { - "image_url": image_url - }] - }], + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "The content of this image is:"}, + {"image_url": image_url}, + ], + } + ], mllama_model_config, mllama_tokenizer, content_format="openai", ) _assert_mm_data_is_image_input(mm_data, 1) - assert conversation == [{ - 'role': - 'user', - 'content': [{ - 'type': 'text', - 'text': 'The content of this image is:' - }, { - 'type': 'image' - }] - }] + assert conversation == [ + { + "role": "user", + "content": [ + {"type": "text", "text": "The content of this image is:"}, + {"type": "image"}, + ], + } + ] def test_mllama_interleaved_images( @@ -1027,46 +815,33 @@ def test_mllama_interleaved_images( ): """Ensures that multiple image are parsed as interleaved dicts.""" conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - 'type': 'text', - 'text': 'The content of the first image is:' - }, - { - "image_url": image_url - }, - { - 'type': 'text', - 'text': 'The content of the second image is:' - }, - { - "image_url": image_url - }, - ] - }], + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "The content of the first image is:"}, + {"image_url": image_url}, + {"type": "text", "text": "The content of the second image is:"}, + {"image_url": image_url}, + ], + } + ], mllama_model_config, mllama_tokenizer, content_format="openai", ) _assert_mm_data_is_image_input(mm_data, 2) - assert conversation == [{ - 'role': - 'user', - 'content': [{ - 'type': 'text', - 'text': 'The content of the first image is:' - }, { - 'type': 'image' - }, { - 'type': 'text', - 'text': 'The content of the second image is:' - }, { - 'type': 'image' - }] - }] + assert conversation == [ + { + "role": "user", + "content": [ + {"type": "text", "text": "The content of the first image is:"}, + {"type": "image"}, + {"type": "text", "text": "The content of the second image is:"}, + {"type": "image"}, + ], + } + ] @pytest.mark.parametrize("model", [MLLAMA_MODEL_ID]) @@ -1076,39 +851,33 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): def get_conversation(is_hf: bool): img_part = {"type": "image_url", "image_url": {"url": image_url}} if is_hf: - img_part = {'type': 'image'} - return [{ - 'role': - 'user', - 'content': [ - { - 'type': 'text', - 'text': 'The content of the first image is:' - }, - img_part, - { - 'type': 'text', - 'text': 'The content of the second image is:' - }, - img_part, - { - 'type': 'text', - 'text': 'What animal is in the first image?' 
- }, - ] - }] + img_part = {"type": "image"} + return [ + { + "role": "user", + "content": [ + {"type": "text", "text": "The content of the first image is:"}, + img_part, + {"type": "text", "text": "The content of the second image is:"}, + img_part, + {"type": "text", "text": "What animal is in the first image?"}, + ], + } + ] # Build a config for the model - model_config = ModelConfig(model, - task="generate", - tokenizer=model, - tokenizer_mode="auto", - trust_remote_code=True, - dtype="auto", - seed=0, - limit_mm_per_prompt={ - "image": 2, - }) + model_config = ModelConfig( + model, + task="generate", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="auto", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }, + ) # Build the tokenizer group and grab the underlying tokenizer tokenizer_group = TokenizerGroup( @@ -1154,7 +923,8 @@ def get_conversation(is_hf: bool): [ QWEN2VL_MODEL_ID, # tokenizer.chat_template is of type str HERMES_MODEL_ID, # tokenizer.chat_template is of type dict - ]) + ], +) @pytest.mark.parametrize("use_tools", [True, False]) def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): """checks that chat_template is a dict type for HF models.""" @@ -1179,14 +949,20 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): ) tokenizer = tokenizer_group.tokenizer - tools = [{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }] if use_tools else None + tools = ( + [ + { + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema, + }, + } + ] + if use_tools + else None + ) # Test detecting the tokenizer's chat_template chat_template = resolve_hf_chat_template( diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/test_ssl_cert_refresher.py index 33ad2cfd3a33..b56fbd9fee7e 100644 --- a/tests/entrypoints/test_ssl_cert_refresher.py +++ b/tests/entrypoints/test_ssl_cert_refresher.py @@ -11,7 +11,6 @@ class MockSSLContext(SSLContext): - def __init__(self): self.load_cert_chain_count = 0 self.load_ca_count = 0 @@ -34,7 +33,7 @@ def load_verify_locations( def create_file() -> str: - with tempfile.NamedTemporaryFile(dir='/tmp', delete=False) as f: + with tempfile.NamedTemporaryFile(dir="/tmp", delete=False) as f: return f.name diff --git a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py index 1b95bf59f67c..4a9b8aa39d70 100644 --- a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py +++ b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py @@ -17,7 +17,6 @@ def test_model_loader_download_files(vllm_runner): - with vllm_runner(test_model, - load_format=LoadFormat.FASTSAFETENSORS) as llm: + with vllm_runner(test_model, load_format=LoadFormat.FASTSAFETENSORS) as llm: deserialized_outputs = llm.generate(prompts, sampling_params) assert deserialized_outputs diff --git a/tests/fastsafetensors_loader/test_weight_utils.py b/tests/fastsafetensors_loader/test_weight_utils.py index 78d23acfec7c..cc899b77b5e9 100644 --- a/tests/fastsafetensors_loader/test_weight_utils.py +++ b/tests/fastsafetensors_loader/test_weight_utils.py @@ -8,24 +8,25 @@ import torch from vllm.model_executor.model_loader.weight_utils import ( - download_weights_from_hf, fastsafetensors_weights_iterator, - safetensors_weights_iterator) + 
download_weights_from_hf, + fastsafetensors_weights_iterator, + safetensors_weights_iterator, +) def test_fastsafetensors_model_loader(): with tempfile.TemporaryDirectory() as tmpdir: huggingface_hub.constants.HF_HUB_OFFLINE = False - download_weights_from_hf("openai-community/gpt2", - allow_patterns=["*.safetensors"], - cache_dir=tmpdir) + download_weights_from_hf( + "openai-community/gpt2", allow_patterns=["*.safetensors"], cache_dir=tmpdir + ) safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True) assert len(safetensors) > 0 fastsafetensors_tensors = {} hf_safetensors_tensors = {} - for name, tensor in fastsafetensors_weights_iterator( - safetensors, True): + for name, tensor in fastsafetensors_weights_iterator(safetensors, True): fastsafetensors_tensors[name] = tensor for name, tensor in safetensors_weights_iterator(safetensors, True): @@ -34,13 +35,10 @@ def test_fastsafetensors_model_loader(): assert len(fastsafetensors_tensors) == len(hf_safetensors_tensors) for name, fastsafetensors_tensor in fastsafetensors_tensors.items(): - fastsafetensors_tensor = fastsafetensors_tensor.to('cpu') - assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[ - name].dtype - assert fastsafetensors_tensor.shape == hf_safetensors_tensors[ - name].shape - assert torch.all( - fastsafetensors_tensor.eq(hf_safetensors_tensors[name])) + fastsafetensors_tensor = fastsafetensors_tensor.to("cpu") + assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[name].dtype + assert fastsafetensors_tensor.shape == hf_safetensors_tensors[name].shape + assert torch.all(fastsafetensors_tensor.eq(hf_safetensors_tensors[name])) if __name__ == "__main__": diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py index 9d65159bf64f..6561e9556fa7 100644 --- a/tests/kernels/allclose_default.py +++ b/tests/kernels/allclose_default.py @@ -6,11 +6,7 @@ # Reference default values of atol and rtol are from # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} -default_rtol = { - torch.float16: 1e-3, - torch.bfloat16: 1.6e-2, - torch.float: 1.3e-6 -} +default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float: 1.3e-6} def get_default_atol(output) -> float: diff --git a/tests/kernels/attention/conftest.py b/tests/kernels/attention/conftest.py index 88a2fb62b254..b080a71bd54e 100644 --- a/tests/kernels/attention/conftest.py +++ b/tests/kernels/attention/conftest.py @@ -3,8 +3,7 @@ import pytest -from vllm.utils import (create_kv_caches_with_random, - create_kv_caches_with_random_flash) +from vllm.utils import create_kv_caches_with_random, create_kv_caches_with_random_flash @pytest.fixture() diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 2e0b4efebfdb..e3823df74f16 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -6,9 +6,9 @@ import pytest import torch - from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck + from vllm import _custom_ops as ops from vllm.attention.layer import Attention, MultiHeadAttention from vllm.platforms import current_platform @@ -30,9 +30,11 @@ PARTITION_SIZE = 512 PARTITION_SIZE_ROCM = 256 # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} -DTYPES = [ - torch.half, torch.bfloat16, torch.float -] if not 
current_platform.is_rocm() else [torch.half, torch.bfloat16] +DTYPES = ( + [torch.half, torch.bfloat16, torch.float] + if not current_platform.is_rocm() + else [torch.half, torch.bfloat16] +) NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing @@ -45,9 +47,7 @@ USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] def ref_masked_attention( @@ -113,8 +113,7 @@ def ref_single_query_cached_kv_attention( # Create the ALiBi bias used in the paged attention kernel. position_ids = torch.arange(seq_len).int() alibi_bias = (position_ids - seq_len + 1).float() - alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view( - 1, 1, -1) + alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(1, 1, -1) out = ref_masked_attention(q, keys, values, scale, alibi_bias) out = out.view(num_query_heads, head_size) @@ -122,8 +121,8 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize( - "version", - ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"]) + "version", ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"] +) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -146,13 +145,18 @@ def test_paged_attention( seed: int, device: str, ) -> None: - if ((kv_cache_dtype == "fp8" and head_size % 16) - or (version == "rocm" and head_size not in (64, 128))): + if (kv_cache_dtype == "fp8" and head_size % 16) or ( + version == "rocm" and head_size not in (64, 128) + ): pytest.skip() - if (version == "rocm" and current_platform.is_navi() - and (kv_cache_dtype == "fp8" or head_size != 128 - or block_size != 16 or use_alibi)): + if ( + version == "rocm" + and current_platform.is_navi() + and ( + kv_cache_dtype == "fp8" or head_size != 128 or block_size != 16 or use_alibi + ) + ): pytest.skip() global PARTITION_SIZE @@ -180,18 +184,24 @@ def test_paged_attention( block_tables_lst: list[list[int]] = [] for _ in range(num_seqs): block_table = [ - random.randint(0, NUM_BLOCKS - 1) - for _ in range(max_num_blocks_per_seq) + random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] block_tables_lst.append(block_table) block_tables = torch.tensor(block_tables_lst, dtype=torch.int) # Create the KV caches. 
- key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, - num_kv_heads, head_size, - kv_cache_dtype, dtype, seed, - device) + key_caches, value_caches = kv_cache_factory( + NUM_BLOCKS, + block_size, + 1, + num_kv_heads, + head_size, + kv_cache_dtype, + dtype, + seed, + device, + ) key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale @@ -217,18 +227,37 @@ def test_paged_attention( v_scale, ) - opcheck(torch.ops._C.paged_attention_v1, - (output, query, key_cache, value_cache, num_kv_heads, scale, - block_tables, seq_lens, block_size, max_seq_len, alibi_slopes, - kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0), - cond=(head_size == HEAD_SIZES[0] - and block_size == BLOCK_SIZES[0])) + opcheck( + torch.ops._C.paged_attention_v1, + ( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + 0, + 0, + 0, + 64, + 0, + ), + cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]), + ) elif version in ("v2", "rocm"): if current_platform.is_rocm() and version == "rocm": PARTITION_SIZE = PARTITION_SIZE_ROCM - num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) + num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE assert PARTITION_SIZE % block_size == 0 num_seqs, num_heads, head_size = output.shape tmp_output = torch.empty( @@ -261,13 +290,34 @@ def test_paged_attention( v_scale, ) - opcheck(torch.ops._C.paged_attention_v2, - (output, exp_sums, max_logits, tmp_output, query, - key_cache, value_cache, num_kv_heads, scale, block_tables, - seq_lens, block_size, max_seq_len, alibi_slopes, - kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0), - cond=(head_size == HEAD_SIZES[0] - and block_size == BLOCK_SIZES[0])) + opcheck( + torch.ops._C.paged_attention_v2, + ( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + 0, + 0, + 0, + 64, + 0, + ), + cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]), + ) else: ops.paged_attention_rocm( @@ -291,13 +341,30 @@ def test_paged_attention( v_scale, ) - opcheck(torch.ops._rocm_C.paged_attention, - (output, exp_sums, max_logits, tmp_output, query, - key_cache, value_cache, num_kv_heads, scale, block_tables, - seq_lens, None, block_size, max_seq_len, alibi_slopes, - kv_cache_dtype, k_scale, v_scale), - cond=(head_size == HEAD_SIZES[0] - and block_size == BLOCK_SIZES[0])) + opcheck( + torch.ops._rocm_C.paged_attention, + ( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + None, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ), + cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]), + ) else: raise AssertionError(f"Unknown version: {version}") @@ -306,18 +373,17 @@ def test_paged_attention( if kv_cache_dtype == "fp8": # Convert cache data back to dtype. 
x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, - block_size, x) - dequantized_key_cache = torch.empty(size=key_cache_shape, - dtype=dtype, - device=device) + key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x) + dequantized_key_cache = torch.empty( + size=key_cache_shape, dtype=dtype, device=device + ) ops.convert_fp8(dequantized_key_cache, key_cache) key_cache = dequantized_key_cache value_cache_shape = value_cache.shape - dequantized_value_cache = torch.empty(size=value_cache_shape, - dtype=dtype, - device=device) + dequantized_value_cache = torch.empty( + size=value_cache_shape, dtype=dtype, device=device + ) ops.convert_fp8(dequantized_value_cache, value_cache) value_cache = dequantized_value_cache @@ -370,8 +436,9 @@ def ref_multi_query_kv_attention( if alibi_bias: attn_mask = alibi_bias[i] else: - attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), - diagonal=1) + attn_mask = torch.triu( + torch.ones(seq_len, seq_len, dtype=dtype), diagonal=1 + ) attn_mask = attn_mask * torch.finfo(dtype).min attn_mask = attn_mask.to(dtype=dtype) @@ -393,8 +460,9 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif(current_platform.is_rocm(), - reason="Xformers backend is not supported on ROCm.") +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." +) @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, @@ -416,13 +484,11 @@ def test_multi_query_kv_attention( scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads - qkv = torch.empty(num_tokens, - num_query_heads + 2 * num_kv_heads, - head_size, - dtype=dtype) + qkv = torch.empty( + num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype + ) qkv.uniform_(-scale, scale) - query, key, value = qkv.split( - [num_query_heads, num_kv_heads, num_kv_heads], dim=1) + query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1) num_queries_per_kv = num_query_heads // num_kv_heads if num_queries_per_kv > 1: @@ -432,8 +498,7 @@ def test_multi_query_kv_attention( alibi_bias = None if use_alibi: alibi_slopes = torch.randn(num_query_heads, dtype=torch.float) - attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, - seq_lens) + attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens) output = torch.empty_like(query) start = 0 # Dynamic sequence length not supported with custom attn_bias. @@ -445,7 +510,8 @@ def test_multi_query_kv_attention( value[None, start:end], attn_bias=attn_bias[i], p=0.0, - scale=scale) + scale=scale, + ) output[start:end].copy_(out.view_as(query[start:end])) start += seq_len # xformers.AttentionBias to Tensor for use in reference impl. @@ -488,8 +554,9 @@ def test_multi_query_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif(current_platform.is_rocm(), - reason="Xformers backend is not supported on ROCm.") +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." 
+) @torch.inference_mode() def test_multi_query_kv_attention_with_alibi( num_seqs: int, diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 93bf20da4adb..bff77c7868a1 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -15,8 +15,7 @@ @pytest.fixture(autouse=True) def clear_cache(): - """Clear lru cache to ensure each test case runs without caching. - """ + """Clear lru cache to ensure each test case runs without caching.""" _cached_get_attn_backend.cache_clear() @@ -37,7 +36,7 @@ def clear_cache(): "cuda": [16, 64], # CUDA supports both standard and extended block sizes "hip": [16, 1], # HIP requires special handling for block_size=1 # "cpu": [16] # CPU uses fixed block size from test cases - "cpu": [] # FIXME(woosuk): Temporarily disable CPU tests + "cpu": [], # FIXME(woosuk): Temporarily disable CPU tests } @@ -45,12 +44,13 @@ def generate_params(): params = [] for use_mla in [True, False]: for device in ["cuda", "hip", "cpu"]: - backends = DEVICE_MLA_BACKENDS[ - device] if use_mla else DEVICE_REGULAR_ATTN_BACKENDS[device] + backends = ( + DEVICE_MLA_BACKENDS[device] + if use_mla + else DEVICE_REGULAR_ATTN_BACKENDS[device] + ) for name in backends: - block_sizes = DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [ - 16 - ] + block_sizes = DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [16] for block_size in block_sizes: params.append( pytest.param( @@ -58,14 +58,13 @@ def generate_params(): name, use_mla, block_size, - id= - f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}" - )) + id=f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}", + ) + ) return params -@pytest.mark.parametrize("device, name, use_mla, block_size", - generate_params()) +@pytest.mark.parametrize("device, name, use_mla, block_size", generate_params()) @pytest.mark.parametrize("use_v1", [True, False]) def test_env( device: str, @@ -85,57 +84,61 @@ def test_env( if not use_v1: pytest.skip("CPU backend only supports V1") - with patch("vllm.attention.selector.current_platform", - CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, - block_size, False) + with patch("vllm.attention.selector.current_platform", CpuPlatform()): + backend = get_attn_backend( + 16, torch.float16, torch.float16, block_size, False + ) assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "hip": - with patch("vllm.attention.selector.current_platform", - RocmPlatform()): + with patch("vllm.attention.selector.current_platform", RocmPlatform()): if use_mla: # Validate HIP MLA backend-block_size combinations - valid_combination = ( - (name == "TRITON_MLA" and block_size != 1) - or (name == "ROCM_AITER_MLA" and block_size == 1)) + valid_combination = (name == "TRITON_MLA" and block_size != 1) or ( + name == "ROCM_AITER_MLA" and block_size == 1 + ) if valid_combination: - backend = get_attn_backend(16, - torch.float16, - torch.float16, - block_size, - False, - use_mla=use_mla) + backend = get_attn_backend( + 16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla, + ) expected = f"{name}_VLLM_V1" if use_v1 else name assert backend.get_name() == expected else: with pytest.raises(ValueError) as exc_info: - get_attn_backend(16, - torch.float16, - torch.float16, - block_size, - False, - use_mla=use_mla) - assert f"The selected backend, {name}" in str( - exc_info.value) + get_attn_backend( + 16, + torch.float16, + torch.float16, + block_size, + 
False, + use_mla=use_mla, + ) + assert f"The selected backend, {name}" in str(exc_info.value) else: - backend = get_attn_backend(16, - torch.float16, - torch.float16, - block_size, - False, - use_mla=use_mla) + backend = get_attn_backend( + 16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla, + ) expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" assert backend.get_name() == expected elif device == "cuda": - with patch("vllm.attention.selector.current_platform", - CudaPlatform()): + with patch("vllm.attention.selector.current_platform", CudaPlatform()): if use_mla: if name == "FLASHMLA" and block_size == 64: from vllm.attention.backends.flashmla import ( - is_flashmla_supported) + is_flashmla_supported, + ) # only on cuda platforms with specific capability. is_supported, _ = is_flashmla_supported() @@ -144,53 +147,63 @@ def test_env( # if platform is not supported then skip this case. pytest.skip() else: - backend = get_attn_backend(16, - torch.float16, - torch.float16, - block_size, - False, - use_mla=use_mla) + backend = get_attn_backend( + 16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla, + ) expected = f"{name}_VLLM_V1" if use_v1 else name assert backend.get_name() == expected else: - backend = get_attn_backend(16, - torch.float16, - torch.float16, - block_size, - False, - use_mla=use_mla) - expected = ("TRITON_MLA_VLLM_V1" - if use_v1 else "TRITON_MLA") + backend = get_attn_backend( + 16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla, + ) + expected = "TRITON_MLA_VLLM_V1" if use_v1 else "TRITON_MLA" assert backend.get_name() == expected elif name == "FLASHINFER": - backend = get_attn_backend(16, - torch.float16, - torch.float16, - block_size, - False, - use_mla=use_mla) + backend = get_attn_backend( + 16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla, + ) expected = "FLASHINFER_VLLM_V1" if use_v1 else name assert backend.get_name() == expected else: - backend = get_attn_backend(32, - torch.float16, - torch.float16, - block_size, - False, - use_mla=use_mla) + backend = get_attn_backend( + 32, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla, + ) expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name assert backend.get_name() == expected if use_v1: - backend = get_attn_backend(16, - torch.float16, - torch.float16, - block_size, - False, - use_mla=use_mla) + backend = get_attn_backend( + 16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla, + ) assert backend.get_name() == "FLEX_ATTENTION", ( "Should fallback to FlexAttention if head size is " - "not supported by FlashAttention") + "not supported by FlashAttention" + ) @pytest.mark.parametrize("device", ["cpu", "cuda"]) @@ -208,19 +221,14 @@ def test_fp32_fallback( if not use_v1: pytest.skip("CPU backend only supports V1") - with patch("vllm.attention.selector.current_platform", - CpuPlatform()): - backend = get_attn_backend(16, torch.float32, torch.float32, - 16, False) + with patch("vllm.attention.selector.current_platform", CpuPlatform()): + backend = get_attn_backend(16, torch.float32, torch.float32, 16, False) assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "cuda": - with patch("vllm.attention.selector.current_platform", - CudaPlatform()): - backend = get_attn_backend(16, torch.float32, torch.float32, - 16, False) - assert (backend.get_name() == "FLEX_ATTENTION" - if use_v1 else "XFORMERS") + with 
patch("vllm.attention.selector.current_platform", CudaPlatform()): + backend = get_attn_backend(16, torch.float32, torch.float32, 16, False) + assert backend.get_name() == "FLEX_ATTENTION" if use_v1 else "XFORMERS" def test_flash_attn(monkeypatch: pytest.MonkeyPatch): @@ -232,9 +240,7 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) # Unsupported CUDA arch - monkeypatch.setattr(torch.cuda, - "get_device_capability", - lambda _=None: (7, 5)) + monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5)) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL @@ -255,17 +261,17 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): # flash-attn is not installed import sys - original_module = sys.modules.get('vllm_flash_attn') - monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) + + original_module = sys.modules.get("vllm_flash_attn") + monkeypatch.setitem(sys.modules, "vllm_flash_attn", None) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL # Restore the original module if it existed if original_module is not None: - monkeypatch.setitem(sys.modules, 'vllm_flash_attn', - original_module) + monkeypatch.setitem(sys.modules, "vllm_flash_attn", original_module) else: - monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) + monkeypatch.delitem(sys.modules, "vllm_flash_attn", raising=False) # Unsupported head size backend = get_attn_backend(17, torch.float16, None, 16, False) @@ -278,9 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): @pytest.mark.parametrize("use_v1", [True, False]) def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): - - with monkeypatch.context() as m, patch( - "vllm.attention.selector.current_platform", CudaPlatform()): + with ( + monkeypatch.context() as m, + patch("vllm.attention.selector.current_platform", CudaPlatform()), + ): m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) diff --git a/tests/kernels/attention/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py index 9aee818c9956..e0b291f59e82 100644 --- a/tests/kernels/attention/test_blocksparse_attention.py +++ b/tests/kernels/attention/test_blocksparse_attention.py @@ -6,11 +6,12 @@ import pytest import torch - from tests.kernels.allclose_default import get_default_atol, get_default_rtol + from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( - LocalStridedBlockSparseAttn) + LocalStridedBlockSparseAttn, +) from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes @@ -34,7 +35,7 @@ USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] -CUDA_DEVICES = ['cuda:0'] +CUDA_DEVICES = ["cuda:0"] BLOCKSPARSE_LOCAL_BLOCKS = [16] BLOCKSPARSE_VERT_STRIDES = [8] @@ -111,8 +112,7 @@ def ref_single_query_cached_kv_attention( # Create the ALiBi bias used in the paged attention kernel. 
position_ids = torch.arange(seq_len).int() alibi_bias = (position_ids - seq_len + 1).float() - alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view( - 1, 1, -1) + alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(1, 1, -1) if blocksparse_vert_stride >= 1: bsize = blocksparse_block_size @@ -120,19 +120,18 @@ def ref_single_query_cached_kv_attention( vert = blocksparse_vert_stride locals = blocksparse_local_blocks qb = (seq_len - 1) // bsize - attn_mask = q.new_zeros( - (num_query_heads, 1, seq_len)).float() - torch.inf + attn_mask = q.new_zeros((num_query_heads, 1, seq_len)).float() - torch.inf for h in range(num_query_heads): if hsliding >= 0: # slide with q heads bs_offset = (tp_rank * num_query_heads + h) * hsliding + 1 else: # slide with kv heads - bs_offset = (tp_rank * num_kv_heads + - h // num_queries_per_kv) * (-hsliding) + 1 + bs_offset = (tp_rank * num_kv_heads + h // num_queries_per_kv) * ( + -hsliding + ) + 1 for kb in range(qb + 1): kj = kb * bsize - if (qb - kb) < locals or \ - (kb + bs_offset) % vert == 0: - attn_mask[h, 0, kj:min(kj + bsize, seq_len)] = 0 + if (qb - kb) < locals or (kb + bs_offset) % vert == 0: + attn_mask[h, 0, kj : min(kj + bsize, seq_len)] = 0 if alibi_bias is not None: attn_mask += alibi_bias else: @@ -156,8 +155,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS) @pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES) @pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES) -@pytest.mark.parametrize("blocksparse_head_sliding_step", - BLOCKSPARSE_HEADS_SLIDINGS) +@pytest.mark.parametrize("blocksparse_head_sliding_step", BLOCKSPARSE_HEADS_SLIDINGS) def test_paged_attention( kv_cache_factory, version: str, @@ -198,17 +196,23 @@ def test_paged_attention( block_tables = [] for _ in range(num_seqs): block_table = [ - random.randint(0, NUM_BLOCKS - 1) - for _ in range(max_num_blocks_per_seq) + random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] block_tables.append(block_table) block_tables = torch.tensor(block_tables, dtype=torch.int) # Create the KV caches. - key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, - num_kv_heads, head_size, - kv_cache_dtype, dtype, seed, - device) + key_caches, value_caches = kv_cache_factory( + NUM_BLOCKS, + block_size, + 1, + num_kv_heads, + head_size, + kv_cache_dtype, + dtype, + seed, + device, + ) key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale @@ -240,7 +244,7 @@ def test_paged_attention( blocksparse_head_sliding_step=blocksparse_head_sliding_step, ) elif version == "v2": - num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) + num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE assert PARTITION_SIZE % block_size == 0 num_seqs, num_heads, head_size = output.shape tmp_output = torch.empty( @@ -283,18 +287,17 @@ def test_paged_attention( if kv_cache_dtype == "fp8": # Convert cache data back to dtype. 
x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, - block_size, x) - dequantized_key_cache = torch.empty(size=key_cache_shape, - dtype=dtype, - device=device) + key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x) + dequantized_key_cache = torch.empty( + size=key_cache_shape, dtype=dtype, device=device + ) ops.convert_fp8(dequantized_key_cache, key_cache) key_cache = dequantized_key_cache value_cache_shape = value_cache.shape - dequantized_value_cache = torch.empty(size=value_cache_shape, - dtype=dtype, - device=device) + dequantized_value_cache = torch.empty( + size=value_cache_shape, dtype=dtype, device=device + ) ops.convert_fp8(dequantized_value_cache, value_cache) value_cache = dequantized_value_cache @@ -346,8 +349,7 @@ def ref_multi_query_kv_attention( seq_len = end_idx - start_idx # Create attention mask. - attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), - diagonal=1) + attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), diagonal=1) attn_mask = attn_mask * torch.finfo(dtype).min attn_mask = attn_mask.to(dtype=dtype) @@ -401,13 +403,11 @@ def test_varlen_blocksparse_attention_prefill( assert num_query_heads % num_kv_heads == 0 num_queries_per_kv = num_query_heads // num_kv_heads - qkv = torch.empty(num_tokens, - num_query_heads + 2 * num_kv_heads, - head_size, - dtype=dtype) + qkv = torch.empty( + num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype + ) qkv.uniform_(-scale, scale) - query, key, value = qkv.split( - [num_query_heads, num_kv_heads, num_kv_heads], dim=1) + query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1) bs_attn_op = LocalStridedBlockSparseAttn( num_query_heads, @@ -417,13 +417,10 @@ def test_varlen_blocksparse_attention_prefill( block_size=blocksparse_block_size, device=device, dtype=dtype, - homo_head=blocksparse_homo_heads) + homo_head=blocksparse_homo_heads, + ) - output = bs_attn_op(query, - key, - value, - cu_seq_lens.to(device), - sm_scale=scale) + output = bs_attn_op(query, key, value, cu_seq_lens.to(device), sm_scale=scale) if num_queries_per_kv > 1: # Handle MQA and GQA diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 789507615580..fad998b60120 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -5,12 +5,12 @@ import pytest import torch - from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck + from vllm import _custom_ops as ops from vllm.platforms import current_platform -COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] +COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing @@ -32,9 +32,7 @@ NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] # We assume fp8 is always enabled for testing. KV_CACHE_DTYPE = ["auto", "fp8"] @@ -83,24 +81,33 @@ def test_copy_blocks( block_mapping.append((src, dst2)) # Create the KV caches. 
- key_caches, value_caches = kv_cache_factory(num_blocks, block_size, - num_layers, num_heads, - head_size, kv_cache_dtype, - dtype, seed, device) + key_caches, value_caches = kv_cache_factory( + num_blocks, + block_size, + num_layers, + num_heads, + head_size, + kv_cache_dtype, + dtype, + seed, + device, + ) # Clone the KV caches. cloned_key_caches = [key_cache.clone() for key_cache in key_caches] cloned_value_caches = [value_cache.clone() for value_cache in value_caches] # Call the copy blocks kernel. - block_mapping_tensor = torch.tensor(block_mapping, - dtype=torch.int64, - device=device).view(-1, 2) - - opcheck(torch.ops._C_cache_ops.copy_blocks, - (key_caches, value_caches, block_mapping_tensor), - test_utils=DEFAULT_OPCHECK_TEST_UTILS, - cond=(head_size == HEAD_SIZES[0])) + block_mapping_tensor = torch.tensor( + block_mapping, dtype=torch.int64, device=device + ).view(-1, 2) + + opcheck( + torch.ops._C_cache_ops.copy_blocks, + (key_caches, value_caches, block_mapping_tensor), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + cond=(head_size == HEAD_SIZES[0]), + ) ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) # Run the reference implementation. @@ -113,8 +120,7 @@ def test_copy_blocks( # Compare the results. for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): torch.testing.assert_close(key_cache, cloned_key_cache) - for value_cache, cloned_value_cache in zip(value_caches, - cloned_value_caches): + for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches): torch.testing.assert_close(value_cache, cloned_value_cache) @@ -153,10 +159,17 @@ def test_reshape_and_cache( _, key, value = qkv.unbind(dim=1) # Create the KV caches. - key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, - num_heads, head_size, - kv_cache_dtype, dtype, seed, - device) + key_caches, value_caches = kv_cache_factory( + num_blocks, + block_size, + 1, + num_heads, + head_size, + kv_cache_dtype, + dtype, + seed, + device, + ) key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale @@ -174,12 +187,30 @@ def test_reshape_and_cache( cloned_value_cache = value_cache.clone() # Call the reshape_and_cache kernel. 
- opcheck(torch.ops._C_cache_ops.reshape_and_cache, - (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, - k_scale, v_scale), - cond=(head_size == HEAD_SIZES[0])) - ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, - kv_cache_dtype, k_scale, v_scale) + opcheck( + torch.ops._C_cache_ops.reshape_and_cache, + ( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ), + cond=(head_size == HEAD_SIZES[0]), + ) + ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ) if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) @@ -200,14 +231,12 @@ def test_reshape_and_cache( cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8": - torch.testing.assert_close(result_key_cache, - cloned_key_cache, - atol=0.001, - rtol=0.1) - torch.testing.assert_close(result_value_cache, - cloned_value_cache, - atol=0.001, - rtol=0.1) + torch.testing.assert_close( + result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1 + ) + torch.testing.assert_close( + result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1 + ) else: torch.testing.assert_close(key_cache, cloned_key_cache) torch.testing.assert_close(value_cache, cloned_value_cache) @@ -247,15 +276,8 @@ def test_reshape_and_cache_flash( # Create a random slot mapping. num_slots = block_size * num_blocks slot_mapping_lst = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping_lst, - dtype=torch.long, - device=device) - qkv = torch.randn(num_tokens, - 3, - num_heads, - head_size, - dtype=dtype, - device=device) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) + qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype, device=device) _, key, value = qkv.unbind(dim=1) # Create the KV caches. @@ -286,40 +308,57 @@ def permute_and_compact(x): # Clone the KV caches. if kv_cache_dtype == "fp8": - cloned_key_cache = torch.empty_like(key_cache_compact, - dtype=torch.float16) - ops.convert_fp8(cloned_key_cache, key_cache_compact, k_scale.item(), - kv_cache_dtype) - cloned_value_cache = torch.empty_like(value_cache_compact, - dtype=torch.float16) - ops.convert_fp8(cloned_value_cache, value_cache_compact, - v_scale.item(), kv_cache_dtype) + cloned_key_cache = torch.empty_like(key_cache_compact, dtype=torch.float16) + ops.convert_fp8( + cloned_key_cache, key_cache_compact, k_scale.item(), kv_cache_dtype + ) + cloned_value_cache = torch.empty_like(value_cache_compact, dtype=torch.float16) + ops.convert_fp8( + cloned_value_cache, value_cache_compact, v_scale.item(), kv_cache_dtype + ) else: cloned_key_cache = key_cache_compact.clone() cloned_value_cache = value_cache_compact.clone() # Call the reshape_and_cache kernel. 
- opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash, - (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, - k_scale, v_scale), - cond=(head_size == HEAD_SIZES[0])) - ops.reshape_and_cache_flash(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype, k_scale, v_scale) + opcheck( + torch.ops._C_cache_ops.reshape_and_cache_flash, + ( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ), + cond=(head_size == HEAD_SIZES[0]), + ) + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ) key_cache_compact = permute_and_compact(key_cache) value_cache_compact = permute_and_compact(value_cache) if kv_cache_dtype == "fp8": - result_key_cache = torch.empty_like(key_cache_compact, - dtype=torch.float16) - ops.convert_fp8(result_key_cache, - key_cache_compact, - k_scale.item(), - kv_dtype=kv_cache_dtype) - result_value_cache = torch.empty_like(value_cache_compact, - dtype=torch.float16) - ops.convert_fp8(result_value_cache, - value_cache_compact, - v_scale.item(), - kv_dtype=kv_cache_dtype) + result_key_cache = torch.empty_like(key_cache_compact, dtype=torch.float16) + ops.convert_fp8( + result_key_cache, key_cache_compact, k_scale.item(), kv_dtype=kv_cache_dtype + ) + result_value_cache = torch.empty_like(value_cache_compact, dtype=torch.float16) + ops.convert_fp8( + result_value_cache, + value_cache_compact, + v_scale.item(), + kv_dtype=kv_cache_dtype, + ) # Run the reference implementation. block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") @@ -337,14 +376,12 @@ def permute_and_compact(x): cloned_value_cache[block_idx, :, block_offset, :] = value[i] if kv_cache_dtype == "fp8": - torch.testing.assert_close(result_key_cache, - cloned_key_cache, - atol=0.001, - rtol=0.1) - torch.testing.assert_close(result_value_cache, - cloned_value_cache, - atol=0.001, - rtol=0.1) + torch.testing.assert_close( + result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1 + ) + torch.testing.assert_close( + result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1 + ) else: torch.testing.assert_close(key_cache_compact, cloned_key_cache) torch.testing.assert_close(value_cache_compact, cloned_value_cache) @@ -381,8 +418,8 @@ def test_swap_blocks( current_platform.seed_everything(seed) - src_device = device if direction[0] == "cuda" else 'cpu' - dst_device = device if direction[1] == "cuda" else 'cpu' + src_device = device if direction[0] == "cuda" else "cpu" + dst_device = device if direction[1] == "cuda" else "cpu" src_blocks = random.sample(range(num_blocks), num_mappings) # For the same device, mapping must not overlap @@ -393,42 +430,62 @@ def test_swap_blocks( dst_blocks = random.sample(range(num_blocks), num_mappings) block_mapping = list(zip(src_blocks, dst_blocks)) - block_mapping_tensor = torch.tensor(block_mapping, - dtype=torch.int64, - device="cpu").view(-1, 2) + block_mapping_tensor = torch.tensor( + block_mapping, dtype=torch.int64, device="cpu" + ).view(-1, 2) # Create the KV caches on the first device. src_key_caches, src_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, - seed, src_device) + num_blocks, + block_size, + 1, + num_heads, + head_size, + kv_cache_dtype, + dtype, + seed, + src_device, + ) # Create the KV caches on the second device. 
dist_key_caches, dist_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, - seed, dst_device) + num_blocks, + block_size, + 1, + num_heads, + head_size, + kv_cache_dtype, + dtype, + seed, + dst_device, + ) src_key_caches_clone = src_key_caches[0].clone() src_value_caches_clone = src_value_caches[0].clone() # Call the swap_blocks kernel. - do_opcheck = (head_size == HEAD_SIZES[0]) - opcheck(torch.ops._C_cache_ops.swap_blocks, - (src_key_caches[0], dist_key_caches[0], block_mapping_tensor), - cond=do_opcheck) - opcheck(torch.ops._C_cache_ops.swap_blocks, - (src_value_caches[0], dist_value_caches[0], block_mapping_tensor), - cond=do_opcheck) - - ops.swap_blocks(src_key_caches[0], dist_key_caches[0], - block_mapping_tensor) - ops.swap_blocks(src_value_caches[0], dist_value_caches[0], - block_mapping_tensor) + do_opcheck = head_size == HEAD_SIZES[0] + opcheck( + torch.ops._C_cache_ops.swap_blocks, + (src_key_caches[0], dist_key_caches[0], block_mapping_tensor), + cond=do_opcheck, + ) + opcheck( + torch.ops._C_cache_ops.swap_blocks, + (src_value_caches[0], dist_value_caches[0], block_mapping_tensor), + cond=do_opcheck, + ) + + ops.swap_blocks(src_key_caches[0], dist_key_caches[0], block_mapping_tensor) + ops.swap_blocks(src_value_caches[0], dist_value_caches[0], block_mapping_tensor) for src, dst in block_mapping: - torch.testing.assert_close(src_key_caches_clone[src].cpu(), - dist_key_caches[0][dst].cpu()) - torch.testing.assert_close(src_value_caches_clone[src].cpu(), - dist_value_caches[0][dst].cpu()) + torch.testing.assert_close( + src_key_caches_clone[src].cpu(), dist_key_caches[0][dst].cpu() + ) + torch.testing.assert_close( + src_value_caches_clone[src].cpu(), dist_value_caches[0][dst].cpu() + ) @pytest.mark.parametrize("num_heads", NUM_HEADS) @@ -474,11 +531,9 @@ def _create_mla_cache( device: str, ) -> torch.Tensor: cache_dtype = torch.uint8 if kv_cache_dtype == "fp8" else dtype - return torch.zeros(num_blocks, - block_size, - entry_size, - dtype=cache_dtype, - device=device) + return torch.zeros( + num_blocks, block_size, entry_size, dtype=cache_dtype, device=device + ) def _fill_mla_cache(cache: torch.Tensor, kv_cache_dtype: str): @@ -518,20 +573,16 @@ def test_concat_and_cache_mla( total_slots = num_blocks * block_size slot_mapping_lst = random.sample(range(total_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping_lst, - dtype=torch.long, - device=device) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device) - k_pe = torch.randn(num_tokens, - qk_rope_head_dim, - dtype=dtype, - device=device) + k_pe = torch.randn(num_tokens, qk_rope_head_dim, dtype=dtype, device=device) entry_size = kv_lora_rank + qk_rope_head_dim scale = torch.tensor(0.1, dtype=torch.float32, device=device) - kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device) + kv_cache = _create_mla_cache( + num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device + ) ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device) for i in range(num_tokens): @@ -543,10 +594,7 @@ def test_concat_and_cache_mla( if kv_cache_dtype == "fp8": ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype) - ops.convert_fp8(ref_kv_cache, - ref_temp, - scale.item(), - kv_dtype=kv_cache_dtype) + ops.convert_fp8(ref_kv_cache, ref_temp, scale.item(), kv_dtype=kv_cache_dtype) else: ref_kv_cache = ref_temp @@ -556,24 
+604,18 @@ def test_concat_and_cache_mla( test_utils=DEFAULT_OPCHECK_TEST_UTILS, ) - ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, - kv_cache_dtype, scale) + ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale) if kv_cache_dtype == "fp8": result_temp = torch.empty_like(kv_cache, dtype=torch.float16) - ops.convert_fp8(result_temp, - kv_cache.contiguous(), - scale.item(), - kv_dtype=kv_cache_dtype) + ops.convert_fp8( + result_temp, kv_cache.contiguous(), scale.item(), kv_dtype=kv_cache_dtype + ) expected_temp = torch.empty_like(ref_kv_cache, dtype=torch.float16) - ops.convert_fp8(expected_temp, - ref_kv_cache, - scale.item(), - kv_dtype=kv_cache_dtype) - torch.testing.assert_close(result_temp, - expected_temp, - atol=0.001, - rtol=0.1) + ops.convert_fp8( + expected_temp, ref_kv_cache, scale.item(), kv_dtype=kv_cache_dtype + ) + torch.testing.assert_close(result_temp, expected_temp, atol=0.001, rtol=0.1) else: torch.testing.assert_close(kv_cache, ref_kv_cache) @@ -606,8 +648,9 @@ def test_copy_blocks_mla( kv_caches = [] for _ in range(num_layers): - kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device) + kv_cache = _create_mla_cache( + num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device + ) _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype) kv_caches.append(kv_cache) @@ -624,9 +667,9 @@ def test_copy_blocks_mla( dst2 = dst_blocks[2 * i + 1] block_mapping.append((src, dst1)) block_mapping.append((src, dst2)) - block_mapping_tensor = torch.tensor(block_mapping, - dtype=torch.int64, - device=device).view(-1, 2) + block_mapping_tensor = torch.tensor( + block_mapping, dtype=torch.int64, device=device + ).view(-1, 2) for src, dst in block_mapping: for ref_cache in ref_caches: @@ -667,10 +710,12 @@ def test_swap_blocks_mla( entry_size = kv_lora_rank + qk_rope_head_dim - src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device) - dst_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device) + src_cache = _create_mla_cache( + num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device + ) + dst_cache = _create_mla_cache( + num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device + ) _fill_mla_cache(src_cache, kv_cache_dtype) _fill_mla_cache(dst_cache, kv_cache_dtype) @@ -682,9 +727,9 @@ def test_swap_blocks_mla( remaining_blocks = list(set(range(num_blocks)) - set(src_blocks)) dst_blocks = random.sample(remaining_blocks, num_mappings) block_mapping = list(zip(src_blocks, dst_blocks)) - block_mapping_tensor = torch.tensor(block_mapping, - dtype=torch.int64, - device="cpu").view(-1, 2) + block_mapping_tensor = torch.tensor( + block_mapping, dtype=torch.int64, device="cpu" + ).view(-1, 2) opcheck( torch.ops._C_cache_ops.swap_blocks, @@ -699,7 +744,8 @@ def test_swap_blocks_mla( src_cache_clone[src].cpu(), dst_cache[dst].cpu(), msg=f"Block {src} from src should have been swapped to block " - f"{dst} in dst_cache.") + f"{dst} in dst_cache.", + ) @pytest.mark.parametrize("kv_lora_rank", [512]) @@ -709,42 +755,46 @@ def test_swap_blocks_mla( @pytest.mark.parametrize("max_seq_len", [512]) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("dtype", [torch.float32]) -@pytest.mark.parametrize("kv_cache_dtype", - ["auto"]) # You can also test "fp8" if needed. +@pytest.mark.parametrize( + "kv_cache_dtype", ["auto"] +) # You can also test "fp8" if needed. 
@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() -def test_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size, - num_blocks, max_seq_len, batch_size, dtype, - kv_cache_dtype, device): +def test_gather_cache_mla( + kv_lora_rank, + qk_rope_head_dim, + block_size, + num_blocks, + max_seq_len, + batch_size, + dtype, + kv_cache_dtype, + device, +): entry_size = kv_lora_rank + qk_rope_head_dim - src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device) + src_cache = _create_mla_cache( + num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device + ) _fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype) - seq_len_tensor = torch.randint(0, - max_seq_len + 1, (batch_size, ), - device=device) + seq_len_tensor = torch.randint(0, max_seq_len + 1, (batch_size,), device=device) total_tokens = seq_len_tensor.sum() - cu_seq_lens = torch.empty((batch_size + 1), - dtype=torch.int32, - device=device) + cu_seq_lens = torch.empty((batch_size + 1), dtype=torch.int32, device=device) cu_seq_lens[0] = 0 cu_seq_lens[1:] = seq_len_tensor.cumsum(dim=0).to(dtype=torch.int32) print("seq_len_tensor", seq_len_tensor) tot_blocks_tensor = (seq_len_tensor + block_size - 1) // block_size - block_table = torch.empty((batch_size, num_blocks), - dtype=torch.int32, - device=device) + block_table = torch.empty( + (batch_size, num_blocks), dtype=torch.int32, device=device + ) for b in range(batch_size): perm = torch.randperm(num_blocks, device=device) block_table[b, :] = perm - dst = torch.zeros((total_tokens, entry_size), - dtype=src_cache.dtype, - device=device) + dst = torch.zeros((total_tokens, entry_size), dtype=src_cache.dtype, device=device) expected_batches = [] for b in range(batch_size): @@ -800,20 +850,16 @@ def test_concat_and_cache_mla_cpu( total_slots = num_blocks * block_size slot_mapping_lst = random.sample(range(total_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping_lst, - dtype=torch.long, - device=device) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device) - k_pe = torch.randn(num_tokens, - qk_rope_head_dim, - dtype=dtype, - device=device) + k_pe = torch.randn(num_tokens, qk_rope_head_dim, dtype=dtype, device=device) entry_size = kv_lora_rank + qk_rope_head_dim scale = torch.tensor(0.1, dtype=torch.float32, device=device) - kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device) + kv_cache = _create_mla_cache( + num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device + ) ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device) for i in range(num_tokens): @@ -825,10 +871,7 @@ def test_concat_and_cache_mla_cpu( if kv_cache_dtype == "fp8": ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype) - ops.convert_fp8(ref_kv_cache, - ref_temp, - scale.item(), - kv_dtype=kv_cache_dtype) + ops.convert_fp8(ref_kv_cache, ref_temp, scale.item(), kv_dtype=kv_cache_dtype) else: ref_kv_cache = ref_temp @@ -838,6 +881,5 @@ def test_concat_and_cache_mla_cpu( test_utils=DEFAULT_OPCHECK_TEST_UTILS, ) - ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, - kv_cache_dtype, scale) + ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale) torch.testing.assert_close(kv_cache, ref_kv_cache) diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index 
1e7e7e0a7f84..58e8bd592ba4 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -7,11 +7,12 @@ import torch from vllm.platforms import current_platform -from vllm.v1.attention.backends.flash_attn import (cascade_attention, - merge_attn_states) -from vllm.vllm_flash_attn import (fa_version_unsupported_reason, - flash_attn_varlen_func, - is_fa_version_supported) +from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states +from vllm.vllm_flash_attn import ( + fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported, +) NUM_HEADS = [(4, 4), (8, 2), (16, 2)] HEAD_SIZES = [128, 192, 256] @@ -37,21 +38,14 @@ def test_merge_kernel( assert num_query_heads % num_kv_heads == 0 # Prepare inputs. - prefix_output = torch.randn(num_tokens, - num_query_heads, - head_size, - dtype=dtype) - suffix_output = torch.randn(num_tokens, - num_query_heads, - head_size, - dtype=dtype) + prefix_output = torch.randn(num_tokens, num_query_heads, head_size, dtype=dtype) + suffix_output = torch.randn(num_tokens, num_query_heads, head_size, dtype=dtype) prefix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32) suffix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32) # Run the kernel. output = torch.empty(num_tokens, num_query_heads, head_size, dtype=dtype) - merge_attn_states(output, prefix_output, prefix_lse, suffix_output, - suffix_lse) + merge_attn_states(output, prefix_output, prefix_lse, suffix_output, suffix_lse) # Reference implementation. max_lse = torch.maximum(prefix_lse, suffix_lse) @@ -97,8 +91,10 @@ def test_cascade( ) -> None: torch.set_default_device("cuda") if not is_fa_version_supported(fa_version): - pytest.skip(f"Flash attention version {fa_version} not supported due " - f"to: \"{fa_version_unsupported_reason(fa_version)}\"") + pytest.skip( + f"Flash attention version {fa_version} not supported due " + f'to: "{fa_version_unsupported_reason(fa_version)}"' + ) current_platform.seed_everything(0) @@ -107,11 +103,9 @@ def test_cascade( num_query_heads = num_heads[0] num_kv_heads = num_heads[1] assert num_query_heads % num_kv_heads == 0 - key_cache = torch.randn(num_blocks, - block_size, - num_kv_heads, - head_size, - dtype=dtype) + key_cache = torch.randn( + num_blocks, block_size, num_kv_heads, head_size, dtype=dtype + ) value_cache = torch.randn_like(key_cache) seq_lens, common_prefix_len = seq_lens_and_common_prefix @@ -122,26 +116,21 @@ def test_cascade( max_kv_len = max(kv_lens) total_num_query_tokens = sum(query_lens) - query = torch.randn(total_num_query_tokens, - num_query_heads, - head_size, - dtype=dtype) - cu_query_lens = torch.tensor([0] + query_lens, - dtype=torch.int32).cumsum(dim=0, - dtype=torch.int32) + query = torch.randn(total_num_query_tokens, num_query_heads, head_size, dtype=dtype) + cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum( + dim=0, dtype=torch.int32 + ) kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - num_blocks, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) + block_tables = torch.randint( + 0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) assert common_prefix_len > 0 assert common_prefix_len % block_size == 0 num_common_kv_blocks = common_prefix_len // block_size # Make sure the first `num_common_kv_blocks` blocks are the same. 
- block_tables[:, :num_common_kv_blocks] = \ - block_tables[0, :num_common_kv_blocks] + block_tables[:, :num_common_kv_blocks] = block_tables[0, :num_common_kv_blocks] # Run the regular attention. ref_output = flash_attn_varlen_func( @@ -161,8 +150,7 @@ def test_cascade( # Run cascade attention. assert all(common_prefix_len < kv_len for kv_len in kv_lens) - cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], - dtype=torch.int32) + cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], dtype=torch.int32) prefix_kv_lens = torch.tensor([common_prefix_len], dtype=torch.int32) suffix_kv_lens = kv_lens_tensor - common_prefix_len output = torch.empty_like(query) diff --git a/tests/kernels/attention/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py index a2e698646090..d82f28155bb5 100644 --- a/tests/kernels/attention/test_encoder_decoder_attn.py +++ b/tests/kernels/attention/test_encoder_decoder_attn.py @@ -13,12 +13,15 @@ import pytest import torch - from tests.kernels.utils import * + from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP -from vllm.attention.selector import (_Backend, _cached_get_attn_backend, - global_force_attn_backend_context_manager) +from vllm.attention.selector import ( + _Backend, + _cached_get_attn_backend, + global_force_attn_backend_context_manager, +) from vllm.config import VllmConfig, set_current_vllm_config from vllm.forward_context import set_forward_context from vllm.platforms import current_platform @@ -27,10 +30,10 @@ @pytest.fixture(scope="function", autouse=True) def use_v0_only(monkeypatch): """ - Encoder-decoder is only supported on V0, so set + Encoder-decoder is only supported on V0, so set VLLM_USE_V1=0 for all tests in the module. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") # List of support backends for encoder/decoder models @@ -79,7 +82,7 @@ class TestPoint(NamedTuple): class TestResources(NamedTuple): - ''' + """ Encapsulates key components for performing an encoder/decoder attention test @@ -105,15 +108,17 @@ class TestResources(NamedTuple): i.e. XFormers * attn: Attention layer instance * kv_cache: shared key/value cache for all attention - ''' + """ scale: float attn: Attention kv_cache: torch.Tensor -def _make_test_resources(test_pt: TestPoint, ) -> TestResources: - ''' +def _make_test_resources( + test_pt: TestPoint, +) -> TestResources: + """ Build key components for performing encoder/decoder attention test. Note that @@ -137,7 +142,7 @@ class that Attention will automatically select when it is constructed. Returns: * TestResources data structure. - ''' + """ scale = float(1.0 / (test_pt.head_size**0.5)) attn = Attention( @@ -150,18 +155,19 @@ class that Attention will automatically select when it is constructed. 
if test_pt.num_blocks is None or test_pt.num_heads is None: # Caller does not require a KV cache return TestResources( - scale, attn, - torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE)) + scale, attn, torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE) + ) # Construct KV cache - if test_pt.attn_type in (AttentionType.DECODER, - AttentionType.ENCODER_DECODER): - kv_cache = make_kv_cache(test_pt.num_blocks, - test_pt.num_heads, - test_pt.head_size, - test_pt.block_size, - device=CUDA_DEVICE, - backend=test_pt.backend_name) + if test_pt.attn_type in (AttentionType.DECODER, AttentionType.ENCODER_DECODER): + kv_cache = make_kv_cache( + test_pt.num_blocks, + test_pt.num_heads, + test_pt.head_size, + test_pt.block_size, + device=CUDA_DEVICE, + backend=test_pt.backend_name, + ) else: kv_cache = torch.tensor([]) @@ -173,7 +179,7 @@ def _encoder_attn_setup( test_pt: TestPoint, test_rsrcs: TestResources, ) -> PhaseTestParameters: - ''' + """ Set up test vectors & data structures for encoder attention test. A triplet of synthetic query/key/value tensors are constructed. @@ -200,7 +206,7 @@ def _encoder_attn_setup( * PhaseTestParameters data structure comprising (1) packed query/key/value tensors, (2) the ideal output of attention computed using a naive implementation, and (3) KVCache field set to None - ''' + """ ( num_heads, @@ -220,33 +226,37 @@ def _encoder_attn_setup( # Make test tensors - qkv_in, _, _ = make_qkv(batch_size, - max_q_seq_len, - max_kv_seq_len, - num_heads, - head_size, - attn_type=AttentionType.ENCODER, - device=CUDA_DEVICE) + qkv_in, _, _ = make_qkv( + batch_size, + max_q_seq_len, + max_kv_seq_len, + num_heads, + head_size, + attn_type=AttentionType.ENCODER, + device=CUDA_DEVICE, + ) # Compute correct answer using naive non-causal attention # implementation - ideal_output = ref_masked_attention(qkv_in.query, - qkv_in.key, - qkv_in.value, - scale=scale, - q_seq_lens=qkv_in.q_seq_lens, - kv_seq_lens=qkv_in.kv_seq_lens) + ideal_output = ref_masked_attention( + qkv_in.query, + qkv_in.key, + qkv_in.value, + scale=scale, + q_seq_lens=qkv_in.q_seq_lens, + kv_seq_lens=qkv_in.kv_seq_lens, + ) - packed_ideal_output, _ = pack_tensor(ideal_output, - qkv_in.q_seq_lens, - device=CUDA_DEVICE) + packed_ideal_output, _ = pack_tensor( + ideal_output, qkv_in.q_seq_lens, device=CUDA_DEVICE + ) packed_qkv = pack_qkv(qkv_in, device=CUDA_DEVICE) return PhaseTestParameters( PackedQKVO(packed_qkv, packed_ideal_output), - None # No KV cache + None, # No KV cache ) @@ -255,7 +265,7 @@ def _decoder_attn_setup( test_rsrcs: TestResources, block_base_addr: int = 0, ) -> tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]: - ''' + """ Set up test vectors & data structures for self-attention test. 
A triplet of synthetic query/key/value tensors are constructed ("baseline" @@ -309,7 +319,7 @@ def _decoder_attn_setup( (intended to be used as the base address for the encoder/ decoder cross-attention block-table, which is not constructed in this function) - ''' + """ ( num_heads, @@ -333,27 +343,30 @@ def _decoder_attn_setup( qkv, prefill_qkv, decode_qkv, - ) = make_qkv(batch_size, - max_q_seq_len, - max_kv_seq_len, - num_heads, - head_size, - attn_type=AttentionType.DECODER, - device=CUDA_DEVICE) + ) = make_qkv( + batch_size, + max_q_seq_len, + max_kv_seq_len, + num_heads, + head_size, + attn_type=AttentionType.DECODER, + device=CUDA_DEVICE, + ) # Compute correct answer using naive attention implementation # with causal attention mask - causal_mask = make_causal_mask(max_q_seq_len, - max_kv_seq_len).to(CUDA_DEVICE) + causal_mask = make_causal_mask(max_q_seq_len, max_kv_seq_len).to(CUDA_DEVICE) - ideal_output = ref_masked_attention(qkv.query, - qkv.key, - qkv.value, - scale=scale, - custom_mask=causal_mask, - q_seq_lens=qkv.q_seq_lens, - kv_seq_lens=qkv.kv_seq_lens) + ideal_output = ref_masked_attention( + qkv.query, + qkv.key, + qkv.value, + scale=scale, + custom_mask=causal_mask, + q_seq_lens=qkv.q_seq_lens, + kv_seq_lens=qkv.kv_seq_lens, + ) # Split out the prefill- & decode-phase ideal answers & pack them @@ -361,16 +374,18 @@ def _decoder_attn_setup( decode_ideal_output = torch.zeros_like(ideal_output[:, 0:1]) for bdx, prefill_q_seq_len in enumerate(prefill_qkv.q_seq_lens): prefill_ideal_output[bdx, :prefill_q_seq_len] = ideal_output[ - bdx, :prefill_q_seq_len] - decode_ideal_output[bdx, :] = ideal_output[bdx, prefill_q_seq_len:( - prefill_q_seq_len + 1)] - - prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output, - prefill_qkv.q_seq_lens, - device=CUDA_DEVICE) - decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output, - [1 for _ in range(batch_size)], - device=CUDA_DEVICE) + bdx, :prefill_q_seq_len + ] + decode_ideal_output[bdx, :] = ideal_output[ + bdx, prefill_q_seq_len : (prefill_q_seq_len + 1) + ] + + prefill_packed_ideal_output, _ = pack_tensor( + prefill_ideal_output, prefill_qkv.q_seq_lens, device=CUDA_DEVICE + ) + decode_packed_ideal_output, _ = pack_tensor( + decode_ideal_output, [1 for _ in range(batch_size)], device=CUDA_DEVICE + ) # Build prefill- & decode-phase data structures # for decoder self-attention. 
Block tables and @@ -398,17 +413,14 @@ def _decoder_attn_setup( decode_block_tables, slot_mapping_list, max_block_idx, - ) = make_block_tables_slot_mapping(block_size, - qkv.q_seq_lens, - device=CUDA_DEVICE, - block_base_addr=block_base_addr) + ) = make_block_tables_slot_mapping( + block_size, qkv.q_seq_lens, device=CUDA_DEVICE, block_base_addr=block_base_addr + ) ( prefill_slot_mapping, decode_slot_mapping, - ) = split_slot_mapping(slot_mapping_list, - qkv.q_seq_lens, - device=CUDA_DEVICE) + ) = split_slot_mapping(slot_mapping_list, qkv.q_seq_lens, device=CUDA_DEVICE) prefill_pckd_qkv = pack_qkv(prefill_qkv, device=CUDA_DEVICE) @@ -418,11 +430,14 @@ def _decoder_attn_setup( qkv, PhaseTestParameters( # Prefill test params PackedQKVO(prefill_pckd_qkv, prefill_packed_ideal_output), - KVMemoryMap(prefill_block_tables, prefill_slot_mapping)), + KVMemoryMap(prefill_block_tables, prefill_slot_mapping), + ), PhaseTestParameters( # Decode test params PackedQKVO(decode_pckd_qkv, decode_packed_ideal_output), - KVMemoryMap(decode_block_tables, decode_slot_mapping)), - max_block_idx) + KVMemoryMap(decode_block_tables, decode_slot_mapping), + ), + max_block_idx, + ) def _enc_dec_cross_attn_setup_reuses_query( @@ -433,7 +448,7 @@ def _enc_dec_cross_attn_setup_reuses_query( test_rsrcs: TestResources, block_base_addr: int = 0, ) -> tuple[PhaseTestParameters, PhaseTestParameters]: - ''' + """ Set up test vectors & data structures for cross-attention test. A triplet of synthetic cross-attention key/value tensors are constructed @@ -494,7 +509,7 @@ def _enc_dec_cross_attn_setup_reuses_query( along with (2) ideal attention output computed using a naive implementation, and (3) memory-mapping data structures appropriate for decode phase. - ''' + """ assert encoder_test_params.packed_qkvo.packed_qkv is not None assert prefill_decoder_phase_test_params.packed_qkvo.packed_qkv is not None @@ -517,7 +532,8 @@ def _enc_dec_cross_attn_setup_reuses_query( decoder_seq_lens = decoder_qkv.q_seq_lens encoder_seq_lens = encoder_test_params.packed_qkvo.packed_qkv.q_seq_lens prefill_q_seq_lens = ( - prefill_decoder_phase_test_params.packed_qkvo.packed_qkv.q_seq_lens) + prefill_decoder_phase_test_params.packed_qkvo.packed_qkv.q_seq_lens + ) assert prefill_q_seq_lens is not None @@ -525,36 +541,42 @@ def _enc_dec_cross_attn_setup_reuses_query( cross_kv, _, _, - ) = make_qkv(batch_size, - max_decoder_seq_len, - max_encoder_seq_len, - num_heads, - head_size, - force_kv_seq_lens=encoder_seq_lens, - attn_type=AttentionType.ENCODER_DECODER, - device=CUDA_DEVICE) - - ideal_output = ref_masked_attention(decoder_query, - cross_kv.key, - cross_kv.value, - scale=scale, - q_seq_lens=decoder_seq_lens, - kv_seq_lens=cross_kv.kv_seq_lens) + ) = make_qkv( + batch_size, + max_decoder_seq_len, + max_encoder_seq_len, + num_heads, + head_size, + force_kv_seq_lens=encoder_seq_lens, + attn_type=AttentionType.ENCODER_DECODER, + device=CUDA_DEVICE, + ) + + ideal_output = ref_masked_attention( + decoder_query, + cross_kv.key, + cross_kv.value, + scale=scale, + q_seq_lens=decoder_seq_lens, + kv_seq_lens=cross_kv.kv_seq_lens, + ) prefill_ideal_output = torch.zeros_like(ideal_output) decode_ideal_output = torch.zeros_like(ideal_output[:, 0:1]) for bdx, prefill_q_seq_len in enumerate(prefill_q_seq_lens): prefill_ideal_output[bdx, :prefill_q_seq_len] = ideal_output[ - bdx, :prefill_q_seq_len] - decode_ideal_output[bdx, :] = ideal_output[bdx, prefill_q_seq_len:( - prefill_q_seq_len + 1)] - - prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output, 
- prefill_q_seq_lens, - device=CUDA_DEVICE) - decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output, - [1 for _ in range(batch_size)], - device=CUDA_DEVICE) + bdx, :prefill_q_seq_len + ] + decode_ideal_output[bdx, :] = ideal_output[ + bdx, prefill_q_seq_len : (prefill_q_seq_len + 1) + ] + + prefill_packed_ideal_output, _ = pack_tensor( + prefill_ideal_output, prefill_q_seq_lens, device=CUDA_DEVICE + ) + decode_packed_ideal_output, _ = pack_tensor( + decode_ideal_output, [1 for _ in range(batch_size)], device=CUDA_DEVICE + ) # Build prefill- & decode-phase data structures # for encoder/decoder cross-attention. Block tables and @@ -591,13 +613,16 @@ def _enc_dec_cross_attn_setup_reuses_query( decode_block_tables, prefill_slot_mapping_list, _, - ) = make_block_tables_slot_mapping(block_size, - cross_kv.kv_seq_lens, - block_base_addr=block_base_addr, - device=CUDA_DEVICE) + ) = make_block_tables_slot_mapping( + block_size, + cross_kv.kv_seq_lens, + block_base_addr=block_base_addr, + device=CUDA_DEVICE, + ) - prefill_slot_mapping = maybe_make_long_tensor(prefill_slot_mapping_list, - device=CUDA_DEVICE) + prefill_slot_mapping = maybe_make_long_tensor( + prefill_slot_mapping_list, device=CUDA_DEVICE + ) # Packed key/value (query is already provided) packed_cross_kv = pack_qkv(cross_kv, device=CUDA_DEVICE) @@ -605,10 +630,13 @@ def _enc_dec_cross_attn_setup_reuses_query( return ( PhaseTestParameters( # Prefill-phase test params PackedQKVO(packed_cross_kv, prefill_packed_ideal_output), - KVMemoryMap(prefill_block_tables, prefill_slot_mapping)), + KVMemoryMap(prefill_block_tables, prefill_slot_mapping), + ), PhaseTestParameters( # Decode-phase test params PackedQKVO(None, decode_packed_ideal_output), - KVMemoryMap(decode_block_tables, decode_slot_mapping))) + KVMemoryMap(decode_block_tables, decode_slot_mapping), + ), + ) def _run_encoder_attention_test( @@ -618,7 +646,7 @@ def _run_encoder_attention_test( test_pt: TestPoint, vllm_config: VllmConfig, ) -> torch.Tensor: - ''' + """ Run encoder attention. attn.forward() is passed attn_type=AttentionType.ENCODER in order @@ -641,7 +669,7 @@ def _run_encoder_attention_test( Returns: * Attention.forward() applied to packed {query,key,value} and & attn_metadata - ''' + """ assert attn_metadata.num_decode_tokens == 0 packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None @@ -654,7 +682,8 @@ def _run_encoder_attention_test( # TODO - Update the way we construct the query so that it # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = packed_qkv.query.view( - -1, test_pt.num_heads * test_pt.head_size) + -1, test_pt.num_heads * test_pt.head_size + ) return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value) @@ -665,7 +694,7 @@ def _run_decoder_self_attention_test( test_pt: TestPoint, vllm_config: VllmConfig, ) -> torch.Tensor: - ''' + """ Run decoder self-attention test. attn.forward() is passed attn_type=AttentionType.DECODER @@ -687,7 +716,7 @@ def _run_decoder_self_attention_test( Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache & attn_metadata - ''' + """ attn = test_rsrcs.attn packed_qkv = decoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None @@ -700,7 +729,8 @@ def _run_decoder_self_attention_test( # TODO - Update the way we construct the query so that it # is shaped as [num_tokens, hidden_size] and we can skip the reshape. 
reshaped_query = packed_qkv.query.view( - -1, test_pt.num_heads * test_pt.head_size) + -1, test_pt.num_heads * test_pt.head_size + ) return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value) @@ -712,7 +742,7 @@ def _run_encoder_decoder_cross_attention_test( test_pt: TestPoint, vllm_config: VllmConfig, ) -> torch.Tensor: - ''' + """ Run encoder/decoder cross-attention test. Via PhaseTestParameters data structures, consumes the same query utilized @@ -745,7 +775,7 @@ def _run_encoder_decoder_cross_attention_test( Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache & attn_metadata - ''' + """ assert decoder_test_params.packed_qkvo.packed_qkv is not None attn = test_rsrcs.attn @@ -754,8 +784,8 @@ def _run_encoder_decoder_cross_attention_test( value = None else: cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv - key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key) - value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value) + key = None if cross_pckd_qkv is None else cross_pckd_qkv.key + value = None if cross_pckd_qkv is None else cross_pckd_qkv.value with set_forward_context(attn_metadata, vllm_config): # In the test setup the shape of the query is # [batch_size, seq_len, num_heads, head_size]. However @@ -765,7 +795,8 @@ def _run_encoder_decoder_cross_attention_test( # TODO - Update the way we construct the query so that it # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( - -1, test_pt.num_heads * test_pt.head_size) + -1, test_pt.num_heads * test_pt.head_size + ) return attn.forward(reshaped_query, key, value) @@ -775,7 +806,7 @@ def set_reset_environment(attn_backend): # testing of the Flash Attention backend. Also clear the # cached value of the backend. 
default_dtype = torch.get_default_dtype() - if attn_backend.name == 'FLASH_ATTN': + if attn_backend.name == "FLASH_ATTN": torch.set_default_dtype(torch.bfloat16) _cached_get_attn_backend.cache_clear() yield @@ -784,8 +815,7 @@ def set_reset_environment(attn_backend): torch.set_default_dtype(default_dtype) -@pytest.mark.skipif(current_platform.is_rocm(), - reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) +@pytest.mark.skipif(current_platform.is_rocm(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @@ -802,7 +832,7 @@ def test_encoder_only( max_dec_seq_len: int, max_enc_seq_len: int, ): - ''' + """ End-to-end encoder-only attention test: * Construct fake test vectors for (1) encoder attention @@ -830,15 +860,23 @@ def test_encoder_only( * block_size: KV cache block size * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences - ''' + """ # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test - test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096, AttentionType.ENCODER) + test_pt = TestPoint( + num_heads, + head_size, + attn_backend.name, + batch_size, + block_size, + max_dec_seq_len, + max_enc_seq_len, + 4096, + AttentionType.ENCODER, + ) # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init @@ -860,24 +898,26 @@ def test_encoder_only( decoder_test_params=None, encoder_test_params=enc_test_params, cross_test_params=None, - device=CUDA_DEVICE) + device=CUDA_DEVICE, + ) # PREFILL: encoder attention - enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test( + enc_pckd_act_out: torch.Tensor = _run_encoder_attention_test( test_rsrcs.attn, enc_test_params, prephase_attn_metadata, test_pt=test_pt, - vllm_config=vllm_config)) + vllm_config=vllm_config, + ) # - Is encoder attention result correct? 
- assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, - attn_backend.name) + assert_actual_matches_ideal( + enc_test_params, enc_pckd_act_out, attn_backend.name + ) -@pytest.mark.skipif(current_platform.is_rocm(), - reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) +@pytest.mark.skipif(current_platform.is_rocm(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @@ -894,7 +934,7 @@ def test_e2e_enc_dec_attn( max_dec_seq_len: int, max_enc_seq_len: int, ) -> None: - ''' + """ End-to-end encoder/decoder test: * Construct fake test vectors for (1) encoder attention, @@ -954,22 +994,45 @@ def test_e2e_enc_dec_attn( * block_size: KV cache block size * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences - ''' + """ # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test - enc_test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096, AttentionType.ENCODER) - enc_dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096, - AttentionType.ENCODER_DECODER) - dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096, AttentionType.DECODER) + enc_test_pt = TestPoint( + num_heads, + head_size, + attn_backend.name, + batch_size, + block_size, + max_dec_seq_len, + max_enc_seq_len, + 4096, + AttentionType.ENCODER, + ) + enc_dec_test_pt = TestPoint( + num_heads, + head_size, + attn_backend.name, + batch_size, + block_size, + max_dec_seq_len, + max_enc_seq_len, + 4096, + AttentionType.ENCODER_DECODER, + ) + dec_test_pt = TestPoint( + num_heads, + head_size, + attn_backend.name, + batch_size, + block_size, + max_dec_seq_len, + max_enc_seq_len, + 4096, + AttentionType.DECODER, + ) # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init @@ -1010,7 +1073,8 @@ def test_e2e_enc_dec_attn( prephase_dec_test_params, enc_dec_test_pt, enc_dec_test_rsrcs, - block_base_addr=cross_block_base_addr) + block_base_addr=cross_block_base_addr, + ) # Shared prefill metadata structure assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None @@ -1021,19 +1085,23 @@ def test_e2e_enc_dec_attn( decoder_test_params=prephase_dec_test_params, encoder_test_params=enc_test_params, cross_test_params=prephase_cross_test_params, - device=CUDA_DEVICE) + device=CUDA_DEVICE, + ) # PREFILL: encoder attention - enc_pckd_act_out = _run_encoder_attention_test(enc_test_rsrcs.attn, - enc_test_params, - prephase_attn_metadata, - test_pt=enc_test_pt, - vllm_config=vllm_config) + enc_pckd_act_out = _run_encoder_attention_test( + enc_test_rsrcs.attn, + enc_test_params, + prephase_attn_metadata, + test_pt=enc_test_pt, + vllm_config=vllm_config, + ) # - Is encoder attention result correct? 
- assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, - attn_backend.name) + assert_actual_matches_ideal( + enc_test_params, enc_pckd_act_out, attn_backend.name + ) # PREFILL: decoder self-attention test @@ -1042,12 +1110,13 @@ def test_e2e_enc_dec_attn( prephase_dec_test_params, prephase_attn_metadata, test_pt=dec_test_pt, - vllm_config=vllm_config) + vllm_config=vllm_config, + ) # - Is prefill decoder self-attention correct? - assert_actual_matches_ideal(prephase_dec_test_params, - prephase_dec_pckd_act_out, - attn_backend.name) + assert_actual_matches_ideal( + prephase_dec_test_params, prephase_dec_pckd_act_out, attn_backend.name + ) # PREFILL: encoder/decoder cross-attention test @@ -1057,12 +1126,13 @@ def test_e2e_enc_dec_attn( prephase_cross_test_params, prephase_attn_metadata, test_pt=enc_dec_test_pt, - vllm_config=vllm_config) + vllm_config=vllm_config, + ) # - Is prefill encoder/decoder cross-attention correct? - assert_actual_matches_ideal(prephase_cross_test_params, - prephase_cross_pckd_act_out, - attn_backend.name) + assert_actual_matches_ideal( + prephase_cross_test_params, prephase_cross_pckd_act_out, attn_backend.name + ) # DECODE: build decode-phase attention metadata @@ -1073,7 +1143,8 @@ def test_e2e_enc_dec_attn( decoder_test_params=decphase_dec_test_params, encoder_test_params=enc_test_params, cross_test_params=decphase_cross_test_params, - device=CUDA_DEVICE) + device=CUDA_DEVICE, + ) # DECODE: decoder self-attention test @@ -1082,12 +1153,13 @@ def test_e2e_enc_dec_attn( decphase_dec_test_params, decphase_attn_metadata, test_pt=dec_test_pt, - vllm_config=vllm_config) + vllm_config=vllm_config, + ) # - Is decode-phase decoder self-attention correct? - assert_actual_matches_ideal(decphase_dec_test_params, - decphase_dec_pckd_act_out, - attn_backend.name) + assert_actual_matches_ideal( + decphase_dec_test_params, decphase_dec_pckd_act_out, attn_backend.name + ) # DECODE: encoder/decoder cross-attention test @@ -1097,9 +1169,10 @@ def test_e2e_enc_dec_attn( None, decphase_attn_metadata, test_pt=enc_dec_test_pt, - vllm_config=vllm_config) + vllm_config=vllm_config, + ) # - Is decode-phase encoder/decoder cross-attention correct? 
- assert_actual_matches_ideal(decphase_cross_test_params, - decphase_cross_pckd_act_out, - attn_backend.name) + assert_actual_matches_ideal( + decphase_cross_test_params, decphase_cross_pckd_act_out, attn_backend.name + ) diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index bd3190d09b0f..ba1a540a1a0e 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -7,10 +7,12 @@ import torch from vllm.platforms import current_platform -from vllm.vllm_flash_attn import (fa_version_unsupported_reason, - flash_attn_varlen_func, - flash_attn_with_kvcache, - is_fa_version_supported) +from vllm.vllm_flash_attn import ( + fa_version_unsupported_reason, + flash_attn_varlen_func, + flash_attn_with_kvcache, + is_fa_version_supported, +) NUM_HEADS = [(4, 4), (8, 2), (16, 2)] HEAD_SIZES = [128, 256] @@ -42,7 +44,7 @@ def ref_paged_attn( for i in range(num_seqs): query_len = query_lens[i] kv_len = kv_lens[i] - q = query[start_idx:start_idx + query_len] + q = query[start_idx : start_idx + query_len] q *= scale num_kv_blocks = (kv_len + block_size - 1) // block_size @@ -60,10 +62,13 @@ def ref_paged_attn( empty_mask = torch.ones(query_len, kv_len) mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool() if sliding_window is not None: - sliding_window_mask = torch.triu(empty_mask, - diagonal=kv_len - - (query_len + sliding_window) + - 1).bool().logical_not() + sliding_window_mask = ( + torch.triu( + empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1 + ) + .bool() + .logical_not() + ) mask |= sliding_window_mask if soft_cap is not None: attn = soft_cap * torch.tanh(attn / soft_cap) @@ -104,11 +109,15 @@ def test_flash_attn_with_paged_kv( ) -> None: torch.set_default_device("cuda") if not is_fa_version_supported(fa_version): - pytest.skip(f"Flash attention version {fa_version} not supported due " - f"to: \"{fa_version_unsupported_reason(fa_version)}\"") + pytest.skip( + f"Flash attention version {fa_version} not supported due " + f'to: "{fa_version_unsupported_reason(fa_version)}"' + ) if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2): - pytest.skip("Flash attention with quantized inputs is only " - "supported on version 3 with bfloat16 base type") + pytest.skip( + "Flash attention with quantized inputs is only " + "supported on version 3 with bfloat16 base type" + ) current_platform.seed_everything(0) num_seqs = len(kv_lens) @@ -117,23 +126,19 @@ def test_flash_attn_with_paged_kv( assert num_query_heads % num_kv_heads == 0 max_kv_len = max(kv_lens) scale = head_size**-0.5 - window_size = ((sliding_window - 1, 0) if sliding_window is not None else - (-1, -1)) + window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1) query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) - key_cache = torch.randn(num_blocks, - block_size, - num_kv_heads, - head_size, - dtype=dtype) + key_cache = torch.randn( + num_blocks, block_size, num_kv_heads, head_size, dtype=dtype + ) value_cache = torch.randn_like(key_cache) kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - num_blocks, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) + block_tables = torch.randint( + 0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) q = query.unsqueeze(1) out = torch.empty_like(q) if use_out else None @@ -178,23 +183,27 @@ 
def test_flash_attn_with_paged_kv( if q_dtype is not None: atol, rtol = 1.5e-1, 1.5e-1 - ref_output = ref_paged_attn(query=query, - key_cache=key_cache, - value_cache=value_cache, - query_lens=[1] * num_seqs, - kv_lens=kv_lens, - block_tables=block_tables, - scale=scale, - soft_cap=soft_cap, - sliding_window=sliding_window) - torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \ - f"{torch.max(torch.abs(output - ref_output))}" + ref_output = ref_paged_attn( + query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=[1] * num_seqs, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + soft_cap=soft_cap, + sliding_window=sliding_window, + ) + ( + torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), + f"{torch.max(torch.abs(output - ref_output))}", + ) @pytest.mark.parametrize("use_out", [True, False]) -@pytest.mark.parametrize("seq_lens", - [[(1, 1328), (5, 18), - (129, 463)], [(1, 523), (1, 37), (1, 2011)]]) +@pytest.mark.parametrize( + "seq_lens", [[(1, 1328), (5, 18), (129, 463)], [(1, 523), (1, 37), (1, 2011)]] +) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @@ -220,11 +229,15 @@ def test_varlen_with_paged_kv( ) -> None: torch.set_default_device("cuda") if not is_fa_version_supported(fa_version): - pytest.skip(f"Flash attention version {fa_version} not supported due " - f"to: \"{fa_version_unsupported_reason(fa_version)}\"") + pytest.skip( + f"Flash attention version {fa_version} not supported due " + f'to: "{fa_version_unsupported_reason(fa_version)}"' + ) if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2): - pytest.skip("Flash attention with quantized inputs is only " - "supported on version 3 with bfloat16 base type") + pytest.skip( + "Flash attention with quantized inputs is only " + "supported on version 3 with bfloat16 base type" + ) current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] @@ -234,30 +247,23 @@ def test_varlen_with_paged_kv( assert num_query_heads % num_kv_heads == 0 max_query_len = max(query_lens) max_kv_len = max(kv_lens) - window_size = ((sliding_window - 1, 0) if sliding_window is not None else - (-1, -1)) + window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1) scale = head_size**-0.5 - query = torch.randn(sum(query_lens), - num_query_heads, - head_size, - dtype=dtype) - key_cache = torch.randn(num_blocks, - block_size, - num_kv_heads, - head_size, - dtype=dtype) + query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype) + key_cache = torch.randn( + num_blocks, block_size, num_kv_heads, head_size, dtype=dtype + ) value_cache = torch.randn_like(key_cache) - cu_query_lens = torch.tensor([0] + query_lens, - dtype=torch.int32).cumsum(dim=0, - dtype=torch.int32) + cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum( + dim=0, dtype=torch.int32 + ) kv_lens = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - num_blocks, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) + block_tables = torch.randint( + 0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) out = torch.empty_like(query) if use_out else None @@ -313,5 +319,7 @@ def test_varlen_with_paged_kv( atol, rtol = 1.5e-2, 1e-2 if q_dtype is not None: atol, rtol = 1.5e-1, 1.5e-1 - 
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \ - f"{torch.max(torch.abs(output - ref_output))}" + ( + torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), + f"{torch.max(torch.abs(output - ref_output))}", + ) diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 3ad6e1d32911..19e7ad67dfdb 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -36,7 +36,7 @@ def ref_paged_attn( for i in range(num_seqs): query_len = query_lens[i] kv_len = kv_lens[i] - q = query[start_idx:start_idx + query_len] + q = query[start_idx : start_idx + query_len] q *= scale num_kv_blocks = (kv_len + block_size - 1) // block_size @@ -54,10 +54,13 @@ def ref_paged_attn( empty_mask = torch.ones(query_len, kv_len) mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool() if sliding_window is not None: - sliding_window_mask = torch.triu(empty_mask, - diagonal=kv_len - - (query_len + sliding_window) + - 1).bool().logical_not() + sliding_window_mask = ( + torch.triu( + empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1 + ) + .bool() + .logical_not() + ) mask |= sliding_window_mask if soft_cap is not None: attn = soft_cap * torch.tanh(attn / soft_cap) @@ -97,20 +100,16 @@ def test_flashinfer_decode_with_paged_kv( query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) - key_value_cache = torch.randn(NUM_BLOCKS, - 2, - block_size, - num_kv_heads, - head_size, - dtype=dtype) + key_value_cache = torch.randn( + NUM_BLOCKS, 2, block_size, num_kv_heads, head_size, dtype=dtype + ) key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - NUM_BLOCKS, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) + block_tables = torch.randint( + 0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) kv_indptr = [0] kv_indices = [] @@ -131,35 +130,41 @@ def test_flashinfer_decode_with_paged_kv( kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) - wrapper = flashinfer.\ - BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD", - use_tensor_cores=( - (num_query_heads//num_kv_heads) > 4) - ) - wrapper.plan(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - q_data_type=dtype, - kv_data_type=dtype, - logits_soft_cap=soft_cap) + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, + "NHD", + use_tensor_cores=((num_query_heads // num_kv_heads) > 4), + ) + wrapper.plan( + kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, + ) output = wrapper.run(query, key_value_cache) - ref_output = ref_paged_attn(query=query, - key_cache=key_cache, - value_cache=value_cache, - query_lens=[1] * num_seqs, - kv_lens=kv_lens, - block_tables=block_tables, - scale=scale, - soft_cap=soft_cap) - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ - f"{torch.max(torch.abs(output - ref_output))}" + ref_output = ref_paged_attn( + query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=[1] * num_seqs, + kv_lens=kv_lens, + block_tables=block_tables, + 
scale=scale, + soft_cap=soft_cap, + ) + ( + torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), + f"{torch.max(torch.abs(output - ref_output))}", + ) @pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]]) @@ -169,11 +174,14 @@ def test_flashinfer_decode_with_paged_kv( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) @torch.inference_mode -def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], - num_heads: tuple[int, int], - head_size: int, dtype: torch.dtype, - block_size: int, - soft_cap: Optional[float]) -> None: +def test_flashinfer_prefill_with_paged_kv( + seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], + head_size: int, + dtype: torch.dtype, + block_size: int, + soft_cap: Optional[float], +) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) num_seqs = len(seq_lens) @@ -185,16 +193,10 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], max_kv_len = max(kv_lens) scale = head_size**-0.5 - query = torch.randn(sum(query_lens), - num_query_heads, - head_size, - dtype=dtype) - key_value_cache = torch.randn(NUM_BLOCKS, - 2, - block_size, - num_kv_heads, - head_size, - dtype=dtype) + query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype) + key_value_cache = torch.randn( + NUM_BLOCKS, 2, block_size, num_kv_heads, head_size, dtype=dtype + ) key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) @@ -204,10 +206,9 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], value_cache /= head_size**0.5 max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - NUM_BLOCKS, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) + block_tables = torch.randint( + 0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) qo_indptr = [0] kv_indptr = [0] @@ -231,8 +232,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) - wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( - workspace_buffer, "NHD") + wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(workspace_buffer, "NHD") wrapper.plan( qo_indptr, kv_indptr, @@ -252,16 +252,20 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], key_value_cache, ) - ref_output = ref_paged_attn(query=query, - key_cache=key_cache, - value_cache=value_cache, - query_lens=query_lens, - kv_lens=kv_lens, - block_tables=block_tables, - scale=scale, - soft_cap=soft_cap) - torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ - f"{torch.max(torch.abs(output - ref_output))}" + ref_output = ref_paged_attn( + query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=query_lens, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + soft_cap=soft_cap, + ) + ( + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), + f"{torch.max(torch.abs(output - ref_output))}", + ) @pytest.mark.parametrize("seq_lens", [[(1, 132), (5, 18)]]) @@ -271,9 +275,13 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) def test_flashinfer_prefill_with_paged_fp8_kv( - seq_lens: list[tuple[int, 
int]], num_heads: tuple[int, int], - head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float]) -> None: + seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], + head_size: int, + dtype: torch.dtype, + block_size: int, + soft_cap: Optional[float], +) -> None: pytest.skip("TODO: fix the accuracy issue") torch.set_default_device("cuda") current_platform.seed_everything(0) @@ -288,17 +296,11 @@ def test_flashinfer_prefill_with_paged_fp8_kv( kv_cache_dtype = torch.float8_e4m3fn - query = torch.randn(sum(query_lens), - num_query_heads, - head_size, - dtype=dtype) + query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype) NUM_BLOCKS_FP8 = 2048 - key_value_cache = torch.randn(NUM_BLOCKS_FP8, - 2, - block_size, - num_kv_heads, - head_size, - dtype=dtype) + key_value_cache = torch.randn( + NUM_BLOCKS_FP8, 2, block_size, num_kv_heads, head_size, dtype=dtype + ) key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1) key_cache /= head_size**0.5 value_cache /= head_size**0.5 @@ -306,15 +308,15 @@ def test_flashinfer_prefill_with_paged_fp8_kv( k_scale = key_cache.amax().item() / 448.0 v_scale = value_cache.amax().item() / 448.0 - kv_cache_fp8 = torch.cat([key_cache / k_scale, value_cache / v_scale], - dim=1).to(kv_cache_dtype) + kv_cache_fp8 = torch.cat([key_cache / k_scale, value_cache / v_scale], dim=1).to( + kv_cache_dtype + ) - assert (kv_cache_fp8.shape == key_value_cache.shape) + assert kv_cache_fp8.shape == key_value_cache.shape max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - NUM_BLOCKS_FP8, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) + block_tables = torch.randint( + 0, NUM_BLOCKS_FP8, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) qo_indptr = [0] kv_indptr = [0] @@ -338,8 +340,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) - wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( - workspace_buffer, "NHD") + wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(workspace_buffer, "NHD") wrapper.plan( qo_indptr, kv_indptr, @@ -356,19 +357,23 @@ def test_flashinfer_prefill_with_paged_fp8_kv( output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) - ref_output = ref_paged_attn(query=query, - key_cache=key_cache.squeeze(1), - value_cache=value_cache.squeeze(1), - query_lens=query_lens, - kv_lens=kv_lens, - block_tables=block_tables, - scale=scale, - soft_cap=soft_cap) + ref_output = ref_paged_attn( + query=query, + key_cache=key_cache.squeeze(1), + value_cache=value_cache.squeeze(1), + query_lens=query_lens, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + soft_cap=soft_cap, + ) del query del block_tables # verify prefill fp8 - torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ - f"{torch.max(torch.abs(output - ref_output))}" + ( + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), + f"{torch.max(torch.abs(output - ref_output))}", + ) @pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]]) @@ -401,12 +406,9 @@ def test_flashinfer_decode_with_paged_fp8_kv( query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) NUM_BLOCKS_FP8 = 2048 - key_value_cache = torch.randn(NUM_BLOCKS_FP8, - 2, - block_size, - num_kv_heads, - head_size, - dtype=dtype) + key_value_cache = torch.randn( + NUM_BLOCKS_FP8, 2, 
block_size, num_kv_heads, head_size, dtype=dtype + ) key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1) key_cache /= head_size**0.5 value_cache /= head_size**0.5 @@ -416,14 +418,13 @@ def test_flashinfer_decode_with_paged_fp8_kv( key_cache_fp8 = (key_cache / k_scale).to(kv_cache_dtype) value_cache_fp8 = (value_cache / v_scale).to(kv_cache_dtype) - assert (key_cache_fp8.shape[1] == 1 and value_cache_fp8.shape[1] == 1) + assert key_cache_fp8.shape[1] == 1 and value_cache_fp8.shape[1] == 1 kv_cache_fp8 = torch.cat([key_cache_fp8, value_cache_fp8], dim=1) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - NUM_BLOCKS_FP8, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) + block_tables = torch.randint( + 0, NUM_BLOCKS_FP8, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) kv_indptr = [0] kv_indices = [] @@ -444,32 +445,38 @@ def test_flashinfer_decode_with_paged_fp8_kv( kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) - wrapper = flashinfer.\ - BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD", - use_tensor_cores=use_tensor_cores) - wrapper.plan(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - q_data_type=dtype, - kv_data_type=kv_cache_dtype, - logits_soft_cap=soft_cap) + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores + ) + wrapper.plan( + kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap, + ) output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) - ref_output = ref_paged_attn(query=query, - key_cache=key_cache, - value_cache=value_cache, - query_lens=[1] * num_seqs, - kv_lens=kv_lens, - block_tables=block_tables, - scale=scale, - soft_cap=soft_cap) + ref_output = ref_paged_attn( + query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=[1] * num_seqs, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + soft_cap=soft_cap, + ) # Temporary fix: Increasing the tolerance. 
Seems like a flashinfer issue - torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \ - f"{torch.max(torch.abs(output - ref_output))}" + ( + torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), + f"{torch.max(torch.abs(output - ref_output))}", + ) diff --git a/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py index 96eee13695a9..e22e893acf8c 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py @@ -9,8 +9,9 @@ from vllm.platforms import current_platform if not current_platform.is_device_capability(100): - pytest.skip("This TRTLLM kernel requires NVIDIA Blackwell.", - allow_module_level=True) + pytest.skip( + "This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True + ) FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 @@ -72,10 +73,9 @@ def test_flashinfer_trtllm_decode_with_baseline( key_value_cache = torch.randn(kv_cache_shape, dtype=dtype) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - NUM_BLOCKS, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) + block_tables = torch.randint( + 0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) k_scale = v_scale = 1.0 kv_indptr = [0] kv_indices = [] @@ -96,30 +96,30 @@ def test_flashinfer_trtllm_decode_with_baseline( kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) - wrapper = flashinfer.\ - BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, kv_layout, - use_tensor_cores=( - (num_query_heads//num_kv_heads) > 4) - ) - wrapper.plan(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - q_data_type=dtype, - kv_data_type=dtype, - logits_soft_cap=soft_cap) + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, + kv_layout, + use_tensor_cores=((num_query_heads // num_kv_heads) > 4), + ) + wrapper.plan( + kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, + ) output = wrapper.run(query, key_value_cache, scale) # TRTLLM Decode max_kv_len = max(kv_lens) - kv_lens_tensor = torch.tensor(kv_lens, - dtype=torch.int, - device=query.device) + kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int, device=query.device) output_trtllm = flashinfer.decode.trtllm_batch_decode_with_kv_cache( query.contiguous(), key_value_cache, @@ -136,5 +136,7 @@ def test_flashinfer_trtllm_decode_with_baseline( v_scale, ) - torch.testing.assert_close(output, output_trtllm, atol=1e-2, rtol=1e-2), \ - f"{torch.max(torch.abs(output - output_trtllm))}" + ( + torch.testing.assert_close(output, output_trtllm, atol=1e-2, rtol=1e-2), + f"{torch.max(torch.abs(output - output_trtllm))}", + ) diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py index 21b08e45fd6f..43b027ba8226 100644 --- a/tests/kernels/attention/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -7,24 +7,28 @@ import pytest import torch -from vllm.attention.ops.flashmla import (flash_mla_with_kvcache, - get_mla_metadata, - is_flashmla_supported) +from vllm.attention.ops.flashmla import ( + flash_mla_with_kvcache, + get_mla_metadata, + 
is_flashmla_supported, +) from vllm.triton_utils import triton def cal_diff(x: torch.Tensor, y: torch.Tensor, name: str) -> None: x, y = x.double(), y.double() - cos_diff = 1 - 2 * (x * y).sum().item() / max( - (x * x + y * y).sum().item(), 1e-12) + cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12) assert cos_diff < 1e-5 -FLASH_MLA_UNSUPPORTED_REASON = is_flashmla_supported()[1] \ - if not is_flashmla_supported()[0] else "FlashMLA is supported" +FLASH_MLA_UNSUPPORTED_REASON = ( + is_flashmla_supported()[1] + if not is_flashmla_supported()[0] + else "FlashMLA is supported" +) -@pytest.mark.skipif(not is_flashmla_supported()[0], - reason=FLASH_MLA_UNSUPPORTED_REASON) + +@pytest.mark.skipif(not is_flashmla_supported()[0], reason=FLASH_MLA_UNSUPPORTED_REASON) @pytest.mark.parametrize("b", [128]) @pytest.mark.parametrize("s_q", [1, 2]) @pytest.mark.parametrize("mean_sk", [4096, 8192]) @@ -36,8 +40,7 @@ def cal_diff(x: torch.Tensor, y: torch.Tensor, name: str) -> None: @pytest.mark.parametrize("causal", [True]) @pytest.mark.parametrize("varlen", [False, True]) @torch.inference_mode() -def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, - varlen): +def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, varlen): # TODO: parametrize using pytest dtype = torch.bfloat16 device = torch.device("cuda:0") @@ -47,30 +50,32 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, torch.manual_seed(0) random.seed(0) - print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, " - f"{d=}, {dv=}, {causal=}, {varlen=}") + print( + f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {varlen=}" + ) - cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32) + cache_seqlens = torch.full((b,), mean_sk, dtype=torch.int32) if varlen: for i in range(b): - cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2), - s_q) + cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2), s_q) total_seqlens = cache_seqlens.sum().item() max_seqlen = cache_seqlens.max().item() max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256 q = torch.randn(b, s_q, h_q, d) - block_table = torch.arange(b * max_seqlen_pad // block_size, - dtype=torch.int32).view( - b, max_seqlen_pad // block_size) + block_table = torch.arange( + b * max_seqlen_pad // block_size, dtype=torch.int32 + ).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) for i in range(b): - blocked_k.view(b, max_seqlen_pad, h_kv, - d)[i, cache_seqlens[i].item():] = float("nan") + blocked_k.view(b, max_seqlen_pad, h_kv, d)[i, cache_seqlens[i].item() :] = ( + float("nan") + ) blocked_v = blocked_k[..., :dv] tile_scheduler_metadata, num_splits = get_mla_metadata( - cache_seqlens, s_q * h_q // h_kv, h_kv) + cache_seqlens, s_q * h_q // h_kv, h_kv + ) def flash_mla(): return flash_mla_with_kvcache( @@ -95,8 +100,7 @@ def scaled_dot_product_attention(query, key, value, is_causal=False): s_q = query.shape[-2] s_k = key.shape[-2] attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype) - temp_mask = torch.ones(s_q, s_k, - dtype=torch.bool).tril(diagonal=s_k - s_q) + temp_mask = torch.ones(s_q, s_k, dtype=torch.bool).tril(diagonal=s_k - s_q) attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) attn_bias.to(query.dtype) attn_weight += attn_bias @@ -127,7 +131,7 @@ def ref_mla(): t = triton.testing.do_bench(flash_mla) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * 
h_q * d + - b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) - print(f"{t:.3f} ms, {FLOPS / 10 ** 9 / t:.0f} " - f"TFLOPS, {bytes / 10 ** 6 / t:.0f} GB/s") + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( + torch.finfo(dtype).bits // 8 + ) + print(f"{t:.3f} ms, {FLOPS / 10**9 / t:.0f} TFLOPS, {bytes / 10**6 / t:.0f} GB/s") diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py index de45ee1ed5cc..0e3da986299e 100644 --- a/tests/kernels/attention/test_lightning_attn.py +++ b/tests/kernels/attention/test_lightning_attn.py @@ -4,8 +4,7 @@ import pytest import torch -from vllm.model_executor.layers.lightning_attn import ( - linear_decode_forward_triton) +from vllm.model_executor.layers.lightning_attn import linear_decode_forward_triton from vllm.platforms import current_platform NUM_HEADS = [4, 8] @@ -17,8 +16,8 @@ def reference_lightning_attention(q, k, v, ed, block_size, kv_history): """Reference implementation of lightning attention core algorithm - - The difference from the main implementation is that this processes + + The difference from the main implementation is that this processes each step sequentially, instead of using parallelized triton kernels """ B, H, S, D = q.shape @@ -62,8 +61,7 @@ def reference_lightning_attention(q, k, v, ed, block_size, kv_history): # The actual implementation returns a tensor of shape [B, H, 2, D, E] # where dimension 2 contains both KV and KV history kv_reshaped = kv_cache.unsqueeze(2) # [B, H, 1, D, E] - final_kv_cache = torch.cat([kv_reshaped, kv_reshaped], - dim=2) # [B, H, 2, D, E] + final_kv_cache = torch.cat([kv_reshaped, kv_reshaped], dim=2) # [B, H, 2, D, E] return output, final_kv_cache @@ -109,7 +107,7 @@ def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx): out_h = torch.matmul(q_bh, kv_new) # Update output and cache - output[b, h * D:(h + 1) * D] = out_h + output[b, h * D : (h + 1) * D] = out_h kv_caches[b, h] = kv_new return output @@ -135,12 +133,9 @@ def test_linear_decode_forward_triton( k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) - kv_caches = base * torch.randn(batch_size, - num_heads, - head_size, - head_size, - dtype=dtype, - device="cuda") + kv_caches = base * torch.randn( + batch_size, num_heads, head_size, head_size, dtype=dtype, device="cuda" + ) kv_caches_copy = kv_caches.clone() @@ -150,15 +145,14 @@ def test_linear_decode_forward_triton( slot_idx = torch.arange(batch_size, device="cuda") - triton_output = linear_decode_forward_triton(q, k, v, kv_caches, - slope_rate, slot_idx) + triton_output = linear_decode_forward_triton( + q, k, v, kv_caches, slope_rate, slot_idx + ) - reference_output = reference_linear_decode(q, k, v, kv_caches_copy, - slope_rate, slot_idx) - torch.testing.assert_close(triton_output, - reference_output, - rtol=1e-1, - atol=1e-1) + reference_output = reference_linear_decode( + q, k, v, kv_caches_copy, slope_rate, slot_idx + ) + torch.testing.assert_close(triton_output, reference_output, rtol=1e-1, atol=1e-1) torch.testing.assert_close(kv_caches, kv_caches_copy, rtol=1e-1, atol=1e-1) assert triton_output.shape == (batch_size, num_heads * head_size) @@ -184,12 +178,9 @@ def test_linear_decode_forward_triton_with_padding( k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) - kv_caches = base * 
torch.randn(batch_size, - num_heads, - head_size, - head_size, - dtype=dtype, - device="cuda") + kv_caches = base * torch.randn( + batch_size, num_heads, head_size, head_size, dtype=dtype, device="cuda" + ) kv_caches_copy = kv_caches.clone() @@ -199,14 +190,15 @@ def test_linear_decode_forward_triton_with_padding( slot_idx = torch.tensor([0, 1, -1, 2], device="cuda") - triton_output = linear_decode_forward_triton(q, k, v, kv_caches, - slope_rate, slot_idx) + triton_output = linear_decode_forward_triton( + q, k, v, kv_caches, slope_rate, slot_idx + ) - reference_output = reference_linear_decode(q, k, v, kv_caches_copy, - slope_rate, slot_idx) + reference_output = reference_linear_decode( + q, k, v, kv_caches_copy, slope_rate, slot_idx + ) - padding_mask = (slot_idx - != -1).unsqueeze(1).expand(-1, num_heads * head_size) + padding_mask = (slot_idx != -1).unsqueeze(1).expand(-1, num_heads * head_size) triton_masked = triton_output[padding_mask] reference_masked = reference_output[padding_mask] @@ -217,15 +209,11 @@ def test_linear_decode_forward_triton_with_padding( for i in range(batch_size): if valid_indices[i] > 0: - torch.testing.assert_close(kv_caches[i], - kv_caches_copy[i], - rtol=rtol, - atol=atol) + torch.testing.assert_close( + kv_caches[i], kv_caches_copy[i], rtol=rtol, atol=atol + ) - torch.testing.assert_close(triton_masked, - reference_masked, - rtol=rtol, - atol=atol) + torch.testing.assert_close(triton_masked, reference_masked, rtol=rtol, atol=atol) assert triton_output.shape == (batch_size, num_heads * head_size) @@ -249,39 +237,33 @@ def test_lightning_attention_reference( current_platform.seed_everything(42) base = 0.01 - q = base * torch.randn( - batch_size, num_heads, seq_len, head_size, dtype=dtype) - k = base * torch.randn( - batch_size, num_heads, seq_len, head_size, dtype=dtype) - v = base * torch.randn( - batch_size, num_heads, seq_len, head_size, dtype=dtype) + q = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype) + k = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype) + v = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype) ed = torch.zeros(num_heads, device="cuda") for h in range(num_heads): ed[h] = 0.1 * (h + 1) - kv_history = base * torch.randn(batch_size, - num_heads, - head_size, - head_size, - dtype=dtype, - device="cuda") + kv_history = base * torch.randn( + batch_size, num_heads, head_size, head_size, dtype=dtype, device="cuda" + ) kv_history_clone = kv_history.clone() ref_output, ref_kv_cache = reference_lightning_attention( - q, k, v, ed, 256, kv_history) + q, k, v, ed, 256, kv_history + ) from vllm.model_executor.layers.lightning_attn import lightning_attention + actual_output, actual_kv_cache = lightning_attention( - q, k, v, ed, 256, kv_history_clone) + q, k, v, ed, 256, kv_history_clone + ) atol, rtol = 1.5e-1, 1.5e-1 torch.testing.assert_close(ref_output, actual_output, rtol=rtol, atol=atol) - torch.testing.assert_close(ref_kv_cache, - actual_kv_cache, - rtol=rtol, - atol=atol) + torch.testing.assert_close(ref_kv_cache, actual_kv_cache, rtol=rtol, atol=atol) assert ref_output.shape == (batch_size, num_heads, seq_len, head_size) assert ref_kv_cache.shape == actual_kv_cache.shape diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py index 9d1a301ebe30..eb9204dfaf15 100644 --- a/tests/kernels/attention/test_merge_attn_states.py +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -7,19 +7,20 @@ from 
vllm._custom_ops import merge_attn_states as merge_attn_states_cuda from vllm.attention.ops.triton_merge_attn_states import ( - merge_attn_states as merge_attn_states_triton) + merge_attn_states as merge_attn_states_triton, +) from vllm.platforms import current_platform # Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 # can be used to combine partial attention results (in the split-KV case) def merge_attn_states_torch( - output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] - prefix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] - prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] - suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] - suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] - output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS] + output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] + suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] + output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS] ): p_lse = prefix_lse s_lse = suffix_lse @@ -32,15 +33,13 @@ def merge_attn_states_torch( s_lse = s_lse - max_lse p_lse_exp = torch.exp(p_lse) s_lse_exp = torch.exp(s_lse) - out_se = (p_lse_exp + s_lse_exp) + out_se = p_lse_exp + s_lse_exp if output_lse is not None: output_lse = torch.log(out_se) + max_lse p_scale = p_lse_exp / out_se # [NUM_HEADS, NUM_TOKENS] s_scale = s_lse_exp / out_se # [NUM_HEADS, NUM_TOKENS] - p_scale = torch.transpose(p_scale, 0, - 1).unsqueeze(2) # [NUM_TOKENS, NUM_HEADS, 1] - s_scale = torch.transpose(s_scale, 0, - 1).unsqueeze(2) # [NUM_TOKENS, NUM_HEADS, 1] + p_scale = torch.transpose(p_scale, 0, 1).unsqueeze(2) # [NUM_TOKENS, NUM_HEADS, 1] + s_scale = torch.transpose(s_scale, 0, 1).unsqueeze(2) # [NUM_TOKENS, NUM_HEADS, 1] output = prefix_output * p_scale + suffix_output * s_scale return output, output_lse @@ -55,8 +54,10 @@ def merge_attn_states_torch( def generate_markdown_table(): global all_case_info - table_header = ("| tokens | heads | headsize | dtype " - "| device | torch | triton | cuda | speedup |") + table_header = ( + "| tokens | heads | headsize | dtype " + "| device | torch | triton | cuda | speedup |" + ) table_separator = "| --- | --- | --- | --- | --- | --- | --- | --- | --- |" def shortly_dtype(dtype: torch.dtype) -> str: @@ -68,16 +69,26 @@ def shortly_device(device: str) -> str: print(table_header) print(table_separator) for info in all_case_info: - (num_tokens, num_heads, head_size, dtype, device, - avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel, - performance_improved) = info + ( + num_tokens, + num_heads, + head_size, + dtype, + device, + avg_time_torch_kernel, + avg_time_triton_kernel, + avg_time_cuda_kernel, + performance_improved, + ) = info dtype = shortly_dtype(dtype) device = shortly_device(device) - print(f"| {num_tokens} | {num_heads} | {head_size} " - f"| {dtype} | {device} | {avg_time_torch_kernel:.5f}ms " - f"| {avg_time_triton_kernel:.5f}ms " - f"| {avg_time_cuda_kernel:.5f}ms " - f"| {performance_improved:.4f}x |") + print( + f"| {num_tokens} | {num_heads} | {head_size} " + f"| {dtype} | {device} | {avg_time_torch_kernel:.5f}ms " + f"| {avg_time_triton_kernel:.5f}ms " + f"| {avg_time_cuda_kernel:.5f}ms " + f"| {performance_improved:.4f}x |" + ) @pytest.mark.parametrize("num_tokens", NUM_BATCH_TOKENS) @@ -85,29 +96,28 @@ def 
shortly_device(device: str) -> str: @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("output_dtype", DTYPES) @torch.inference_mode() -def test_merge_attn_states(num_tokens: int, num_query_heads: int, - head_size: int, output_dtype: torch.dtype): +def test_merge_attn_states( + num_tokens: int, num_query_heads: int, head_size: int, output_dtype: torch.dtype +): if not current_platform.is_cuda(): - pytest.skip('Currently only support compare triton merge_attn_states ' - 'with custom cuda merge_attn_states kernel') + pytest.skip( + "Currently only support compare triton merge_attn_states " + "with custom cuda merge_attn_states kernel" + ) NUM_TOKENS = num_tokens NUM_HEADS = num_query_heads HEAD_SIZE = head_size - print(f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, " - f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, " - f"Device: {current_platform.get_device_name()}") + print( + f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, " + f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, " + f"Device: {current_platform.get_device_name()}" + ) # prefix_lse and suffix_lse contain inf and normal values - prefix_lse = torch.randn(NUM_HEADS, - NUM_TOKENS, - dtype=torch.float32, - device="cuda") - suffix_lse = torch.randn(NUM_HEADS, - NUM_TOKENS, - dtype=torch.float32, - device="cuda") + prefix_lse = torch.randn(NUM_HEADS, NUM_TOKENS, dtype=torch.float32, device="cuda") + suffix_lse = torch.randn(NUM_HEADS, NUM_TOKENS, dtype=torch.float32, device="cuda") # Generate boolean masks mask_prefix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1 @@ -117,23 +127,23 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int, mask_prefix = torch.logical_and(mask_prefix, ~combined_mask) mask_suffix = torch.logical_and(mask_suffix, ~combined_mask) - prefix_lse[mask_prefix] = float('inf') - suffix_lse[mask_suffix] = float('inf') + prefix_lse[mask_prefix] = float("inf") + suffix_lse[mask_suffix] = float("inf") # Other input tensors (need to be initialized but # no actual calculation needed) - output = torch.zeros((NUM_TOKENS, NUM_HEADS, HEAD_SIZE), - dtype=output_dtype, - device="cuda") - output_lse = torch.zeros((NUM_HEADS, NUM_TOKENS), - dtype=torch.float32, - device="cuda") - prefix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE), - dtype=output_dtype, - device="cuda") - suffix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE), - dtype=output_dtype, - device="cuda") + output = torch.zeros( + (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda" + ) + output_lse = torch.zeros( + (NUM_HEADS, NUM_TOKENS), dtype=torch.float32, device="cuda" + ) + prefix_output = torch.randn( + (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda" + ) + suffix_output = torch.randn( + (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda" + ) warmup_times = 2 repeat_times = 20 @@ -149,15 +159,25 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int, suffix_lse_torch = suffix_lse.clone() for _ in range(warmup_times): output_torch, output_lse_torch = merge_attn_states_torch( - output_torch, prefix_output, prefix_lse_torch, suffix_output, - suffix_lse_torch, output_lse_torch) + output_torch, + prefix_output, + prefix_lse_torch, + suffix_output, + suffix_lse_torch, + output_lse_torch, + ) torch.cuda.synchronize() for _ in range(repeat_times): start.record() output_torch, output_lse_torch = merge_attn_states_torch( - output_torch, prefix_output, prefix_lse_torch, suffix_output, - suffix_lse_torch, output_lse_torch) + output_torch, + 
prefix_output, + prefix_lse_torch, + suffix_output, + suffix_lse_torch, + output_lse_torch, + ) end.record() torch.cuda.synchronize() total_time_torch_kernel += start.elapsed_time(end) @@ -173,16 +193,26 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int, end = torch.cuda.Event(enable_timing=True) for _ in range(warmup_times): - merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse, - suffix_output, suffix_lse, - output_lse_ref_triton) + merge_attn_states_triton( + output_ref_triton, + prefix_output, + prefix_lse, + suffix_output, + suffix_lse, + output_lse_ref_triton, + ) torch.cuda.synchronize() for _ in range(repeat_times): start.record() - merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse, - suffix_output, suffix_lse, - output_lse_ref_triton) + merge_attn_states_triton( + output_ref_triton, + prefix_output, + prefix_lse, + suffix_output, + suffix_lse, + output_lse_ref_triton, + ) end.record() torch.cuda.synchronize() total_time_triton_kernel += start.elapsed_time(end) @@ -195,14 +225,26 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int, output_lse_cuda = output_lse.clone() for _ in range(warmup_times): - merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse, - suffix_output, suffix_lse, output_lse_cuda) + merge_attn_states_cuda( + output_cuda, + prefix_output, + prefix_lse, + suffix_output, + suffix_lse, + output_lse_cuda, + ) torch.cuda.synchronize() for _ in range(repeat_times): start.record() - merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse, - suffix_output, suffix_lse, output_lse_cuda) + merge_attn_states_cuda( + output_cuda, + prefix_output, + prefix_lse, + suffix_output, + suffix_lse, + output_lse_cuda, + ) end.record() torch.cuda.synchronize() total_time_cuda_kernel += start.elapsed_time(end) @@ -213,8 +255,10 @@ def test_merge_attn_states(num_tokens: int, num_query_heads: int, performance_improved = avg_time_triton_kernel / avg_time_cuda_kernel print(f" Torch time: {avg_time_torch_kernel:.6f}ms") print(f"Triton time: {avg_time_triton_kernel:.6f}ms") - print(f" CUDA time: {avg_time_cuda_kernel:.6f}ms, " - f"Performance: {performance_improved:.5f}x") + print( + f" CUDA time: {avg_time_cuda_kernel:.6f}ms, " + f"Performance: {performance_improved:.5f}x" + ) print("-" * 100) # 4. Correctness compare @@ -232,35 +276,45 @@ def diff(a: torch.Tensor, b: torch.Tensor): # states operation. output_ref = output_ref_triton output_lse_ref = output_lse_ref_triton - torch.testing.assert_close(output_cuda.float(), - output_ref.float(), - atol=1e-3, - rtol=rtol) + torch.testing.assert_close( + output_cuda.float(), output_ref.float(), atol=1e-3, rtol=rtol + ) print("Output all match, max abs diff:") print(f"(Triton vs Torch) : {diff(output_torch, output_ref)}") print(f" (CUDA vs Torch) : {diff(output_torch, output_cuda)}") print(f" (CUDA vs Triton): {diff(output_ref, output_cuda)}") print("-" * 100) - torch.testing.assert_close(output_lse_cuda.float(), - output_lse_ref.float(), - atol=1e-3, - rtol=rtol) + torch.testing.assert_close( + output_lse_cuda.float(), output_lse_ref.float(), atol=1e-3, rtol=rtol + ) print("Output LSE all match, max abs diff:") print(f"(Triton vs Torch) : {diff(output_lse_torch, output_lse_ref)}") print(f" (CUDA vs Torch) : {diff(output_lse_torch, output_lse_cuda)}") print(f" (CUDA vs Triton): {diff(output_lse_ref, output_lse_cuda)}") print("-" * 100) - print("All output values test passed! 
All inf values " - "are correctly replaced with -inf.") + print( + "All output values test passed! All inf values " + "are correctly replaced with -inf." + ) print("-" * 100) device = current_platform.get_device_name() all_case_info.append( - (NUM_TOKENS, NUM_HEADS, HEAD_SIZE, output_dtype, device, - avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel, - performance_improved)) - if len(all_case_info) == (len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) * - len(NUM_QUERY_HEADS) * len(DTYPES)): + ( + NUM_TOKENS, + NUM_HEADS, + HEAD_SIZE, + output_dtype, + device, + avg_time_torch_kernel, + avg_time_triton_kernel, + avg_time_cuda_kernel, + performance_improved, + ) + ) + if len(all_case_info) == ( + len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) * len(NUM_QUERY_HEADS) * len(DTYPES) + ): generate_markdown_table() diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 53c37554b15a..f97dd50cb5ca 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -5,6 +5,7 @@ * Tests for MultiHeadAttention layer """ + from unittest.mock import patch import pytest @@ -20,8 +21,7 @@ @pytest.fixture(autouse=True) def clear_cache(): - """Clear lru cache to ensure each test case runs without caching. - """ + """Clear lru cache to ensure each test case runs without caching.""" _cached_get_attn_backend.cache_clear() @@ -74,9 +74,11 @@ def ref_attention( NUM_KV_HEADS = [1] HEAD_SIZES = [64, 80] # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} -DTYPES = [ - torch.half, torch.bfloat16, torch.float -] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] +DTYPES = ( + [torch.half, torch.bfloat16, torch.float] + if not current_platform.is_rocm() + else [torch.half, torch.bfloat16] +) CUDA_DEVICES = ["cuda"] @@ -104,10 +106,9 @@ def test_mha_attn_forward( k = torch.randn(batch_size, seq_len, num_kv_heads * head_size) v = torch.randn(batch_size, seq_len, num_kv_heads * head_size) scale = 1.0 / head_size**0.5 - attn = MultiHeadAttention(num_heads, - head_size, - scale=scale, - num_kv_heads=num_kv_heads) + attn = MultiHeadAttention( + num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads + ) output = attn(q, k, v) assert num_heads % num_kv_heads == 0 diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py index f8b307c595de..44f3e42e8714 100644 --- a/tests/kernels/attention/test_mla_decode_cpu.py +++ b/tests/kernels/attention/test_mla_decode_cpu.py @@ -11,30 +11,24 @@ def ref_mla( - out: Tensor, # (bs, num_heads, v_head_dim) - query: Tensor, # (bs, num_heads, head_dim) - kv_cache: Tensor, # (num_blocks, block_size, head_dim) - scale: float, - block_tables: Tensor, # (bs, max_num_blocks) - seq_lens: Tensor, # (bs,) + out: Tensor, # (bs, num_heads, v_head_dim) + query: Tensor, # (bs, num_heads, head_dim) + kv_cache: Tensor, # (num_blocks, block_size, head_dim) + scale: float, + block_tables: Tensor, # (bs, max_num_blocks) + seq_lens: Tensor, # (bs,) ): bs, num_heads, v_head_dim = out.shape head_dim = query.shape[2] for i in range(bs): # gather and flatten KV-cache - kv = kv_cache[ - block_tables[i]] # (max_num_blocks, block_size, head_dim) - kv = kv.view(1, -1, - head_dim)[:, :seq_lens[i]] # (1, seq_len, head_dim) + kv = kv_cache[block_tables[i]] # (max_num_blocks, block_size, head_dim) + kv = kv.view(1, -1, head_dim)[:, : seq_lens[i]] # (1, seq_len, head_dim) v = kv[:, :, :v_head_dim] q = query[i].view(num_heads, 1, head_dim) - o = 
F.scaled_dot_product_attention(q, - kv, - v, - scale=scale, - enable_gqa=True) + o = F.scaled_dot_product_attention(q, kv, v, scale=scale, enable_gqa=True) out[i] = o.view(num_heads, v_head_dim) return out @@ -63,18 +57,17 @@ def test_mla_decode_cpu( torch.set_default_dtype(dtype) torch.manual_seed(0) - scale = d**(-0.5) + scale = d ** (-0.5) if varlen: seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2) seq_lens = seq_lens.clip(2).to(torch.int32) else: - seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32) + seq_lens = torch.full((bs,), mean_seq_len, dtype=torch.int32) max_seq_len = seq_lens.max().item() seqlen_pad = cdiv(max_seq_len, 256) * 256 # is this necessary? q = torch.randn(bs, h_q, d) - block_table = torch.arange(bs * seqlen_pad // block_size, - dtype=torch.int32) + block_table = torch.arange(bs * seqlen_pad // block_size, dtype=torch.int32) block_table = block_table.view(bs, seqlen_pad // block_size) kv_cache = torch.randn(block_table.numel(), block_size, d) @@ -82,8 +75,7 @@ def test_mla_decode_cpu( kv_cache.view(bs, seqlen_pad, d)[i, seq_len:] = float("nan") out_mla = q.new_zeros(bs, h_q, dv) - ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table, - seq_lens) + ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table, seq_lens) out_ref = q.new_zeros(bs, h_q, dv) ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens) diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index b09e1bbc4279..5acab484ecdc 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -12,8 +12,7 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask from vllm.attention.backends.xformers import _make_alibi_bias -from vllm.attention.ops.chunked_prefill_paged_decode import ( - chunked_prefill_paged_decode) +from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode from vllm.attention.ops.prefix_prefill import context_attention_fwd from vllm.platforms import current_platform from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE @@ -22,9 +21,7 @@ NUM_QUERIES_PER_KV = [1, 8, 64] HEAD_SIZES = [128, 96, 24] DTYPES = [torch.float16] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048] KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"] @@ -50,12 +47,10 @@ def test_contexted_kv_attention( device: str, op: Callable, ) -> None: - - if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability( - 89): + if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89): pytest.skip( - 'Triton limitation: fp8e4nv data type is not supported on CUDA' - ' arch < 89') + "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89" + ) current_platform.seed_everything(0) torch.set_default_device(device) @@ -93,38 +88,29 @@ def test_contexted_kv_attention( cache_dtype = dtype else: cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype] - k_cache = torch.zeros(cache_size, - block_size, - num_kv_heads, - head_size, - dtype=cache_dtype) - v_cache = torch.zeros(cache_size, - block_size, - num_kv_heads, - head_size, - dtype=cache_dtype) + k_cache = torch.zeros( + cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype + ) + v_cache = torch.zeros( + cache_size, block_size, num_kv_heads, 
head_size, dtype=cache_dtype + ) k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) values = torch.arange(0, cache_size, dtype=torch.long) values = values[torch.randperm(cache_size)] - block_table = values[:BS * max_block_per_request].view( - BS, max_block_per_request) + block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) b_seq_len = torch.tensor(seq_lens, dtype=torch.long) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, - dtype=torch.long), - dim=0) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.long), dim=0) max_input_len = MAX_SEQ_LEN # copy kv to cache - b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], - dtype=torch.long), - dim=0) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_lens[:-1], dtype=torch.long), dim=0 + ) for i in range(BS): for j in range(query_lens[i]): - k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + - j]) - v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + - b_ctx_len[i] + j]) + k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + j]) + v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + b_ctx_len[i] + j]) cur_ctx = 0 block_id = 0 while cur_ctx < b_ctx_len[i]: @@ -135,61 +121,71 @@ def test_contexted_kv_attention( end_loc = start_loc + block_size start_slot = block_table[i, block_id] * block_size end_slot = start_slot + end_loc - start_loc - k_cache.view(-1, num_kv_heads, - head_size)[start_slot:end_slot].copy_( - key[start_loc:end_loc]) - v_cache.view(-1, num_kv_heads, - head_size)[start_slot:end_slot].copy_( - value[start_loc:end_loc]) + k_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_( + key[start_loc:end_loc] + ) + v_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_( + value[start_loc:end_loc] + ) cur_ctx += block_size block_id += 1 # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size] # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8] - k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8, - 8).permute(0, 2, 3, 1, 4).contiguous() + k_cache = ( + k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8) + .permute(0, 2, 3, 1, 4) + .contiguous() + ) # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size] # to V_cache[num_blocks, num_kv_heads, head_size, block_size] - v_cache = v_cache.view(-1, block_size, num_kv_heads, - head_size).permute(0, 2, 3, 1).contiguous() + v_cache = ( + v_cache.view(-1, block_size, num_kv_heads, head_size) + .permute(0, 2, 3, 1) + .contiguous() + ) k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Warm up the Triton kernel by calling it once before actually measuring # generation time - op(query, - k, - v, - output, - kv_cache_dtype, - k_cache, - v_cache, - block_table, - b_start_loc, - b_seq_len, - MAX_CTX_LEN, - max_input_len, - k_scale, - v_scale, - sliding_window=sliding_window) + op( + query, + k, + v, + output, + kv_cache_dtype, + k_cache, + v_cache, + block_table, + b_start_loc, + b_seq_len, + MAX_CTX_LEN, + max_input_len, + k_scale, + v_scale, + sliding_window=sliding_window, + ) torch.cuda.synchronize() start_time = time.time() - op(query, - k, - v, - output, - kv_cache_dtype, - k_cache, - v_cache, - block_table, - b_start_loc, - b_seq_len, - MAX_CTX_LEN, - max_input_len, - k_scale, - v_scale, - sliding_window=sliding_window) + 
op( + query, + k, + v, + output, + kv_cache_dtype, + k_cache, + v_cache, + block_table, + b_start_loc, + b_seq_len, + MAX_CTX_LEN, + max_input_len, + k_scale, + v_scale, + sliding_window=sliding_window, + ) torch.cuda.synchronize() end_time = time.time() - print(f"triton Time: {(end_time - start_time)*1000:.2f} ms") + print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms") scale = float(1.0 / (head_size**0.5)) @@ -201,22 +197,24 @@ def test_contexted_kv_attention( # heads. # # see also: vllm/model_executor/layers/attention.py - query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv, - query.shape[-1]) - key = key[:, :, None, :].expand(key.shape[0], num_kv_heads, - num_queries_per_kv, key.shape[-1]) - value = value[:, :, - None, :].expand(value.shape[0], num_kv_heads, - num_queries_per_kv, value.shape[-1]) + query = query.view( + query.shape[0], num_kv_heads, num_queries_per_kv, query.shape[-1] + ) + key = key[:, :, None, :].expand( + key.shape[0], num_kv_heads, num_queries_per_kv, key.shape[-1] + ) + value = value[:, :, None, :].expand( + value.shape[0], num_kv_heads, num_queries_per_kv, value.shape[-1] + ) query = query.unsqueeze(0) key = key.unsqueeze(0) value = value.unsqueeze(0) attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens) + query_lens, seq_lens + ) if sliding_window > 0: - attn_bias = attn_bias.make_local_attention_from_bottomright( - sliding_window) + attn_bias = attn_bias.make_local_attention_from_bottomright(sliding_window) output_ref = xops.memory_efficient_attention_forward( query, key, @@ -239,7 +237,7 @@ def test_contexted_kv_attention( ) torch.cuda.synchronize() end_time = time.time() - print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms") + print(f"xformers Time: {(end_time - start_time) * 1000:.2f} ms") output_ref = output_ref.reshape(output.shape) atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-4 torch.testing.assert_close(output, output_ref, atol=atol, rtol=0) @@ -262,12 +260,10 @@ def test_contexted_kv_attention_alibi( device: str, op: Callable, ) -> None: - - if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability( - 89): + if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89): pytest.skip( - 'Triton limitation: fp8e4nv data type is not supported on CUDA' - ' arch < 89') + "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89" + ) current_platform.seed_everything(0) torch.set_default_device(device) @@ -280,9 +276,9 @@ def test_contexted_kv_attention_alibi( def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: # Fork from: vllm/vllm/model_executor/models/bloom.py#L44 - closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) + closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads)) base = torch.tensor( - 2**(-(2**-(math.log2(closest_power_of_2) - 3))), + 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=torch.float32, ) powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) @@ -290,17 +286,16 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: if closest_power_of_2 != total_num_heads: extra_base = torch.tensor( - 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), + 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=torch.float32, ) - num_remaining_heads = min(closest_power_of_2, - total_num_heads - closest_power_of_2) - extra_powers = torch.arange(start=1, - end=1 + 2 * num_remaining_heads, - step=2, - dtype=torch.int32) - slopes = torch.cat( - [slopes, 
torch.pow(extra_base, extra_powers)], dim=0) + num_remaining_heads = min( + closest_power_of_2, total_num_heads - closest_power_of_2 + ) + extra_powers = torch.arange( + start=1, end=1 + 2 * num_remaining_heads, step=2, dtype=torch.int32 + ) + slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) return slopes alibi_slopes = _get_alibi_slopes(num_heads).to(device) @@ -328,38 +323,29 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: cache_dtype = dtype else: cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype] - k_cache = torch.zeros(cache_size, - block_size, - num_kv_heads, - head_size, - dtype=cache_dtype) - v_cache = torch.zeros(cache_size, - block_size, - num_kv_heads, - head_size, - dtype=cache_dtype) + k_cache = torch.zeros( + cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype + ) + v_cache = torch.zeros( + cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype + ) k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) values = torch.arange(0, cache_size, dtype=torch.long) values = values[torch.randperm(cache_size)] - block_table = values[:BS * max_block_per_request].view( - BS, max_block_per_request) + block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) b_seq_len = torch.tensor(seq_lens, dtype=torch.long) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, - dtype=torch.long), - dim=0) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.long), dim=0) max_input_len = MAX_SEQ_LEN # copy kv to cache - b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], - dtype=torch.long), - dim=0) + b_seq_start_loc = torch.cumsum( + torch.tensor([0] + seq_lens[:-1], dtype=torch.long), dim=0 + ) for i in range(BS): for j in range(query_lens[i]): - k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + - j]) - v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + - b_ctx_len[i] + j]) + k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + j]) + v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + b_ctx_len[i] + j]) cur_ctx = 0 block_id = 0 while cur_ctx < b_ctx_len[i]: @@ -370,82 +356,90 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: end_loc = start_loc + block_size start_slot = block_table[i, block_id] * block_size end_slot = start_slot + end_loc - start_loc - k_cache.view(-1, num_kv_heads, - head_size)[start_slot:end_slot].copy_( - key[start_loc:end_loc]) - v_cache.view(-1, num_kv_heads, - head_size)[start_slot:end_slot].copy_( - value[start_loc:end_loc]) + k_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_( + key[start_loc:end_loc] + ) + v_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_( + value[start_loc:end_loc] + ) cur_ctx += block_size block_id += 1 # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size] # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8] - k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8, - 8).permute(0, 2, 3, 1, 4).contiguous() + k_cache = ( + k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8) + .permute(0, 2, 3, 1, 4) + .contiguous() + ) # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size] # to V_cache[num_blocks, num_kv_heads, head_size, block_size] - v_cache = v_cache.view(-1, block_size, num_kv_heads, - head_size).permute(0, 2, 3, 1).contiguous() + 
v_cache = ( + v_cache.view(-1, block_size, num_kv_heads, head_size) + .permute(0, 2, 3, 1) + .contiguous() + ) k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Warm up the Triton kernel by calling it once before actually measuring # generation time - op(query, - k, - v, - output, - kv_cache_dtype, - k_cache, - v_cache, - block_table, - b_start_loc, - b_seq_len, - MAX_CTX_LEN, - max_input_len, - k_scale, - v_scale, - alibi_slopes=alibi_slopes) + op( + query, + k, + v, + output, + kv_cache_dtype, + k_cache, + v_cache, + block_table, + b_start_loc, + b_seq_len, + MAX_CTX_LEN, + max_input_len, + k_scale, + v_scale, + alibi_slopes=alibi_slopes, + ) torch.cuda.synchronize() start_time = time.time() - op(query, - k, - v, - output, - kv_cache_dtype, - k_cache, - v_cache, - block_table, - b_start_loc, - b_seq_len, - MAX_CTX_LEN, - max_input_len, - k_scale, - v_scale, - alibi_slopes=alibi_slopes) + op( + query, + k, + v, + output, + kv_cache_dtype, + k_cache, + v_cache, + block_table, + b_start_loc, + b_seq_len, + MAX_CTX_LEN, + max_input_len, + k_scale, + v_scale, + alibi_slopes=alibi_slopes, + ) torch.cuda.synchronize() end_time = time.time() - print(f"triton Time: {(end_time - start_time)*1000:.2f} ms") + print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms") scale = float(1.0 / (head_size**0.5)) # NOTE(DefTruth): In order to reuse _make_alibi_bias function, # we have to pad query tensor before MQA/GQA expanding. if query.shape[0] != key.shape[0]: - query_pad = torch.empty(sum(seq_lens), - num_heads, - head_size, - dtype=dtype) + query_pad = torch.empty(sum(seq_lens), num_heads, head_size, dtype=dtype) query_pad.uniform_(-1e-3, 1e-3) seq_start = 0 query_start = 0 for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)): seq_end = seq_start + seq_len query_end = query_start + query_len - query_pad[seq_start:seq_end, ...] = torch.cat([ - torch.zeros( - seq_len - query_len, num_heads, head_size, dtype=dtype), - query[query_start:query_end, ...] - ], - dim=0) + query_pad[seq_start:seq_end, ...] = torch.cat( + [ + torch.zeros(seq_len - query_len, num_heads, head_size, dtype=dtype), + query[query_start:query_end, ...], + ], + dim=0, + ) seq_start += seq_len query_start += query_len query = query_pad @@ -456,11 +450,12 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: # heads. # # see also: vllm/model_executor/layers/attention.py - key = key[:, :, None, :].expand(key.shape[0], num_kv_heads, - num_queries_per_kv, key.shape[-1]) - value = value[:, :, - None, :].expand(value.shape[0], num_kv_heads, - num_queries_per_kv, value.shape[-1]) + key = key[:, :, None, :].expand( + key.shape[0], num_kv_heads, num_queries_per_kv, key.shape[-1] + ) + value = value[:, :, None, :].expand( + value.shape[0], num_kv_heads, num_queries_per_kv, value.shape[-1] + ) # [seq, num_kv_heads, num_queries_per_kv, dk]=> # [seq, num_kv_heads*num_queries_per_kv, dk] to comply with rest of the # codebase. We save some time reshaping alibi matrix at runtime. 
@@ -483,24 +478,23 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)): seq_end = seq_start + seq_len query_end = query_start + query_len - out = xops.memory_efficient_attention_forward(query[:, - seq_start:seq_end], - key[:, - seq_start:seq_end], - value[:, - seq_start:seq_end], - attn_bias=attn_bias[i], - p=0.0, - scale=scale) + out = xops.memory_efficient_attention_forward( + query[:, seq_start:seq_end], + key[:, seq_start:seq_end], + value[:, seq_start:seq_end], + attn_bias=attn_bias[i], + p=0.0, + scale=scale, + ) out = out.view_as(query[:, seq_start:seq_end]).view( - seq_len, num_heads, head_size) - output_ref[query_start:query_end, ...].copy_(out[seq_len - query_len:, - ...]) + seq_len, num_heads, head_size + ) + output_ref[query_start:query_end, ...].copy_(out[seq_len - query_len :, ...]) seq_start += seq_len query_start += query_len torch.cuda.synchronize() end_time = time.time() - print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms") + print(f"xformers Time: {(end_time - start_time) * 1000:.2f} ms") atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6 torch.testing.assert_close(output, output_ref, atol=atol, rtol=0) @@ -532,9 +526,16 @@ def test_contexted_kv_attention_f32( device: str, op: Callable, ) -> None: - test_contexted_kv_attention(num_heads, num_queries_per_kv, head_size, - sliding_window, dtype, kv_cache_dtype, device, - op) + test_contexted_kv_attention( + num_heads, + num_queries_per_kv, + head_size, + sliding_window, + dtype, + kv_cache_dtype, + device, + op, + ) @pytest.mark.optional @@ -555,5 +556,6 @@ def test_contexted_kv_attention_alibi_f32( device: str, op: Callable, ) -> None: - test_contexted_kv_attention_alibi(num_heads, num_queries_per_kv, head_size, - dtype, kv_cache_dtype, device, op) + test_contexted_kv_attention_alibi( + num_heads, num_queries_per_kv, head_size, dtype, kv_cache_dtype, device, op + ) diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py index 34311b9ccd76..68b0fa5e6838 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -11,8 +11,7 @@ @pytest.fixture(autouse=True) def clear_cache(): - """Clear lru cache to ensure each test case runs without caching. 
- """ + """Clear lru cache to ensure each test case runs without caching.""" _cached_get_attn_backend.cache_clear() @@ -21,38 +20,42 @@ def test_selector(monkeypatch: pytest.MonkeyPatch): m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") # Set the current platform to ROCm using monkeypatch - monkeypatch.setattr("vllm.attention.selector.current_platform", - RocmPlatform()) + monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform()) # Test standard ROCm attention backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert (backend.get_name() == "ROCM_FLASH" - or backend.get_name() == "TRITON_ATTN_VLLM_V1") + assert ( + backend.get_name() == "ROCM_FLASH" + or backend.get_name() == "TRITON_ATTN_VLLM_V1" + ) # MLA test for deepseek related # change the attention backend to triton MLA m.setenv(STR_BACKEND_ENV_VAR, "TRITON_MLA") - backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, - False, True) - assert (backend.get_name() == "TRITON_MLA" - or backend.get_name() == "TRITON_MLA_VLLM_V1") + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, False, True) + assert ( + backend.get_name() == "TRITON_MLA" + or backend.get_name() == "TRITON_MLA_VLLM_V1" + ) # If attention backend is None # If use_mla is true # The selected backend is triton MLA m.setenv(STR_BACKEND_ENV_VAR, None) - backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, - False, True) - assert (backend.get_name() == "TRITON_MLA" - or backend.get_name() == "TRITON_MLA_VLLM_V1") + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, False, True) + assert ( + backend.get_name() == "TRITON_MLA" + or backend.get_name() == "TRITON_MLA_VLLM_V1" + ) # change the attention backend to AITER MLA m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA") - backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, - False, True) - assert (backend.get_name() == "ROCM_AITER_MLA" - or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1") + backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, False, True) + assert ( + backend.get_name() == "ROCM_AITER_MLA" + or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1" + ) # If attention backend is None # If use_mla is true @@ -60,7 +63,8 @@ def test_selector(monkeypatch: pytest.MonkeyPatch): # The selected backend is ROCM_AITER_MLA m.setenv(STR_BACKEND_ENV_VAR, None) m.setenv("VLLM_ROCM_USE_AITER", "1") - backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, - False, True) - assert (backend.get_name() == "ROCM_AITER_MLA" - or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1") + backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, False, True) + assert ( + backend.get_name() == "ROCM_AITER_MLA" + or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1" + ) diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py index 2dca720fe330..b893a4b820d9 100644 --- a/tests/kernels/attention/test_triton_decode_attention.py +++ b/tests/kernels/attention/test_triton_decode_attention.py @@ -24,14 +24,12 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE): num_kv_splits = 8 num_pages_per_batch = cdiv(seq_len, PAGE_SIZE) - req_to_page = torch.randint(0, - CACHE_SIZE // PAGE_SIZE, - (B, num_pages_per_batch, 1), - device="cuda") + req_to_page = torch.randint( + 0, CACHE_SIZE // PAGE_SIZE, (B, num_pages_per_batch, 1), device="cuda" + ) req_to_token = req_to_page * PAGE_SIZE req_to_token = req_to_token.expand(B, 
num_pages_per_batch, PAGE_SIZE) - req_to_token = req_to_token + torch.arange(PAGE_SIZE, device="cuda").view( - 1, 1, -1) + req_to_token = req_to_token + torch.arange(PAGE_SIZE, device="cuda").view(1, 1, -1) req_to_token = req_to_token.view(B, -1) req_to_token = req_to_token[:, :seq_len].contiguous() @@ -46,7 +44,7 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE): # o will have the same shape as q o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda") - b_seq_len = torch.full((B, ), seq_len, device="cuda") + b_seq_len = torch.full((B,), seq_len, device="cuda") attn_logits = torch.empty( (B, H_Q, num_kv_splits, D_V + 1), diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index 0cb7f5963c79..290f858959f7 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -14,9 +14,11 @@ BLOCK_SIZES = [16, 32] DTYPES = [torch.float16, torch.bfloat16] -QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [ - None, torch.float8_e4m3fnuz -] +QDTYPES = ( + [None, torch.float8_e4m3fn] + if not current_platform.is_rocm() + else [None, torch.float8_e4m3fnuz] +) # one value large enough to test overflow in index calculation. # one value small enough to test the schema op check NUM_BLOCKS = [32768, 2048] @@ -42,7 +44,7 @@ def ref_paged_attn( for i in range(num_seqs): query_len = query_lens[i] kv_len = kv_lens[i] - q = query[start_idx:start_idx + query_len] + q = query[start_idx : start_idx + query_len] q *= scale num_kv_blocks = (kv_len + block_size - 1) // block_size @@ -60,10 +62,13 @@ def ref_paged_attn( empty_mask = torch.ones(query_len, kv_len) mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool() if sliding_window is not None: - sliding_window_mask = torch.triu(empty_mask, - diagonal=kv_len - - (query_len + sliding_window) + - 1).bool().logical_not() + sliding_window_mask = ( + torch.triu( + empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1 + ) + .bool() + .logical_not() + ) mask |= sliding_window_mask if soft_cap is not None and soft_cap > 0: attn = soft_cap * torch.tanh(attn / soft_cap) @@ -77,9 +82,9 @@ def ref_paged_attn( return torch.cat(outputs, dim=0) -@pytest.mark.parametrize("seq_lens", - [[(1, 1328), (5, 18), - (129, 463)], [(1, 523), (1, 37), (1, 2011)]]) +@pytest.mark.parametrize( + "seq_lens", [[(1, 1328), (5, 18), (129, 463)], [(1, 523), (1, 37), (1, 2011)]] +) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @@ -114,30 +119,23 @@ def test_triton_unified_attn( assert num_query_heads % num_kv_heads == 0 max_query_len = max(query_lens) max_kv_len = max(kv_lens) - window_size = ((sliding_window - 1, 0) if sliding_window is not None else - (-1, -1)) + window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1) scale = head_size**-0.5 - query = torch.randn(sum(query_lens), - num_query_heads, - head_size, - dtype=dtype) - key_cache = torch.randn(num_blocks, - block_size, - num_kv_heads, - head_size, - dtype=dtype) + query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype) + key_cache = torch.randn( + num_blocks, block_size, num_kv_heads, head_size, dtype=dtype + ) value_cache = torch.randn_like(key_cache) - cu_query_lens = torch.tensor([0] + query_lens, - dtype=torch.int32).cumsum(dim=0, - dtype=torch.int32) + 
cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum( + dim=0, dtype=torch.int32 + ) kv_lens = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - num_blocks, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) + block_tables = torch.randint( + 0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) output = torch.empty_like(query) @@ -191,5 +189,7 @@ def test_triton_unified_attn( atol, rtol = 1.5e-2, 1e-2 if q_dtype is not None: atol, rtol = 1.5e-1, 1.5e-1 - torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \ - f"{torch.max(torch.abs(output - ref_output))}" + ( + torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), + f"{torch.max(torch.abs(output - ref_output))}", + ) diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 29c5e70a8ba8..fdb1c8adfd6e 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -5,27 +5,30 @@ import pytest import torch - from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck -from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, - GeluAndMul, MulAndSilu, - NewGELU, QuickGELU, - SiluAndMul) + +from vllm.model_executor.layers.activation import ( + FastGELU, + FatreluAndMul, + GeluAndMul, + MulAndSilu, + NewGELU, + QuickGELU, + SiluAndMul, +) from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 13824] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] @pytest.mark.parametrize( - "activation", - ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"]) + "activation", ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"] +) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -67,7 +70,7 @@ def test_act_and_mul( torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) + output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) if activation == "fatrelu": opcheck(fn, (out, x, threshold)) @@ -75,9 +78,14 @@ def test_act_and_mul( opcheck(fn, (out, x)) -@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast), - (NewGELU, torch.ops._C.gelu_new), - (QuickGELU, torch.ops._C.gelu_quick)]) +@pytest.mark.parametrize( + "activation", + [ + (FastGELU, torch.ops._C.gelu_fast), + (NewGELU, torch.ops._C.gelu_new), + (QuickGELU, torch.ops._C.gelu_quick), + ], +) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -99,10 +107,9 @@ def test_activation( fn = activation[1] out = layer(x) ref_out = layer.forward_native(x) - torch.testing.assert_close(out, - ref_out, - atol=get_default_atol(out), - rtol=get_default_rtol(out)) + torch.testing.assert_close( + out, ref_out, atol=get_default_atol(out), rtol=get_default_rtol(out) + ) out = torch.empty_like(x) opcheck(fn, (out, x)) diff --git a/tests/kernels/core/test_fused_quant_layernorm.py 
b/tests/kernels/core/test_fused_quant_layernorm.py index 19703b8a2f97..60467f696693 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -5,9 +5,9 @@ import pytest import torch +from tests.kernels.utils import opcheck import vllm._custom_ops as ops -from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm DTYPES = [torch.bfloat16, torch.float] @@ -24,9 +24,7 @@ ADD_RESIDUAL = [False, True] SCALE_UBS = [True, False] SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] EPS = 1e-6 @@ -34,13 +32,12 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: - return torch.as_tensor(x, dtype=torch.float32, device='cuda') + return torch.as_tensor(x, dtype=torch.float32, device="cuda") -def ref_rms_norm(rms_norm_layer: RMSNorm, - x: torch.Tensor, - residual: Optional[torch.Tensor]) \ - -> tuple[torch.Tensor, Optional[torch.Tensor]]: +def ref_rms_norm( + rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor] +) -> tuple[torch.Tensor, Optional[torch.Tensor]]: if residual is not None: residual = residual.clone() out, residual = rms_norm_layer.forward_native(x, residual) @@ -50,12 +47,13 @@ def ref_rms_norm(rms_norm_layer: RMSNorm, return out, residual -def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm, - x: torch.Tensor, - quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor]) \ - -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +def ref_dynamic_per_token_quant( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + quant_dtype: torch.dtype, + residual: Optional[torch.Tensor], + scale_ub: Optional[torch.Tensor], +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if scale_ub is not None: assert quant_dtype == torch.float8_e4m3fn @@ -64,9 +62,9 @@ def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm, # Quant if quant_dtype == torch.float8_e4m3fn: - torch_out, scales = ops.scaled_fp8_quant(torch_out, - scale_ub=scale_ub, - use_per_token_if_dynamic=True) + torch_out, scales = ops.scaled_fp8_quant( + torch_out, scale_ub=scale_ub, use_per_token_if_dynamic=True + ) else: assert quant_dtype == torch.int8 torch_out, scales = ops.scaled_int8_quant(torch_out) @@ -74,38 +72,41 @@ def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm, return torch_out, scales, residual -def ref_impl(rms_norm_layer: RMSNorm, - x: torch.Tensor, - quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor]) \ - -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - return ref_dynamic_per_token_quant(rms_norm_layer, x, quant_dtype, - residual, scale_ub) +def ref_impl( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + quant_dtype: torch.dtype, + residual: Optional[torch.Tensor], + scale_ub: Optional[torch.Tensor], +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + return ref_dynamic_per_token_quant( + rms_norm_layer, x, quant_dtype, residual, scale_ub + ) -def ops_dynamic_per_token_quant(weight: torch.Tensor, - x: torch.Tensor, - quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor]) \ - -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +def ops_dynamic_per_token_quant( + weight: torch.Tensor, + x: torch.Tensor, + quant_dtype: torch.dtype, + residual: 
Optional[torch.Tensor], + scale_ub: Optional[torch.Tensor], +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if residual is not None: residual = residual.clone() - out, scales = ops.rms_norm_dynamic_per_token_quant(x, weight, EPS, - quant_dtype, scale_ub, - residual) + out, scales = ops.rms_norm_dynamic_per_token_quant( + x, weight, EPS, quant_dtype, scale_ub, residual + ) return out, scales, residual -def ops_impl(weight: torch.Tensor, - x: torch.Tensor, - quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor]) \ - -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, - scale_ub) +def ops_impl( + weight: torch.Tensor, + x: torch.Tensor, + quant_dtype: torch.dtype, + residual: Optional[torch.Tensor], + scale_ub: Optional[torch.Tensor], +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub) @pytest.mark.parametrize("num_tokens, hidden_size", NUM_TOKENS_HIDDEN_SIZES) @@ -146,12 +147,14 @@ def test_rms_norm( residual = torch.randn_like(x) * scale if add_residual else None if scale_ub is not None: rms_x, _ = ref_rms_norm(layer, x, residual) - scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device='cuda') + scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device="cuda") - ref_out, ref_scales, ref_residual = \ - ref_impl(layer, x, quant_dtype, residual, scale_ub) - ops_out, ops_scales, ops_residual = \ - ops_impl(layer.weight, x, quant_dtype, residual, scale_ub) + ref_out, ref_scales, ref_residual = ref_impl( + layer, x, quant_dtype, residual, scale_ub + ) + ops_out, ops_scales, ops_residual = ops_impl( + layer.weight, x, quant_dtype, residual, scale_ub + ) assert ref_out.dtype == quant_dtype assert ops_out.dtype == quant_dtype @@ -160,15 +163,18 @@ def test_rms_norm( # big atol to account for round-off errors. 
assert torch.allclose(ref_out, ops_out, atol=1) else: - assert torch.allclose(ref_out.to(dtype=torch.float32), - ops_out.to(dtype=torch.float32)) + assert torch.allclose( + ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32) + ) if add_residual: assert torch.allclose(ref_residual, ops_residual) output = torch.empty_like(x, dtype=quant_dtype) - scales = torch.empty((x.numel() // x.shape[-1], 1), - device=x.device, - dtype=torch.float32) - - opcheck(torch.ops._C.rms_norm_dynamic_per_token_quant, - (output, x, layer.weight, scales, 1e-5, scale_ub, residual)) + scales = torch.empty( + (x.numel() // x.shape[-1], 1), device=x.device, dtype=torch.float32 + ) + + opcheck( + torch.ops._C.rms_norm_dynamic_per_token_quant, + (output, x, layer.weight, scales, 1e-5, scale_ub, residual), + ) diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index 3eac062738f8..37116060bd1a 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -3,21 +3,30 @@ import pytest import torch - from tests.kernels.quant_utils import FP8_DTYPE from tests.kernels.utils import opcheck + from vllm.model_executor.layers.layernorm import RMSNorm from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing -HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, - 8199] # Arbitrary values for testing +HIDDEN_SIZES = [ + 8, + 768, + 769, + 770, + 771, + 5120, + 5124, + 5125, + 5126, + 8192, + 8199, +] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -58,11 +67,14 @@ def test_rms_norm( torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) if residual is not None: - opcheck(torch.ops._C.fused_add_rms_norm, - (x, residual, layer.weight.data, layer.variance_epsilon)) + opcheck( + torch.ops._C.fused_add_rms_norm, + (x, residual, layer.weight.data, layer.variance_epsilon), + ) else: - opcheck(torch.ops._C.rms_norm, - (out, x, layer.weight.data, layer.variance_epsilon)) + opcheck( + torch.ops._C.rms_norm, (out, x, layer.weight.data, layer.variance_epsilon) + ) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -102,36 +114,38 @@ def test_fused_rms_norm_quant( if add_residual: torch.ops._C.fused_add_rms_norm_static_fp8_quant( - out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6) + out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6 + ) # Unfused kernel is in-place so it goes second # Also use a separate clone of x to avoid modifying the input x_unfused = x.clone() torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6) - torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused, - quant_scale_t) + torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused, quant_scale_t) torch.cuda.synchronize() - torch.testing.assert_close(residual_fused, - residual, - atol=1e-2, - rtol=1e-2) + torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2) opcheck( torch.ops._C.fused_add_rms_norm_static_fp8_quant, - (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6)) + (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6), + ) else: - torch.ops._C.rms_norm_static_fp8_quant(out_quant_fused, x, weight, - quant_scale_t, 1e-6) + 
torch.ops._C.rms_norm_static_fp8_quant( + out_quant_fused, x, weight, quant_scale_t, 1e-6 + ) torch.ops._C.rms_norm(out_norm, x, weight, 1e-6) - torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, - quant_scale_t) - - opcheck(torch.ops._C.rms_norm_static_fp8_quant, - (out_quant_fused, x, weight, quant_scale_t, 1e-6)) + torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, quant_scale_t) - torch.testing.assert_close(out_quant_fused.to(dtype=torch.float32), - out_quant.to(dtype=torch.float32), - atol=1e-3, - rtol=1e-3) + opcheck( + torch.ops._C.rms_norm_static_fp8_quant, + (out_quant_fused, x, weight, quant_scale_t, 1e-6), + ) + + torch.testing.assert_close( + out_quant_fused.to(dtype=torch.float32), + out_quant.to(dtype=torch.float32), + atol=1e-3, + rtol=1e-3, + ) diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py index 40ced08b933a..6d52669a7a25 100644 --- a/tests/kernels/core/test_opcheck.py +++ b/tests/kernels/core/test_opcheck.py @@ -5,7 +5,6 @@ """ import torch - from tests.kernels.utils import opcheck diff --git a/tests/kernels/core/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py index e18f6230dbce..1470301ee1d4 100644 --- a/tests/kernels/core/test_permute_cols.py +++ b/tests/kernels/core/test_permute_cols.py @@ -3,16 +3,16 @@ import pytest import torch - from tests.kernels.utils import opcheck + from vllm._custom_ops import permute_cols -@pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)]) -@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16]) +@pytest.mark.parametrize("shape", [(1, 512), (544, 4096), (67, 8192)]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) def test_permute_cols(shape, dtype): x = torch.randn(shape, dtype=dtype).cuda() perm = torch.randperm(x.shape[1]).to(torch.int).cuda() opcheck(torch.ops._C.permute_cols, (x, perm)) y = permute_cols(x, perm) - torch.testing.assert_close(y, x[:, perm]) \ No newline at end of file + torch.testing.assert_close(y, x[:, perm]) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index ab6f1ccf881f..d2643c23cc17 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -6,8 +6,8 @@ import pytest import torch - from tests.kernels.allclose_default import get_default_atol, get_default_rtol + from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform @@ -19,30 +19,33 @@ BATCH_SIZES = [5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] USE_KEY = [True, False] -def _get_flat_tensor_shape(batch_size: int, seq_len: int, num_heads: int, - head_size: int) -> tuple[int, ...]: +def _get_flat_tensor_shape( + batch_size: int, seq_len: int, num_heads: int, head_size: int +) -> tuple[int, ...]: return (batch_size, seq_len, num_heads * head_size) # For testing sliced tensors -def _get_padded_tensor_shape(batch_size: int, seq_len: int, num_heads: int, - head_size: int) -> tuple[int, ...]: +def _get_padded_tensor_shape( + batch_size: int, seq_len: int, num_heads: int, head_size: int +) -> tuple[int, ...]: return (batch_size, seq_len, num_heads, head_size + 64) -def _get_batch_tensor_shape(batch_size: int, seq_len: int, num_heads: int, - head_size: int) -> tuple[int, ...]: +def 
_get_batch_tensor_shape( + batch_size: int, seq_len: int, num_heads: int, head_size: int +) -> tuple[int, ...]: return (batch_size, seq_len, num_heads, head_size) TENSORS_SHAPES_FN = [ - _get_batch_tensor_shape, _get_flat_tensor_shape, _get_padded_tensor_shape + _get_batch_tensor_shape, + _get_flat_tensor_shape, + _get_padded_tensor_shape, ] @@ -97,18 +100,21 @@ def test_rotary_embedding( ref_query, ref_key = rope.forward_native(positions, query, key) out_query, out_key = rope.forward(positions, query, key) # Compare the results. - torch.testing.assert_close(out_query, - ref_query, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) + torch.testing.assert_close( + out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query), + ) if use_key: - torch.testing.assert_close(out_key, - ref_key, - atol=get_default_atol(out_key), - rtol=get_default_rtol(out_key)) + torch.testing.assert_close( + out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key), + ) else: - assert ref_key is None and out_key is None, \ - "expected returned key to be None" + assert ref_key is None and out_key is None, "expected returned key to be None" @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -142,10 +148,14 @@ def test_batched_rotary_embedding( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "rope_type": "linear", - "factor": (1, ) - }) + rope = get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + {"rope_type": "linear", "factor": (1,)}, + ) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) @@ -160,25 +170,28 @@ def test_batched_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) - out_query, out_key = rope.forward(positions, - query, - key, - offsets=torch.zeros(batch_size * seq_len, - dtype=torch.long, - device=device)) + out_query, out_key = rope.forward( + positions, + query, + key, + offsets=torch.zeros(batch_size * seq_len, dtype=torch.long, device=device), + ) # Compare the results. 
- torch.testing.assert_close(out_query, - ref_query, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) + torch.testing.assert_close( + out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query), + ) if use_key: - torch.testing.assert_close(out_key, - ref_key, - atol=get_default_atol(out_key), - rtol=get_default_rtol(out_key)) + torch.testing.assert_close( + out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key), + ) else: - assert ref_key is None and out_key is None, \ - "expected returned key to be None" + assert ref_key is None and out_key is None, "expected returned key to be None" @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -211,72 +224,98 @@ def test_batched_rotary_embedding_multi_lora( if rotary_dim is None: rotary_dim = head_size scaling_factors: list[int] = [1, 2, 4] - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "rope_type": "linear", - "factor": tuple(scaling_factors) - }) + rope = get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + {"rope_type": "linear", "factor": tuple(scaling_factors)}, + ) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=dtype) + query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype) key = torch.randn_like(query) if use_key else None offset_map = torch.tensor( list( - accumulate([0] + [ - max_position * scaling_factor * 2 - for scaling_factor in scaling_factors[:-1] - ]))) - query_types = torch.randint(0, - len(scaling_factors), (batch_size, seq_len), - device=device) + accumulate( + [0] + + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ] + ) + ) + ) + query_types = torch.randint( + 0, len(scaling_factors), (batch_size, seq_len), device=device + ) query_offsets = offset_map[query_types] # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. - ref_query, ref_key = rope.forward_native(positions, query, key, - query_offsets) - out_query, out_key = rope.forward(positions, query, key, - query_offsets.flatten()) + ref_query, ref_key = rope.forward_native(positions, query, key, query_offsets) + out_query, out_key = rope.forward(positions, query, key, query_offsets.flatten()) # Compare the results. 
- torch.testing.assert_close(out_query, - ref_query, - atol=get_default_atol(out_query), - rtol=get_default_rtol(out_query)) + torch.testing.assert_close( + out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query), + ) if use_key: - torch.testing.assert_close(out_key, - ref_key, - atol=get_default_atol(out_key), - rtol=get_default_rtol(out_key)) + torch.testing.assert_close( + out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key), + ) else: - assert ref_key is None and out_key is None, \ - "expected returned key to be None" + assert ref_key is None and out_key is None, "expected returned key to be None" @torch.inference_mode() def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] BASES = [10000, 1000000] - ROPE_SCALINGS = (None, { - "rope_type": "linear", - "factor": (1, ) - }, { - "rope_type": "dynamic", - "factor": 1 - }) - settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, - ROPE_SCALINGS, DTYPES) + ROPE_SCALINGS = ( + None, + {"rope_type": "linear", "factor": (1,)}, + {"rope_type": "dynamic", "factor": 1}, + ) + settings = ( + HEAD_SIZES, + ROTARY_DIMS, + MAX_POSITIONS, + BASES, + IS_NEOX_STYLE, + ROPE_SCALINGS, + DTYPES, + ) rope_setting_id_map: dict[str, int] = {} for setting in product(*settings): - head_size, rotary_dim, max_position, base, \ - is_neox_stype, rope_scaling, dtype = setting + ( + head_size, + rotary_dim, + max_position, + base, + is_neox_stype, + rope_scaling, + dtype, + ) = setting if rotary_dim is None: rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, - is_neox_stype, rope_scaling, dtype) + rope = get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_stype, + rope_scaling, + dtype, + ) # different settings cannot share the same rope module assert id(rope) not in rope_setting_id_map.values() assert all(x.dtype == dtype for x in rope.buffers()) @@ -284,11 +323,25 @@ def test_rope_module_cache(): rope_setting_id_map[str(setting)] = id(rope) for setting in product(*settings): - head_size, rotary_dim, max_position, base, \ - is_neox_stype, rope_scaling, dtype = setting + ( + head_size, + rotary_dim, + max_position, + base, + is_neox_stype, + rope_scaling, + dtype, + ) = setting if rotary_dim is None: rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, - is_neox_stype, rope_scaling, dtype) + rope = get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_stype, + rope_scaling, + dtype, + ) # check if cache take effect assert id(rope) == rope_setting_id_map[str(setting)] diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index d1fd960bf115..4a46cc6dc6b9 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -8,28 +8,41 @@ import pytest import torch - from tests.kernels.utils import opcheck + from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding -def rotary_embedding_opcheck(rot, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None): +def rotary_embedding_opcheck( + rot, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, +): cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype) # ops.rotary_embedding()/batched_rotary_embedding() # are in-place operations that update the query and key 
tensors. if offsets is not None: - opcheck(torch.ops._C.batched_rotary_embedding, - (positions, query, key, rot.head_size, cos_sin_cache, - rot.is_neox_style, rot.rotary_dim, offsets)) + opcheck( + torch.ops._C.batched_rotary_embedding, + ( + positions, + query, + key, + rot.head_size, + cos_sin_cache, + rot.is_neox_style, + rot.rotary_dim, + offsets, + ), + ) else: - opcheck(torch.ops._C.rotary_embedding, - (positions, query, key, rot.head_size, cos_sin_cache, - rot.is_neox_style)) + opcheck( + torch.ops._C.rotary_embedding, + (positions, query, key, rot.head_size, cos_sin_cache, rot.is_neox_style), + ) @pytest.mark.parametrize("device", ["cuda"]) @@ -40,39 +53,44 @@ def rotary_embedding_opcheck(rot, @pytest.mark.parametrize("seq_len", [11, 1024]) @pytest.mark.parametrize("use_key", [True, False]) @pytest.mark.parametrize("head_stride_is_contiguous", [True, False]) -def test_rotary_embedding_opcheck(dist_init, device, max_position, - is_neox_style, rotary_dim, head_size, - seq_len, use_key, head_stride_is_contiguous): +def test_rotary_embedding_opcheck( + dist_init, + device, + max_position, + is_neox_style, + rotary_dim, + head_size, + seq_len, + use_key, + head_stride_is_contiguous, +): batch_size = 1 base = 10000 num_heads = 7 - rot = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style, torch.float32) + rot = RotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, torch.float32 + ) - positions = torch.randint(0, - max_position, (batch_size, seq_len), - device=device) + positions = torch.randint(0, max_position, (batch_size, seq_len), device=device) head_stride = head_size + (64 if head_stride_is_contiguous else 0) - query = torch.randn(batch_size, - seq_len, - num_heads, - head_stride, - dtype=torch.float32, - device=device) + query = torch.randn( + batch_size, seq_len, num_heads, head_stride, dtype=torch.float32, device=device + ) key = torch.randn_like(query) if use_key else None query = query[..., :head_size] key = key[..., :head_size] if use_key else None rotary_embedding_opcheck(rot, positions, query, key) - offsets = torch.zeros(batch_size * seq_len, - device=device, - dtype=torch.long) + offsets = torch.zeros(batch_size * seq_len, device=device, dtype=torch.long) rotary_embedding_opcheck(rot, positions, query, key, offsets) # if we have a contiguous head stride, test the alternate # [..., num_heads * head_dim] shape/layout if head_stride_is_contiguous: rotary_embedding_opcheck( - rot, positions, query.flatten(start_dim=-2), - key.flatten(start_dim=-2) if use_key else None) + rot, + positions, + query.flatten(start_dim=-2), + key.flatten(start_dim=-2) if use_key else None, + ) diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py index c71215e4c646..73738175e5c7 100644 --- a/tests/kernels/core/test_uva.py +++ b/tests/kernels/core/test_uva.py @@ -5,20 +5,14 @@ from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] @pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.") @pytest.mark.parametrize("device", CUDA_DEVICES) def test_cpu_write(device): torch.set_default_device(device) - cpu_tensor = torch.zeros(10, - 10, - device="cpu", - pin_memory=True, - dtype=torch.int32) + cpu_tensor = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32) cuda_view = 
get_cuda_view_from_cpu_tensor(cpu_tensor) assert cuda_view.device.type == "cuda" @@ -40,11 +34,7 @@ def test_cpu_write(device): @pytest.mark.parametrize("device", CUDA_DEVICES) def test_gpu_write(device): torch.set_default_device(device) - cpu_tensor = torch.zeros(10, - 10, - device="cpu", - pin_memory=True, - dtype=torch.int32) + cpu_tensor = torch.zeros(10, 10, device="cpu", pin_memory=True, dtype=torch.int32) cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor) assert cuda_view.device.type == "cuda" @@ -59,4 +49,4 @@ def test_gpu_write(device): assert cpu_tensor[0, 0] == 2 assert cpu_tensor[2, 3] == 4 - assert cpu_tensor[4, 5] == -2 \ No newline at end of file + assert cpu_tensor[4, 5] == -2 diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py index 411bd9e904b0..f5bac4f1ac12 100644 --- a/tests/kernels/mamba/test_causal_conv1d.py +++ b/tests/kernels/mamba/test_causal_conv1d.py @@ -10,7 +10,9 @@ from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( - causal_conv1d_fn, causal_conv1d_update) + causal_conv1d_fn, + causal_conv1d_update, +) from vllm.platforms import current_platform @@ -39,18 +41,15 @@ def causal_conv1d_ref( seqlen = x.shape[-1] dim, width = weight.shape if initial_states is None: - out = F.conv1d(x, - weight.unsqueeze(1), - bias, - padding=width - 1, - groups=dim) + out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim) else: x = torch.cat([initial_states, x], dim=-1) out = F.conv1d(x, weight.unsqueeze(1), bias, padding=0, groups=dim) out = out[..., :seqlen] if return_final_states: final_states = F.pad(x, (width - 1 - x.shape[-1], 0)).to( - dtype_in) # (batch, dim, width - 1) + dtype_in + ) # (batch, dim, width - 1) if final_states_out is not None: final_states_out.copy_(final_states) else: @@ -59,12 +58,9 @@ def causal_conv1d_ref( return (out, None) if not return_final_states else (out, final_states_out) -def causal_conv1d_update_ref(x, - conv_state, - weight, - bias=None, - activation=None, - cache_seqlens=None): +def causal_conv1d_update_ref( + x, conv_state, weight, bias=None, activation=None, cache_seqlens=None +): """ x: (batch, dim) or (batch, dim, seqlen) conv_state: (batch, dim, state_len), where state_len >= width - 1 @@ -91,24 +87,25 @@ def causal_conv1d_update_ref(x, assert weight.shape == (dim, width) if cache_seqlens is None: x_new = torch.cat([conv_state, x], dim=-1).to( - weight.dtype) # (batch, dim, state_len + seqlen) + weight.dtype + ) # (batch, dim, state_len + seqlen) conv_state.copy_(x_new[:, :, -state_len:]) else: width_idx = torch.arange( - -(width - 1), 0, dtype=torch.long, - device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1) - width_idx = torch.remainder(width_idx, state_len).unsqueeze(1).expand( - -1, dim, -1) - x_new = torch.cat([conv_state.gather(2, width_idx), x], - dim=-1).to(weight.dtype) - copy_idx = torch.arange( - seqlen, dtype=torch.long, - device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1) - copy_idx = torch.remainder(copy_idx, - state_len).unsqueeze(1).expand(-1, dim, -1) + -(width - 1), 0, dtype=torch.long, device=x.device + ).unsqueeze(0) + cache_seqlens.unsqueeze(1) + width_idx = ( + torch.remainder(width_idx, state_len).unsqueeze(1).expand(-1, dim, -1) + ) + x_new = torch.cat([conv_state.gather(2, width_idx), x], dim=-1).to(weight.dtype) + copy_idx = torch.arange(seqlen, dtype=torch.long, device=x.device).unsqueeze( + 0 + ) + cache_seqlens.unsqueeze(1) + copy_idx = 
torch.remainder(copy_idx, state_len).unsqueeze(1).expand(-1, dim, -1) conv_state.scatter_(2, copy_idx, x) - out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0, - groups=dim)[:, :, -seqlen:] + out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0, groups=dim)[ + :, :, -seqlen: + ] if unsqueeze: out = out.squeeze(-1) return (out if activation is None else F.silu(out)).to(dtype=dtype_in) @@ -117,15 +114,17 @@ def causal_conv1d_update_ref(x, @pytest.mark.parametrize("itype", [torch.bfloat16, torch.float]) @pytest.mark.parametrize("silu_activation", [True]) @pytest.mark.parametrize("has_bias", [True]) -def causal_conv1d_opcheck_fn(x: torch.Tensor, - weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, - cu_seq_len: Optional[torch.Tensor] = None, - cache_indices: Optional[torch.Tensor] = None, - has_initial_state: Optional[torch.Tensor] = None, - conv_states: Optional[torch.Tensor] = None, - activation: Optional[str] = "silu", - pad_slot_id: int = PAD_SLOT_ID): +def causal_conv1d_opcheck_fn( + x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + cu_seq_len: Optional[torch.Tensor] = None, + cache_indices: Optional[torch.Tensor] = None, + has_initial_state: Optional[torch.Tensor] = None, + conv_states: Optional[torch.Tensor] = None, + activation: Optional[str] = "silu", + pad_slot_id: int = PAD_SLOT_ID, +): """ x: (batch, dim, seqlen) weight: (dim, width) @@ -150,8 +149,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor, @pytest.mark.parametrize("seqlen", [1]) @pytest.mark.parametrize("width", [4]) @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) -def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, - itype): +def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, itype): device = "cuda" rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3) if itype == torch.bfloat16: @@ -167,23 +165,16 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None conv_state_ref = conv_state.detach().clone() activation = None if not silu_activation else "silu" - out = causal_conv1d_update(x, - conv_state, - weight, - bias, - activation=activation) - out_ref = causal_conv1d_update_ref(x_ref, - conv_state_ref, - weight, - bias, - activation=activation) + out = causal_conv1d_update(x, conv_state, weight, bias, activation=activation) + out_ref = causal_conv1d_update_ref( + x_ref, conv_state_ref, weight, bias, activation=activation + ) assert torch.equal(conv_state, conv_state_ref) assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) -@pytest.mark.parametrize("itype", - [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("silu_activation", [False, True]) @pytest.mark.parametrize("has_bias", [False, True]) @pytest.mark.parametrize("seqlen", [1, 3]) @@ -192,9 +183,9 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, # tests correctness in case subset of the sequences are padded @pytest.mark.parametrize("with_padding", [True, False]) @pytest.mark.parametrize("batch_size", [3]) -def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim, - width, seqlen, has_bias, - silu_activation, itype): +def test_causal_conv1d_update_with_batch_gather( + batch_size, with_padding, dim, width, seqlen, has_bias, silu_activation, itype +): device = "cuda" rtol, atol = (3e-4, 
1e-3) if itype == torch.float32 else (3e-3, 5e-3) if itype == torch.bfloat16: @@ -209,31 +200,30 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim, total_entries = 10 * batch_size # x will be (batch, dim, seqlen) with contiguous along dim-axis - x = torch.randn(padded_batch_size, seqlen, dim, device=device, - dtype=itype).transpose(1, 2) + x = torch.randn( + padded_batch_size, seqlen, dim, device=device, dtype=itype + ).transpose(1, 2) x_ref = x.clone() conv_state_indices = torch.randperm(total_entries)[:batch_size].to( - dtype=torch.int32, device=device) - unused_states_bool = torch.ones(total_entries, - dtype=torch.bool, - device=device) + dtype=torch.int32, device=device + ) + unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device) unused_states_bool[conv_state_indices] = False - padded_state_indices = torch.concat([ - conv_state_indices, - torch.as_tensor( - [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device) - ], - dim=0) + padded_state_indices = torch.concat( + [ + conv_state_indices, + torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device), + ], + dim=0, + ) # conv_state will be (cache_lines, dim, state_len) # with contiguous along dim-axis - conv_state = torch.randn(total_entries, - width - 1, - dim, - device=device, - dtype=itype).transpose(1, 2) + conv_state = torch.randn( + total_entries, width - 1, dim, device=device, dtype=itype + ).transpose(1, 2) conv_state_for_padding_test = conv_state.clone() @@ -242,22 +232,23 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim, conv_state_ref = conv_state[conv_state_indices, :].detach().clone() activation = None if not silu_activation else "silu" - out = causal_conv1d_update(x, - conv_state, - weight, - bias, - activation=activation, - conv_state_indices=padded_state_indices, - pad_slot_id=PAD_SLOT_ID) - out_ref = causal_conv1d_update_ref(x_ref[:batch_size], - conv_state_ref, - weight, - bias, - activation=activation) + out = causal_conv1d_update( + x, + conv_state, + weight, + bias, + activation=activation, + conv_state_indices=padded_state_indices, + pad_slot_id=PAD_SLOT_ID, + ) + out_ref = causal_conv1d_update_ref( + x_ref[:batch_size], conv_state_ref, weight, bias, activation=activation + ) assert torch.equal(conv_state[conv_state_indices, :], conv_state_ref) - assert torch.equal(conv_state[unused_states_bool], - conv_state_for_padding_test[unused_states_bool]) + assert torch.equal( + conv_state[unused_states_bool], conv_state_for_padding_test[unused_states_bool] + ) assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol) @@ -265,12 +256,13 @@ def test_causal_conv1d_update_with_batch_gather(batch_size, with_padding, dim, @pytest.mark.parametrize("silu_activation", [True]) @pytest.mark.parametrize("has_bias", [True]) @pytest.mark.parametrize("width", [4]) -@pytest.mark.parametrize('seqlen', [8, 30, 249, 2049, 4096]) -@pytest.mark.parametrize('dim', [64, 4096]) -@pytest.mark.parametrize('with_padding', [True, False]) -@pytest.mark.parametrize('batch', [4, 10]) -def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width, - has_bias, silu_activation, itype): +@pytest.mark.parametrize("seqlen", [8, 30, 249, 2049, 4096]) +@pytest.mark.parametrize("dim", [64, 4096]) +@pytest.mark.parametrize("with_padding", [True, False]) +@pytest.mark.parametrize("batch", [4, 10]) +def test_causal_conv1d_varlen( + batch, with_padding, dim, seqlen, width, has_bias, silu_activation, itype +): device = "cuda" 
torch.cuda.empty_cache() rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3) @@ -288,19 +280,19 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width, seqlens.append( torch.diff( - torch.cat( - [torch.tensor([-1]), eos_pos, - torch.tensor([seqlen - 1])])).tolist()) + torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])]) + ).tolist() + ) assert sum(seqlens[-1]) == seqlen assert all(s > 0 for s in seqlens[-1]) total_entries = batch_size * 10 cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32) - cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], - dim=0) + cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0) x = rearrange( torch.randn(1, seqlen, 4096 + dim + 64, device=device, dtype=itype), - "b s d -> b d s")[:, 4096:4096 + dim, :] + "b s d -> b d s", + )[:, 4096 : 4096 + dim, :] weight = torch.randn(dim, width, device=device, dtype=itype) @@ -309,34 +301,34 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width, weight_ref = weight.clone() bias_ref = bias.clone() if bias is not None else None activation = None if not silu_activation else "silu" - final_states = torch.randn(total_entries, - width - 1, - dim, - device=x.device, - dtype=x.dtype).transpose(1, 2) + final_states = torch.randn( + total_entries, width - 1, dim, device=x.device, dtype=x.dtype + ).transpose(1, 2) final_states_ref = final_states.clone() - has_initial_states = torch.randint(0, - 2, (cumsum.shape[0] - 1, ), - dtype=torch.bool, - device=x.device) - state_indices = torch.randperm(total_entries, - dtype=torch.int32, - device=x.device)[:batch_size] - padded_state_indices = torch.concat([ - state_indices, - torch.as_tensor( - [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device), - ], - dim=-1) - out = causal_conv1d_fn(x.squeeze(0), - weight, - bias=bias, - conv_states=final_states, - query_start_loc=cumsum.cuda(), - cache_indices=padded_state_indices, - has_initial_state=has_initial_states, - activation=activation, - pad_slot_id=PAD_SLOT_ID) + has_initial_states = torch.randint( + 0, 2, (cumsum.shape[0] - 1,), dtype=torch.bool, device=x.device + ) + state_indices = torch.randperm(total_entries, dtype=torch.int32, device=x.device)[ + :batch_size + ] + padded_state_indices = torch.concat( + [ + state_indices, + torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device), + ], + dim=-1, + ) + out = causal_conv1d_fn( + x.squeeze(0), + weight, + bias=bias, + conv_states=final_states, + query_start_loc=cumsum.cuda(), + cache_indices=padded_state_indices, + has_initial_state=has_initial_states, + activation=activation, + pad_slot_id=PAD_SLOT_ID, + ) out_ref = [] out_ref_b = [] @@ -353,16 +345,20 @@ def test_causal_conv1d_varlen(batch, with_padding, dim, seqlen, width, bias_ref, activation=activation, return_final_states=True, - final_states_out=final_states_ref[ - padded_state_indices[i]].unsqueeze(0), - initial_states=final_states_ref[padded_state_indices[i]]. 
- unsqueeze(0) if has_initial_states[i] else None)) + final_states_out=final_states_ref[padded_state_indices[i]].unsqueeze(0), + initial_states=final_states_ref[padded_state_indices[i]].unsqueeze(0) + if has_initial_states[i] + else None, + ) + ) out_ref.append(torch.cat([t[0] for t in out_ref_b], dim=2)) out_ref_tensor = torch.cat(out_ref, dim=0) - assert torch.allclose(final_states[state_indices], - final_states_ref[state_indices], - rtol=rtol, - atol=atol) - unpadded_out = out[:, :out_ref_tensor.shape[-1]] + assert torch.allclose( + final_states[state_indices], + final_states_ref[state_indices], + rtol=rtol, + atol=atol, + ) + unpadded_out = out[:, : out_ref_tensor.shape[-1]] assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol) diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index f5c6a18614ff..e05592f5678e 100644 --- a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -5,10 +5,12 @@ import pytest import torch - from tests.utils import multi_gpu_test -from vllm.distributed.parallel_state import (init_distributed_environment, - initialize_model_parallel) + +from vllm.distributed.parallel_state import ( + init_distributed_environment, + initialize_model_parallel, +) from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated from vllm.platforms import current_platform from vllm.utils import update_environment_variables @@ -24,14 +26,15 @@ (64, 2), (64, 4), # hidden_size be divisible by num_gpus (100, 5), # and n_groups must divide hidden_size - ]) + ], +) @pytest.mark.parametrize("dtype", [torch.float16]) def test_mixer2_gated_norm_multi_gpu( batch_size: int, seq_len: int, hidden_size_n_groups: tuple[int, int], dtype: torch.dtype, - device: str = 'cuda', + device: str = "cuda", ): hidden_size, n_groups = hidden_size_n_groups num_processes = 2 @@ -39,17 +42,19 @@ def test_mixer2_gated_norm_multi_gpu( def run_torch_spawn(fn, nprocs): # need to use torch.mp.spawn otherwise will have problems with # torch.distributed and cuda - torch.multiprocessing.spawn(fn, - args=( - num_processes, - batch_size, - seq_len, - hidden_size, - n_groups, - dtype, - device, - ), - nprocs=nprocs) + torch.multiprocessing.spawn( + fn, + args=( + num_processes, + batch_size, + seq_len, + hidden_size, + n_groups, + dtype, + device, + ), + nprocs=nprocs, + ) run_torch_spawn(mixer2_gated_norm_tensor_parallel, 2) @@ -71,20 +76,22 @@ def mixer2_gated_norm_tensor_parallel( torch.set_default_device(device) torch.set_default_dtype(dtype) - update_environment_variables({ - 'RANK': str(local_rank), - 'LOCAL_RANK': str(local_rank), - 'WORLD_SIZE': str(world_size), - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': '12345', - }) + update_environment_variables( + { + "RANK": str(local_rank), + "LOCAL_RANK": str(local_rank), + "WORLD_SIZE": str(world_size), + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12345", + } + ) # initialize distributed init_distributed_environment() initialize_model_parallel(tensor_model_parallel_size=world_size) # create random weights an inputs - weight = torch.rand((hidden_size, ), dtype=dtype, device=device) + weight = torch.rand((hidden_size,), dtype=dtype, device=device) hidden_states = torch.randn(batch_size, seq_len, hidden_size) gate_states = torch.randn(batch_size, seq_len, hidden_size) @@ -97,14 +104,18 @@ def mixer2_gated_norm_tensor_parallel( # create gated-norm without TP to compute reference # - utilize mock patching to disable TP when - with (unittest.mock.patch( + 
with ( + unittest.mock.patch( "vllm.model_executor.layers.mamba.mamba_mixer2." "get_tensor_model_parallel_world_size", - return_value=1), - unittest.mock.patch( - "vllm.model_executor.layers.mamba.mamba_mixer2." - "get_tensor_model_parallel_rank", - return_value=0)): + return_value=1, + ), + unittest.mock.patch( + "vllm.model_executor.layers.mamba.mamba_mixer2." + "get_tensor_model_parallel_rank", + return_value=0, + ), + ): mixer_single_gpu = Mixer2RMSNormGated( full_hidden_size=hidden_size, full_n_groups=n_groups, @@ -115,11 +126,13 @@ def mixer2_gated_norm_tensor_parallel( # generate and compare N = hidden_size // world_size output = mixer( - hidden_states[..., local_rank * N:(local_rank + 1) * N], - gate_states[..., local_rank * N:(local_rank + 1) * N], + hidden_states[..., local_rank * N : (local_rank + 1) * N], + gate_states[..., local_rank * N : (local_rank + 1) * N], ) ref_output = mixer_single_gpu(hidden_states, gate_states) - torch.allclose(output, - ref_output[..., local_rank * N:(local_rank + 1) * N], - atol=1e-3, - rtol=1e-3) + torch.allclose( + output, + ref_output[..., local_rank * N : (local_rank + 1) * N], + atol=1e-3, + rtol=1e-3, + ) diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index 8dece26ddb29..a338c95c7c88 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -5,25 +5,20 @@ import torch import torch.nn.functional as F from einops import rearrange, repeat - from tests.kernels.utils import opcheck + from vllm import _custom_ops as ops # noqa: F401 from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( - selective_scan_fn, selective_state_update) + selective_scan_fn, + selective_state_update, +) from vllm.platforms import current_platform -def selective_state_update_ref(state, - x, - dt, - A, - B, - C, - D=None, - z=None, - dt_bias=None, - dt_softplus=False): +def selective_state_update_ref( + state, x, dt, A, B, C, D=None, z=None, dt_bias=None, dt_softplus=False +): """ Argument: state: (batch, dim, dstate) or (batch, nheads, dim, dstate) @@ -73,16 +68,17 @@ def selective_state_update_ref(state, assert dt_bias.shape == (nheads, dim) dt = dt + dt_bias dt = F.softplus(dt) if dt_softplus else dt - dA = torch.exp(rearrange(dt, "b h d -> b h d 1") * - A) # (batch, nheads, dim, dstate) - B = repeat(B, "b g n -> b (g h) n", - h=nheads // ngroups) # (batch, nheads, dstate) - C = repeat(C, "b g n -> b (g h) n", - h=nheads // ngroups) # (batch, nheads, dstate) + dA = torch.exp( + rearrange(dt, "b h d -> b h d 1") * A + ) # (batch, nheads, dim, dstate) + B = repeat(B, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate) + C = repeat(C, "b g n -> b (g h) n", h=nheads // ngroups) # (batch, nheads, dstate) dB = rearrange(dt, "b h d -> b h d 1") * rearrange( - B, "b h n -> b h 1 n") # (batch, nheads, dim, dstate) - state.copy_(state * dA + - dB * rearrange(x, "b h d -> b h d 1")) # (batch, dim, dstate + B, "b h n -> b h 1 n" + ) # (batch, nheads, dim, dstate) + state.copy_( + state * dA + dB * rearrange(x, "b h d -> b h d 1") + ) # (batch, dim, dstate out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C) if D is not None: out += (x * D).to(out.dtype) @@ -92,18 +88,20 @@ def selective_state_update_ref(state, return out -def selective_scan_ref(u, - delta, - A, - B, - C, - D=None, - z=None, - delta_bias=None, - delta_softplus=False, - return_last_state=False, - prev_state=None, - final_state_out=None): +def 
selective_scan_ref( + u, + delta, + A, + B, + C, + D=None, + z=None, + delta_bias=None, + delta_softplus=False, + return_last_state=False, + prev_state=None, + final_state_out=None, +): """ u: r(B D L) delta: r(B D L) @@ -132,26 +130,26 @@ def selective_scan_ref(u, C = C.float() x = A.new_zeros((batch, dim, dstate)) if prev_state is None else prev_state ys = [] - deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A)) + deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A)) if not is_variable_B: - deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u) + deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u) else: if B.dim() == 3: - deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u) + deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u) else: B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1]) - deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u) + deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u) if is_variable_C and C.dim() == 4: C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1]) for i in range(u.shape[2]): x = deltaA[:, :, i] * x + deltaB_u[:, :, i] if not is_variable_C: - y = torch.einsum('bdn,dn->bd', x, C) + y = torch.einsum("bdn,dn->bd", x, C) else: if C.dim() == 3: - y = torch.einsum('bdn,bn->bd', x, C[:, :, i]) + y = torch.einsum("bdn,bn->bd", x, C[:, :, i]) else: - y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i]) + y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i]) if i == u.shape[2] - 1: if final_state_out is None: final_state_out = x @@ -166,20 +164,22 @@ def selective_scan_ref(u, return out if not return_last_state else (out, final_state_out) -def selective_scan_opcheck_fn(u, - delta, - A, - B, - C, - D=None, - z=None, - delta_bias=None, - delta_softplus=False, - cu_seq_len=None, - cache_indices=None, - has_initial_state=None, - ssm_states=None, - pad_slot_id=PAD_SLOT_ID): +def selective_scan_opcheck_fn( + u, + delta, + A, + B, + C, + D=None, + z=None, + delta_bias=None, + delta_softplus=False, + cu_seq_len=None, + cache_indices=None, + has_initial_state=None, + ssm_states=None, + pad_slot_id=PAD_SLOT_ID, +): """if return_last_state is True, returns (out, last_state) last_state has shape (batch, dim, dstate). """ @@ -206,30 +206,55 @@ def selective_scan_opcheck_fn(u, # Disable test_autograd_registration for now as it seems to trigger # a bogus error. 
- opcheck(torch.ops._C.selective_scan_fwd, - (u, delta, A, B, C, D, z, delta_bias, delta_softplus, cu_seq_len, - cache_indices, has_initial_state, ssm_states, pad_slot_id), - test_utils=["test_schema", "test_faketensor"]) - - -@pytest.mark.parametrize('wtype', [torch.float32]) -@pytest.mark.parametrize('itype', - [torch.float32, torch.float16, torch.bfloat16]) -@pytest.mark.parametrize('seqlen', [128, 256, 512, 1024, 2048, 4096]) -@pytest.mark.parametrize('has_delta_bias', [True]) -@pytest.mark.parametrize('delta_softplus', [True]) -@pytest.mark.parametrize('has_z', [True]) -@pytest.mark.parametrize('has_D', [True]) + opcheck( + torch.ops._C.selective_scan_fwd, + ( + u, + delta, + A, + B, + C, + D, + z, + delta_bias, + delta_softplus, + cu_seq_len, + cache_indices, + has_initial_state, + ssm_states, + pad_slot_id, + ), + test_utils=["test_schema", "test_faketensor"], + ) + + +@pytest.mark.parametrize("wtype", [torch.float32]) +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("seqlen", [128, 256, 512, 1024, 2048, 4096]) +@pytest.mark.parametrize("has_delta_bias", [True]) +@pytest.mark.parametrize("delta_softplus", [True]) +@pytest.mark.parametrize("has_z", [True]) +@pytest.mark.parametrize("has_D", [True]) @pytest.mark.parametrize("varBC_groups", [1, 2]) @pytest.mark.parametrize("is_variable_C", [True]) @pytest.mark.parametrize("is_variable_B", [True]) @pytest.mark.parametrize("scan_chunks", [1, 2, 3]) -def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, - has_z, has_delta_bias, delta_softplus, seqlen, itype, - wtype, scan_chunks): +def test_selective_scan( + is_variable_B, + is_variable_C, + varBC_groups, + has_D, + has_z, + has_delta_bias, + delta_softplus, + seqlen, + itype, + wtype, + scan_chunks, +): if varBC_groups > 1 and (not is_variable_B or not is_variable_C): pytest.skip() # This config is not applicable - device = 'cuda' + device = "cuda" rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3) if itype == torch.bfloat16: rtol, atol = 3e-2, 5e-2 @@ -242,7 +267,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, batch_size = 1 dim = 4 dstate = 8 - A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)) + A = -0.5 * torch.rand(dim, dstate, device=device, dtype=wtype) A_ref = A.clone() if not is_variable_B: B_shape = [dim, dstate] @@ -250,9 +275,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, B_shape = [batch_size, dstate, seqlen] else: B_shape = [batch_size, varBC_groups, dstate, seqlen] - B = torch.randn(B_shape, - device=device, - dtype=wtype if not is_variable_B else itype) + B = torch.randn(B_shape, device=device, dtype=wtype if not is_variable_B else itype) B_ref = B.clone() if not is_variable_C: C_shape = [dim, dstate] @@ -260,27 +283,27 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, C_shape = [batch_size, dstate, seqlen] else: C_shape = [batch_size, varBC_groups, dstate, seqlen] - C = torch.randn(C_shape, - device=device, - dtype=wtype if not is_variable_C else itype) + C = torch.randn(C_shape, device=device, dtype=wtype if not is_variable_C else itype) C_ref = C.clone() D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None D_ref = D.clone() - z = torch.randn(batch_size, dim, seqlen, device=device, - dtype=itype) if has_z else None + z = ( + torch.randn(batch_size, dim, seqlen, device=device, dtype=itype) + if has_z + else None + ) z_ref = z.clone() if has_z 
else None - delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32) - ) if has_delta_bias else None + delta_bias = ( + (0.5 * torch.rand(dim, device=device, dtype=torch.float32)) + if has_delta_bias + else None + ) u = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype) u_ref = u.clone() - delta = (0.5 * - torch.rand(batch_size, dim, seqlen, device=device, dtype=itype)) + delta = 0.5 * torch.rand(batch_size, dim, seqlen, device=device, dtype=itype) delta_ref = delta.clone() state_shape = (batch_size, u.shape[1], int(A.shape[1])) - state = torch.randn(state_shape, - device=u.device, - dtype=itype, - requires_grad=False) + state = torch.randn(state_shape, device=u.device, dtype=itype, requires_grad=False) state_ref = state.clone() out = None out_ref = None @@ -312,9 +335,10 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, z=_z, delta_bias=delta_bias, delta_softplus=delta_softplus, - has_initial_state=torch.ones(batch_size, - device=u.device, - dtype=torch.bool) if c > 0 else None) + has_initial_state=torch.ones(batch_size, device=u.device, dtype=torch.bool) + if c > 0 + else None, + ) outs.append(out) if len(outs) > 1: out = torch.cat(outs, dim=-1) @@ -329,27 +353,29 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, z=z_ref, delta_bias=delta_bias, delta_softplus=delta_softplus, - return_last_state=True) + return_last_state=True, + ) assert out is not None and out_ref is not None assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) assert state is not None and state_ref is not None assert torch.allclose(state, state_ref.to(itype), rtol=rtol, atol=atol) - selective_scan_opcheck_fn(u, - delta, - A, - B, - C, - D, - z, - delta_bias=delta_bias, - delta_softplus=delta_softplus, - ssm_states=state) + selective_scan_opcheck_fn( + u, + delta, + A, + B, + C, + D, + z, + delta_bias=delta_bias, + delta_softplus=delta_softplus, + ssm_states=state, + ) -@pytest.mark.parametrize("itype", - [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("has_z", [False, True]) @pytest.mark.parametrize("dstate", [16, 32, 64]) @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) @@ -373,51 +399,47 @@ def test_selective_state_update(dim, dstate, has_z, itype): D = torch.randn(dim, device=device) z = torch.randn_like(x) if has_z else None state_ref = state.detach().clone() - out = selective_state_update(state, - x, - dt, - A, - B, - C, - D=D, - z=z, - dt_bias=dt_bias, - dt_softplus=True) - out_ref = selective_state_update_ref(state_ref, - x, - dt, - A, - B, - C, - D=D, - z=z, - dt_bias=dt_bias, - dt_softplus=True) + out = selective_state_update( + state, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True + ) + out_ref = selective_state_update_ref( + state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True + ) assert torch.allclose(state, state_ref, rtol=rtol, atol=atol) assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) -@pytest.mark.parametrize('wtype', [torch.float32]) -@pytest.mark.parametrize('itype', [torch.float32]) -@pytest.mark.parametrize('seqlen', [1, 128, 129, 256, 512, 1024, 2048, 4096]) +@pytest.mark.parametrize("wtype", [torch.float32]) +@pytest.mark.parametrize("itype", [torch.float32]) +@pytest.mark.parametrize("seqlen", [1, 128, 129, 256, 512, 1024, 2048, 4096]) @pytest.mark.parametrize("return_last_state", [True]) -@pytest.mark.parametrize('has_delta_bias', [True]) 
-@pytest.mark.parametrize('delta_softplus', [True]) -@pytest.mark.parametrize('has_z', [True]) -@pytest.mark.parametrize('has_D', [True]) +@pytest.mark.parametrize("has_delta_bias", [True]) +@pytest.mark.parametrize("delta_softplus", [True]) +@pytest.mark.parametrize("has_z", [True]) +@pytest.mark.parametrize("has_D", [True]) @pytest.mark.parametrize("varBC_groups", [1, 2]) @pytest.mark.parametrize("is_variable_C", [True]) @pytest.mark.parametrize("is_variable_B", [True]) # tests correctness in case subset of the sequences are padded @pytest.mark.parametrize("with_padding", [False, True]) -def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, - varBC_groups, has_D, has_z, has_delta_bias, - delta_softplus, return_last_state, seqlen, - itype, wtype): +def test_selective_scan_varlen( + with_padding, + is_variable_B, + is_variable_C, + varBC_groups, + has_D, + has_z, + has_delta_bias, + delta_softplus, + return_last_state, + seqlen, + itype, + wtype, +): if varBC_groups > 1 and (not is_variable_B or not is_variable_C): pytest.skip() # This config is not applicable - device = 'cuda' + device = "cuda" rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3) if itype == torch.bfloat16: rtol, atol = 3e-2, 5e-2 @@ -441,72 +463,79 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values seqlens.append( torch.diff( - torch.cat( - [torch.tensor([-1]), eos_pos, - torch.tensor([seqlen - 1])])).tolist()) + torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])]) + ).tolist() + ) assert sum(seqlens[-1]) == seqlen assert all(s > 0 for s in seqlens[-1]) total_entries = batch_size * 10 cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32) - cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], - dim=0).cuda() + cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0).cuda() dim = 4 dstate = 8 - A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype)) + A = -0.5 * torch.rand(dim, dstate, device=device, dtype=wtype) A_ref = A.clone() B_shape = [varBC_groups, dstate, seqlen] - B = torch.randn(B_shape, - device=device, - dtype=wtype if not is_variable_B else itype) + B = torch.randn(B_shape, device=device, dtype=wtype if not is_variable_B else itype) B_ref = B.clone() C_shape = [varBC_groups, dstate, seqlen] - C = torch.randn(C_shape, - device=device, - dtype=wtype if not is_variable_C else itype) + C = torch.randn(C_shape, device=device, dtype=wtype if not is_variable_C else itype) C_ref = C.clone() D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None D_ref = D.clone() z = torch.randn(dim, seqlen, device=device, dtype=itype) z_ref = z.clone() - delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32) - ) if has_delta_bias else None + delta_bias = ( + (0.5 * torch.rand(dim, device=device, dtype=torch.float32)) + if has_delta_bias + else None + ) u = torch.randn(dim, seqlen, device=device, dtype=itype) u_ref = u.clone() - delta = (0.5 * torch.rand(dim, seqlen, device=device, dtype=itype)) + delta = 0.5 * torch.rand(dim, seqlen, device=device, dtype=itype) delta_ref = delta.clone() out = None out_ref = None prev_state_shape = (total_entries, u.shape[0], int(A.shape[1])) - prev_state = torch.randn(prev_state_shape, - device=u.device, - dtype=itype, - requires_grad=False) + prev_state = torch.randn( + prev_state_shape, device=u.device, dtype=itype, requires_grad=False + ) prev_state_ref = 
prev_state.clone() - state_indices = torch.randperm(total_entries, - dtype=torch.int32, - device=u.device)[:batch_size] - unused_states_bool = torch.ones(total_entries, - dtype=torch.bool, - device=device) + state_indices = torch.randperm(total_entries, dtype=torch.int32, device=u.device)[ + :batch_size + ] + unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device) unused_states_bool[state_indices] = False - padded_state_indices = torch.concat([ - state_indices, - torch.as_tensor( - [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device), - ], - dim=-1) - - has_initial_state = torch.randint(0, - 2, (cumsum.shape[0] - 1, ), - dtype=torch.bool, - device=u.device) - out = selective_scan_fn(u, prev_state, delta, A, B, C, D, z, delta_bias, - delta_softplus, cumsum, padded_state_indices, - has_initial_state) + padded_state_indices = torch.concat( + [ + state_indices, + torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device), + ], + dim=-1, + ) + + has_initial_state = torch.randint( + 0, 2, (cumsum.shape[0] - 1,), dtype=torch.bool, device=u.device + ) + out = selective_scan_fn( + u, + prev_state, + delta, + A, + B, + C, + D, + z, + delta_bias, + delta_softplus, + cumsum, + padded_state_indices, + has_initial_state, + ) outs_ref = [] splits = [ torch.split(var, seqlens[0], dim=-1) @@ -528,33 +557,46 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, delta_softplus=delta_softplus, return_last_state=return_last_state, prev_state=prev_state_ref[padded_state_indices[i]].unsqueeze(0) - if has_initial_state[i] else None, - final_state_out=prev_state_ref[padded_state_indices[i]].unsqueeze( - 0)) + if has_initial_state[i] + else None, + final_state_out=prev_state_ref[padded_state_indices[i]].unsqueeze(0), + ) outs_ref.append(out_ref_s) out_ref = torch.cat(outs_ref, dim=-1)[0] - unpadded_out = out[:, :out_ref[0].shape[-1]] + unpadded_out = out[:, : out_ref[0].shape[-1]] print("Output diff max", (unpadded_out - out_ref).max()) print("Output diff mean", (unpadded_out - out_ref).mean()) print("Output state diff max", (prev_state - prev_state_ref).max()) print("Output state diff mean", (prev_state - prev_state_ref).mean()) assert torch.allclose(prev_state, prev_state_ref, rtol=rtol, atol=atol) assert torch.allclose(unpadded_out, out_ref, rtol=rtol, atol=atol) - selective_scan_opcheck_fn(u, delta, A, B, C, D, z, delta_bias, - delta_softplus, cumsum, padded_state_indices, - has_initial_state, prev_state) - - -@pytest.mark.parametrize("itype", - [torch.float32, torch.float16, torch.bfloat16]) + selective_scan_opcheck_fn( + u, + delta, + A, + B, + C, + D, + z, + delta_bias, + delta_softplus, + cumsum, + padded_state_indices, + has_initial_state, + prev_state, + ) + + +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("has_z", [True]) @pytest.mark.parametrize("dstate", [16, 32, 64]) @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) # tests correctness in case subset of the sequences are padded @pytest.mark.parametrize("with_padding", [True, False]) -def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, - has_z, itype): +def test_selective_state_update_with_batch_indices( + with_padding, dim, dstate, has_z, itype +): device = "cuda" rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) if itype == torch.bfloat16: @@ -569,17 +611,17 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, total_entries = 10 * batch_size 
state = torch.randn(total_entries, dim, dstate, dtype=itype, device=device) state_indices = torch.randperm(total_entries)[:batch_size].to( - dtype=torch.int32, device=device) - unused_states_bool = torch.ones(total_entries, - dtype=torch.bool, - device=device) + dtype=torch.int32, device=device + ) + unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device) unused_states_bool[state_indices] = False - padded_state_indices = torch.concat([ - state_indices, - torch.as_tensor( - [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device) - ], - dim=0) + padded_state_indices = torch.concat( + [ + state_indices, + torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device), + ], + dim=0, + ) x = torch.randn(padded_batch_size, dim, device=device, dtype=itype) dt = torch.randn(padded_batch_size, dim, device=device, dtype=itype) dt_bias = torch.rand(dim, device=device) - 4.0 @@ -590,60 +632,59 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, z = torch.randn_like(x) if has_z else None state_ref = state[state_indices, :].clone() state_before = state.clone() - out = selective_state_update(state, - x, - dt, - A, - B, - C, - D=D, - z=z, - dt_bias=dt_bias, - dt_softplus=True, - state_batch_indices=padded_state_indices, - pad_slot_id=PAD_SLOT_ID) - out_ref = selective_state_update_ref(state_ref, - x[:batch_size], - dt[:batch_size], - A, - B[:batch_size], - C[:batch_size], - D=D, - z=z[:batch_size], - dt_bias=dt_bias, - dt_softplus=True) + out = selective_state_update( + state, + x, + dt, + A, + B, + C, + D=D, + z=z, + dt_bias=dt_bias, + dt_softplus=True, + state_batch_indices=padded_state_indices, + pad_slot_id=PAD_SLOT_ID, + ) + out_ref = selective_state_update_ref( + state_ref, + x[:batch_size], + dt[:batch_size], + A, + B[:batch_size], + C[:batch_size], + D=D, + z=z[:batch_size], + dt_bias=dt_bias, + dt_softplus=True, + ) print("Output diff max", (out[:batch_size] - out_ref).max()) print("Output diff mean", (out[:batch_size] - out_ref).mean()) print("Output state diff max", (state[state_indices, :] - state_ref).max()) - print("Output state diff mean", - (state[state_indices, :] - state_ref).mean()) + print("Output state diff mean", (state[state_indices, :] - state_ref).mean()) # test padded entries stay the same if with_padding: - assert torch.equal(state_before[unused_states_bool], - state[unused_states_bool]) - assert torch.equal(x[batch_size + 1:], x[batch_size + 1:]) - assert torch.equal(dt[batch_size + 1:], dt[batch_size + 1:]) - assert torch.equal(B[batch_size + 1:], B[batch_size + 1:]) - assert torch.equal(C[batch_size + 1:], C[batch_size + 1:]) + assert torch.equal(state_before[unused_states_bool], state[unused_states_bool]) + assert torch.equal(x[batch_size + 1 :], x[batch_size + 1 :]) + assert torch.equal(dt[batch_size + 1 :], dt[batch_size + 1 :]) + assert torch.equal(B[batch_size + 1 :], B[batch_size + 1 :]) + assert torch.equal(C[batch_size + 1 :], C[batch_size + 1 :]) # test "real" entries - assert torch.allclose(state[state_indices, :], - state_ref, - rtol=rtol, - atol=atol) + assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol) assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol) -@pytest.mark.parametrize("itype", - [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("has_z", [False, True]) @pytest.mark.parametrize("tie_hdim", [False, True]) 
@pytest.mark.parametrize("ngroups", [1, 2, 4]) @pytest.mark.parametrize("dstate", [16, 32, 64]) @pytest.mark.parametrize("dim", [2048, 4096]) def test_selective_state_update_with_heads_with_batch_indices( - dim, dstate, ngroups, has_z, tie_hdim, itype): + dim, dstate, ngroups, has_z, tie_hdim, itype +): device = "cuda" rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 3e-2) if itype == torch.bfloat16: @@ -655,69 +696,53 @@ def test_selective_state_update_with_heads_with_batch_indices( nheads = dim // headdim total_entries = 10 * batch_size - state = torch.randn(total_entries, - nheads, - headdim, - dstate, - dtype=itype, - device=device) + state = torch.randn( + total_entries, nheads, headdim, dstate, dtype=itype, device=device + ) state_indices = torch.randperm(total_entries)[:batch_size].to( - dtype=torch.int32, device=device) + dtype=torch.int32, device=device + ) x = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype) if not tie_hdim: - dt = torch.randn(batch_size, - nheads, - headdim, - device=device, - dtype=itype) + dt = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype) dt_bias = torch.rand(nheads, headdim, device=device) - 4.0 A = -torch.rand(nheads, headdim, dstate, device=device) - 1.0 D = torch.randn(nheads, headdim, device=device) else: - dt = repeat(torch.randn(batch_size, nheads, device=device, - dtype=itype), - "b h -> b h p", - p=headdim) - dt_bias = repeat(torch.rand(nheads, device=device) - 4.0, - "h -> h p", - p=headdim) - A = repeat(-torch.rand(nheads, device=device) - 1.0, - "h -> h p n", - p=headdim, - n=dstate) + dt = repeat( + torch.randn(batch_size, nheads, device=device, dtype=itype), + "b h -> b h p", + p=headdim, + ) + dt_bias = repeat(torch.rand(nheads, device=device) - 4.0, "h -> h p", p=headdim) + A = repeat( + -torch.rand(nheads, device=device) - 1.0, "h -> h p n", p=headdim, n=dstate + ) D = repeat(torch.randn(nheads, device=device), "h -> h p", p=headdim) B = torch.randn(batch_size, ngroups, dstate, device=device) C = torch.randn(batch_size, ngroups, dstate, device=device) z = torch.randn_like(x) if has_z else None state_ref = state[state_indices, :].detach().clone() - out = selective_state_update(state, - x, - dt, - A, - B, - C, - D=D, - z=z, - dt_bias=dt_bias, - dt_softplus=True, - state_batch_indices=state_indices, - pad_slot_id=PAD_SLOT_ID) - out_ref = selective_state_update_ref(state_ref, - x, - dt, - A, - B, - C, - D=D, - z=z, - dt_bias=dt_bias, - dt_softplus=True) + out = selective_state_update( + state, + x, + dt, + A, + B, + C, + D=D, + z=z, + dt_bias=dt_bias, + dt_softplus=True, + state_batch_indices=state_indices, + pad_slot_id=PAD_SLOT_ID, + ) + out_ref = selective_state_update_ref( + state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True + ) print(f"Output max diff: {(out - out_ref).abs().max().item()}") print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") - assert torch.allclose(state[state_indices, :], - state_ref, - rtol=rtol, - atol=atol) + assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol) assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 6a3f21ba543f..a71057977087 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -6,11 +6,11 @@ import torch.nn.functional as F from einops import rearrange, repeat -from vllm.model_executor.layers.mamba.ops.ssd_combined import ( - 
mamba_chunk_scan_combined) +from vllm.model_executor.layers.mamba.ops.ssd_combined import mamba_chunk_scan_combined from vllm.platforms import current_platform from vllm.v1.attention.backends.mamba_attn import ( - _query_start_loc_to_chunk_indices_offsets) + _query_start_loc_to_chunk_indices_offsets, +) # Added by the IBM Team, 2024 @@ -22,12 +22,10 @@ def segsum(x): """Calculates segment sum.""" T = x.size(-1) x = repeat(x, "... d -> ... d e", e=T) - mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), - diagonal=-1) + mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=-1) x = x.masked_fill(~mask, 0) x_segsum = torch.cumsum(x, dim=-2) - mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), - diagonal=0) + mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=0) x_segsum = x_segsum.masked_fill(~mask, -torch.inf) return x_segsum @@ -46,8 +44,9 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None): assert X.shape[1] % block_len == 0 # Rearrange into blocks/chunks - X, A, B, C = (rearrange(x, "b (c l) ... -> b c l ...", l=block_len) - for x in (X, A, B, C)) + X, A, B, C = ( + rearrange(x, "b (c l) ... -> b c l ...", l=block_len) for x in (X, A, B, C) + ) A = rearrange(A, "b c l h -> b h c l") A_cumsum = torch.cumsum(A, dim=-1) @@ -74,7 +73,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None): # 4. Compute state -> output conversion per chunk # (left term of low-rank factorization of off-diagonal blocks; C terms) state_decay_out = torch.exp(A_cumsum) - Y_off = torch.einsum('bclhn,bchpn,bhcl->bclhp', C, states, state_decay_out) + Y_off = torch.einsum("bclhn,bchpn,bhcl->bclhp", C, states, state_decay_out) # Add output of intra-chunk and inter-chunk terms # (diagonal and off-diagonal blocks) @@ -82,61 +81,48 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None): return Y, final_state -def generate_random_inputs(batch_size, - seqlen, - n_heads, - d_head, - itype, - device='cuda'): - +def generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype, device="cuda"): current_platform.seed_everything(0) - A = (-torch.exp(torch.rand(n_heads, dtype=itype, device=device))) + A = -torch.exp(torch.rand(n_heads, dtype=itype, device=device)) dt = F.softplus( - torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - - 4) - X = torch.randn((batch_size, seqlen, n_heads, d_head), - dtype=itype, - device=device) - B = torch.randn((batch_size, seqlen, n_heads, d_head), - dtype=itype, - device=device) - C = torch.randn((batch_size, seqlen, n_heads, d_head), - dtype=itype, - device=device) + torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - 4 + ) + X = torch.randn((batch_size, seqlen, n_heads, d_head), dtype=itype, device=device) + B = torch.randn((batch_size, seqlen, n_heads, d_head), dtype=itype, device=device) + C = torch.randn((batch_size, seqlen, n_heads, d_head), dtype=itype, device=device) return A, dt, X, B, C -def generate_continuous_batched_examples(example_lens_by_batch, - num_examples, - full_length, - last_taken, - exhausted, - n_heads, - d_head, - itype, - device='cuda'): - +def generate_continuous_batched_examples( + example_lens_by_batch, + num_examples, + full_length, + last_taken, + exhausted, + n_heads, + d_head, + itype, + device="cuda", +): # this function generates a random examples of certain length # and then cut according to "example_lens_by_batch" and feed # them in continuous batches to the kernels # generate the full-length 
example - A, dt, X, B, C = generate_random_inputs(num_examples, full_length, n_heads, - d_head, itype) + A, dt, X, B, C = generate_random_inputs( + num_examples, full_length, n_heads, d_head, itype + ) - Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), - A * dt, - B, - C, - block_len=full_length // 4) + Y_min, final_state_min = ssd_minimal_discrete( + X * dt.unsqueeze(-1), A * dt, B, C, block_len=full_length // 4 + ) # internal function that outputs a cont batch of examples # given a tuple of lengths for each example in the batch # e.g., example_lens=(8, 4) means take 8 samples from first eg, # 4 examples from second eg, etc def get_continuous_batch(example_lens: tuple[int, ...]): - indices = [] for i, x in enumerate(example_lens): c = last_taken.get(i, 0) @@ -144,8 +130,10 @@ def get_continuous_batch(example_lens: tuple[int, ...]): last_taken[i] = (c + x) % full_length exhausted[i] = last_taken[i] == 0 - return (torch.concat([x[i, s:e] for i, (s, e) in enumerate(indices) - ]).unsqueeze(0) for x in (dt, X, B, C)) + return ( + torch.concat([x[i, s:e] for i, (s, e) in enumerate(indices)]).unsqueeze(0) + for x in (dt, X, B, C) + ) # internal function that maps "n" to the appropriate right boundary # value when forming continuous batches from examples of length given @@ -157,19 +145,20 @@ def end_boundary(n: int): IND_E = None for spec in example_lens_by_batch: - # get the (maybe partial) example seen in this cont batch dt2, X2, B2, C2 = get_continuous_batch(spec) # get the metadata - cu_seqlens = torch.tensor((0, ) + spec, device=device).cumsum(dim=0) - seq_idx = torch.zeros(cu_seqlens[-1], - dtype=torch.int32, - device=cu_seqlens.device) - for i, (srt, end) in enumerate(zip( + cu_seqlens = torch.tensor((0,) + spec, device=device).cumsum(dim=0) + seq_idx = torch.zeros( + cu_seqlens[-1], dtype=torch.int32, device=cu_seqlens.device + ) + for i, (srt, end) in enumerate( + zip( cu_seqlens, cu_seqlens[1:], - )): + ) + ): seq_idx[srt:end] = i # for cont batch @@ -179,18 +168,19 @@ def end_boundary(n: int): IND_S = [x % full_length for x in IND_E] IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)] - yield ([Y_min[s, IND_S[s]:IND_E[s]] for s in range(num_examples)], - cu_seqlens, seq_idx.unsqueeze(0), (A, dt2, X2, B2, C2)) + yield ( + [Y_min[s, IND_S[s] : IND_E[s]] for s in range(num_examples)], + cu_seqlens, + seq_idx.unsqueeze(0), + (A, dt2, X2, B2, C2), + ) -@pytest.mark.parametrize("itype", - [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32]) @pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128]) @pytest.mark.parametrize("seq_len_chunk_size", [(119, 17), (128, 32)]) -def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, - itype): - +def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype): # this tests the kernels on a single example (no batching) # set seed @@ -200,30 +190,27 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, # it is not an operational limitation. 
seqlen, chunk_size = seq_len_chunk_size - A, dt, X, B, C = generate_random_inputs(batch_size, seqlen, n_heads, - d_head, itype) + A, dt, X, B, C = generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype) - Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt, - B, C, chunk_size) + Y_min, final_state_min = ssd_minimal_discrete( + X * dt.unsqueeze(-1), A * dt, B, C, chunk_size + ) - Y, final_state = mamba_chunk_scan_combined(X, - dt, - A, - B, - C, - chunk_size, - D=None, - return_final_states=True) + Y, final_state = mamba_chunk_scan_combined( + X, dt, A, B, C, chunk_size, D=None, return_final_states=True + ) # just test the last in sequence torch.allclose(Y[:, -1], Y_min[:, -1], atol=1e-3, rtol=1e-3) # just test the last head # NOTE, in the kernel we always cast states to fp32 - torch.allclose(final_state[:, -1], - final_state_min[:, -1].to(torch.float32), - atol=1e-3, - rtol=1e-3) + torch.allclose( + final_state[:, -1], + final_state_min[:, -1].to(torch.float32), + atol=1e-3, + rtol=1e-3, + ) @pytest.mark.parametrize("itype", [torch.float32, torch.float16]) @@ -232,32 +219,39 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, @pytest.mark.parametrize( "seq_len_chunk_size_cases", [ - # small-ish chunk_size (8) (64, 8, 2, [(64, 32), (64, 32)]), (64, 8, 2, [(32, 32), (32, 32), (32, 32)]), (64, 8, 2, [(8, 8), (8, 8), (8, 8)]), # chunk size boundary - (64, 8, 2, [(4, 4), (4, 4), (4, 4), - (4, 4)]), # chunk_size larger than cont batches - (64, 8, 5, [ - (64, 32, 16, 8, 8), - (8, 16, 32, 16, 8), - (8, 8, 16, 32, 16), - ]), # mode examples with varied lengths - + ( + 64, + 8, + 2, + [(4, 4), (4, 4), (4, 4), (4, 4)], + ), # chunk_size larger than cont batches + ( + 64, + 8, + 5, + [ + (64, 32, 16, 8, 8), + (8, 16, 32, 16, 8), + (8, 8, 16, 32, 16), + ], + ), # mode examples with varied lengths # odd chunk_size - (64, 29, 2, [(11, 4), (13, 23), (19, 22), - (21, 15)]), # irregular sizes - + (64, 29, 2, [(11, 4), (13, 23), (19, 22), (21, 15)]), # irregular sizes # large-ish chunk_size (256) - (64, 256, 1, [(5, ), (1, ), (1, ), - (1, )]), # irregular sizes with small sequences - (64, 256, 2, [(5, 30), (1, 2), (1, 2), - (1, 2)]), # irregular sizes with small sequences - ]) -def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, - itype): - + (64, 256, 1, [(5,), (1,), (1,), (1,)]), # irregular sizes with small sequences + ( + 64, + 256, + 2, + [(5, 30), (1, 2), (1, 2), (1, 2)], + ), # irregular sizes with small sequences + ], +) +def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, itype): # this test with multiple examples in a continuous batch # (i.e. 
chunked prefill) @@ -270,13 +264,17 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, states = None for Y_min, cu_seqlens, seq_idx, ( - A, dt, X, B, C) in generate_continuous_batched_examples( - cases, num_examples, seqlen, last_taken, exhausted, n_heads, - d_head, itype): - - chunk_indices, chunk_offsets = \ - _query_start_loc_to_chunk_indices_offsets( - cu_seqlens, chunk_size, cu_seqlens[-1]) + A, + dt, + X, + B, + C, + ) in generate_continuous_batched_examples( + cases, num_examples, seqlen, last_taken, exhausted, n_heads, d_head, itype + ): + chunk_indices, chunk_offsets = _query_start_loc_to_chunk_indices_offsets( + cu_seqlens, chunk_size, cu_seqlens[-1] + ) Y, new_states = mamba_chunk_scan_combined( X, @@ -296,9 +294,8 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, # just test the last in sequence for i in range(num_examples): - # just test one dim and dstate - Y_eg = Y[0, cu_seqlens[i]:cu_seqlens[i + 1], 0, 0] + Y_eg = Y[0, cu_seqlens[i] : cu_seqlens[i + 1], 0, 0] Y_min_eg = Y_min[i][:, 0, 0] torch.allclose(Y_eg, Y_min_eg, atol=1e-3, rtol=1e-3) @@ -306,5 +303,5 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, states = new_states for i, clear in exhausted.items(): if clear: - states[i].fill_(0.) + states[i].fill_(0.0) exhausted[i] = False diff --git a/tests/kernels/moe/modular_kernel_tools/cli_args.py b/tests/kernels/moe/modular_kernel_tools/cli_args.py index b95d87cd04f5..d46847fbf6a3 100644 --- a/tests/kernels/moe/modular_kernel_tools/cli_args.py +++ b/tests/kernels/moe/modular_kernel_tools/cli_args.py @@ -9,18 +9,19 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from .common import Config -from .mk_objects import (MK_ALL_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES, - MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES) +from .mk_objects import ( + MK_ALL_PREPARE_FINALIZE_TYPES, + MK_FUSED_EXPERT_TYPES, + MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, +) def make_config_arg_parser(description: str): - def to_pf_class_type(s: str) -> mk.FusedMoEPrepareAndFinalize: for pf in MK_ALL_PREPARE_FINALIZE_TYPES: if pf.__name__ == s: return pf - raise ValueError( - f"Cannot find a PrepareFinalize type that matches {s}") + raise ValueError(f"Cannot find a PrepareFinalize type that matches {s}") def to_experts_class_type(s: str) -> mk.FusedMoEPermuteExpertsUnpermute: for fe in MK_FUSED_EXPERT_TYPES: @@ -45,15 +46,18 @@ def to_quant_torch_dtype(s: str) -> torch.dtype: "--pf-type", type=to_pf_class_type, required=True, - help=("Choose a PrepareFinalize Type : " - f"{[x.__name__ for x in MK_ALL_PREPARE_FINALIZE_TYPES]}"), + help=( + "Choose a PrepareFinalize Type : " + f"{[x.__name__ for x in MK_ALL_PREPARE_FINALIZE_TYPES]}" + ), ) parser.add_argument( "--experts-type", type=to_experts_class_type, required=True, - help=(f"Choose a FusedExpert type : " - f"{[x.__name__ for x in MK_FUSED_EXPERT_TYPES]}"), + help=( + f"Choose a FusedExpert type : {[x.__name__ for x in MK_FUSED_EXPERT_TYPES]}" + ), ) parser.add_argument( "-m", @@ -74,66 +78,65 @@ def to_quant_torch_dtype(s: str) -> torch.dtype: default=1024, help="N dimension of the first fused-moe matmul", ) - parser.add_argument("--num-experts", - type=int, - default=32, - help="Global num experts") - parser.add_argument("--topk", - nargs="+", - type=int, - default=[4, 1], - help="num topk") + parser.add_argument( + "--num-experts", type=int, default=32, help="Global num experts" + ) + parser.add_argument("--topk", nargs="+", type=int, 
default=[4, 1], help="num topk") parser.add_argument( "--fused-moe-chunk-size", type=int, - help="Fused moe chunk size used for the non-batched fused experts impl." + help="Fused moe chunk size used for the non-batched fused experts impl.", ) # Quant args - parser.add_argument("--quant-dtype", - type=to_quant_torch_dtype, - help="Quant datatype") - parser.add_argument("--per-token-quantized-activations", - action='store_true', - help=("The input activations must be per-token " - "quantized")) - parser.add_argument("--per-channel-quantized-weights", - action="store_true", - help="The weights must be per-channel quantized.") - parser.add_argument("--block-shape", - nargs="+", - type=int, - help="Quantization block shape") + parser.add_argument( + "--quant-dtype", type=to_quant_torch_dtype, help="Quant datatype" + ) + parser.add_argument( + "--per-token-quantized-activations", + action="store_true", + help=("The input activations must be per-token quantized"), + ) + parser.add_argument( + "--per-channel-quantized-weights", + action="store_true", + help="The weights must be per-channel quantized.", + ) + parser.add_argument( + "--block-shape", nargs="+", type=int, help="Quantization block shape" + ) # Torch trace profile generation args - parser.add_argument("--torch-trace-dir-path", - type=str, - default=None, - help="Get torch trace for single execution") + parser.add_argument( + "--torch-trace-dir-path", + type=str, + default=None, + help="Get torch trace for single execution", + ) return parser def _validate_args(args: argparse.Namespace): - if args.quant_dtype is not None: assert args.quant_dtype == torch.float8_e4m3fn if args.block_shape is not None: assert len(args.block_shape) == 2, ( - f"block shape must have 2 elements. got {args.block_shape}") + f"block shape must have 2 elements. 
got {args.block_shape}" + ) if args.experts_type in MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES: - assert args.world_size == 1, ( - "Single GPU objects need world size set to 1") + assert args.world_size == 1, "Single GPU objects need world size set to 1" if args.torch_trace_dir_path is not None: from pathlib import Path + assert Path(args.torch_trace_dir_path).is_dir(), ( - f"Please create {args.torch_trace_dir_path}") + f"Please create {args.torch_trace_dir_path}" + ) def make_config(args: argparse.Namespace) -> Config: - _validate_args(args) quant_config = None @@ -142,7 +145,8 @@ def make_config(args: argparse.Namespace) -> Config: quant_dtype=args.quant_dtype, per_act_token_quant=args.per_token_quantized_activations, per_out_ch_quant=args.per_channel_quantized_weights, - block_shape=args.block_shape) + block_shape=args.block_shape, + ) return Config( Ms=args.m, @@ -156,4 +160,5 @@ def make_config(args: argparse.Namespace) -> Config: fused_experts_type=args.experts_type, fused_moe_chunk_size=args.fused_moe_chunk_size, world_size=args.world_size, - torch_trace_dir_path=args.torch_trace_dir_path) + torch_trace_dir_path=args.torch_trace_dir_path, + ) diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index fd99e8dc5c98..48d4beb8c294 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -10,38 +10,54 @@ from tests.kernels.utils import torch_experts from vllm.config import VllmConfig from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size + # Fused experts and PrepareFinalize imports from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - BatchedDeepGemmExperts) + BatchedDeepGemmExperts, +) from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 - BatchedTritonOrDeepGemmExperts) + BatchedTritonOrDeepGemmExperts, +) from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, FusedMoEParallelConfig, FusedMoEQuantConfig) + FusedMoEConfig, + FusedMoEParallelConfig, + FusedMoEQuantConfig, +) from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts, NaiveBatchedExperts) + BatchedTritonExperts, + NaiveBatchedExperts, +) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk -from vllm.model_executor.layers.fused_moe.layer import (FusedMoEMethodBase, - TritonExperts) +from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, TritonExperts from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP) + MoEPrepareAndFinalizeNoEP, +) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( - TritonOrDeepGemmExperts) + TritonOrDeepGemmExperts, +) from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from .parallel_utils import ProcessGroupInfo -from .utils import (make_block_quant_fp8_weights, make_non_quant_weights, - make_quant_fp8_weights, per_token_cast_to_fp8) +from .utils import ( + make_block_quant_fp8_weights, + make_non_quant_weights, + make_quant_fp8_weights, + per_token_cast_to_fp8, +) if has_pplx(): from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( - PplxPrepareAndFinalize) + PplxPrepareAndFinalize, + ) if has_deep_ep(): from 
vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 - DeepEPHTPrepareAndFinalize) + DeepEPHTPrepareAndFinalize, + ) from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 - DeepEPLLPrepareAndFinalize) + DeepEPLLPrepareAndFinalize, + ) def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str: @@ -110,8 +126,7 @@ def is_per_act_token_quant(self) -> bool: def is_per_tensor_act_quant(self) -> bool: if self.quant_config is None: return False - return (not self.is_per_act_token_quant - and self.quant_block_shape is None) + return not self.is_per_act_token_quant and self.quant_block_shape is None @property def is_per_out_ch_quant(self) -> bool: @@ -136,7 +151,8 @@ def topk_ids_dtype(self) -> Optional[torch.dtype]: if self.prepare_finalize_type == PplxPrepareAndFinalize: topk_ids_dtype = torch.uint32 elif self.prepare_finalize_type in [ - DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize + DeepEPHTPrepareAndFinalize, + DeepEPLLPrepareAndFinalize, ]: topk_ids_dtype = torch.int64 return topk_ids_dtype @@ -147,7 +163,7 @@ def num_local_experts(self) -> int: def make_env_data(self) -> tuple[VllmConfig, dict[Any, Any]]: """ - make env data for vllm launch. + make env data for vllm launch. """ vllm_config = VllmConfig() vllm_config.parallel_config.data_parallel_size = self.world_size @@ -159,34 +175,45 @@ def make_env_data(self) -> tuple[VllmConfig, dict[Any, Any]]: } if self.fused_moe_chunk_size is not None: env_dict.update( - {"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)}) + {"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)} + ) return vllm_config, env_dict def is_fp8_block_quantized(self): - return (self.quant_dtype == torch.float8_e4m3fn - and self.quant_block_shape is not None) + return ( + self.quant_dtype == torch.float8_e4m3fn + and self.quant_block_shape is not None + ) def is_batched_prepare_finalize(self): return self.prepare_finalize_type in [ - PplxPrepareAndFinalize, DeepEPLLPrepareAndFinalize + PplxPrepareAndFinalize, + DeepEPLLPrepareAndFinalize, ] def is_batched_fused_experts(self): return self.fused_experts_type in [ - CutlassExpertsFp8, BatchedDeepGemmExperts, BatchedTritonExperts, - NaiveBatchedExperts, BatchedTritonOrDeepGemmExperts + CutlassExpertsFp8, + BatchedDeepGemmExperts, + BatchedTritonExperts, + NaiveBatchedExperts, + BatchedTritonOrDeepGemmExperts, ] def is_standard_fused_experts(self): return self.fused_experts_type in [ - CutlassExpertsFp8, DeepGemmExperts, TritonOrDeepGemmExperts, - TritonExperts + CutlassExpertsFp8, + DeepGemmExperts, + TritonOrDeepGemmExperts, + TritonExperts, ] def is_fe_16bit_supported(self): return self.fused_experts_type in [ - BatchedTritonExperts, BatchedTritonOrDeepGemmExperts, - NaiveBatchedExperts, TritonExperts + BatchedTritonExperts, + BatchedTritonOrDeepGemmExperts, + NaiveBatchedExperts, + TritonExperts, ] def is_fe_fp8_supported(self): @@ -214,8 +241,10 @@ def is_fe_block_fp8_supported(self): def is_fe_supports_chunking(self): return self.fused_experts_type in [ - CutlassExpertsFp8, DeepGemmExperts, TritonOrDeepGemmExperts, - TritonExperts + CutlassExpertsFp8, + DeepGemmExperts, + TritonOrDeepGemmExperts, + TritonExperts, ] def needs_deep_gemm(self): @@ -229,7 +258,8 @@ def needs_pplx(self): def needs_deep_ep(self): return self.prepare_finalize_type in [ - DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize + DeepEPHTPrepareAndFinalize, + DeepEPLLPrepareAndFinalize, ] def all2all_backend(self): @@ -243,8 +273,9 @@ def 
all2all_backend(self): def needs_all2all(self): return self.prepare_finalize_type in [ - PplxPrepareAndFinalize, DeepEPHTPrepareAndFinalize, - DeepEPLLPrepareAndFinalize + PplxPrepareAndFinalize, + DeepEPHTPrepareAndFinalize, + DeepEPLLPrepareAndFinalize, ] def is_valid(self): @@ -261,14 +292,16 @@ def is_valid(self): return False # Check quantization sanity - if (int(self.is_per_act_token_quant) + - int(self.is_per_tensor_act_quant) + - int(self.quant_block_shape is not None)) > 1: + if ( + int(self.is_per_act_token_quant) + + int(self.is_per_tensor_act_quant) + + int(self.quant_block_shape is not None) + ) > 1: # invalid quant config return False # check bf16 / fp16 support - is_16bit = (self.dtype.itemsize == 2 and self.quant_dtype is None) + is_16bit = self.dtype.itemsize == 2 and self.quant_dtype is None if is_16bit and not self.is_fe_16bit_supported(): return False @@ -309,10 +342,10 @@ class WeightTensors: def describe(self): s = "" s += "== Weight Tensors: \n" - s += f' - {_describe_tensor(self.w1, "w1")} \n' - s += f' - {_describe_tensor(self.w2, "w2")} \n' - s += f' - {_describe_tensor(self.w1_scale, "w1_scale")} \n' - s += f' - {_describe_tensor(self.w2_scale, "w2_scale")} \n' + s += f" - {_describe_tensor(self.w1, 'w1')} \n" + s += f" - {_describe_tensor(self.w2, 'w2')} \n" + s += f" - {_describe_tensor(self.w1_scale, 'w1_scale')} \n" + s += f" - {_describe_tensor(self.w2_scale, 'w2_scale')} \n" return s def to_current_device(self): @@ -322,13 +355,10 @@ def to_current_device(self): if is_quantized: assert self.w1_scale is not None assert self.w2_scale is not None - self.w1_scale = self.w1_scale.to( - device=torch.cuda.current_device()) - self.w2_scale = self.w2_scale.to( - device=torch.cuda.current_device()) + self.w1_scale = self.w1_scale.to(device=torch.cuda.current_device()) + self.w2_scale = self.w2_scale.to(device=torch.cuda.current_device()) - def slice_weights(self, rank: int, - num_local_experts: int) -> "WeightTensors": + def slice_weights(self, rank: int, num_local_experts: int) -> "WeightTensors": s = rank * num_local_experts e = s + num_local_experts w1 = self.w1[s:e, :, :] @@ -344,13 +374,11 @@ def slice_weights(self, rank: int, @staticmethod def make(config: Config) -> "WeightTensors": - if config.quant_dtype is None: # just make normal dtype weights - w1, w2 = make_non_quant_weights(e=config.E, - n=config.N, - k=config.K, - dtype=config.dtype) + w1, w2 = make_non_quant_weights( + e=config.E, n=config.N, k=config.K, dtype=config.dtype + ) return WeightTensors(w1=w1, w2=w2, w1_scale=None, w2_scale=None) assert config.quant_dtype == torch.float8_e4m3fn @@ -361,10 +389,7 @@ def make(config: Config) -> "WeightTensors": k=config.K, per_out_channel_quant=config.is_per_out_ch_quant, ) - return WeightTensors(w1=w1, - w2=w2, - w1_scale=w1_scale, - w2_scale=w2_scale) + return WeightTensors(w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale) assert config.quant_block_shape is not None w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights( @@ -373,10 +398,7 @@ def make(config: Config) -> "WeightTensors": k=config.K, block_size=config.quant_block_shape, ) - return WeightTensors(w1=w1, - w2=w2, - w1_scale=w1_scale, - w2_scale=w2_scale) + return WeightTensors(w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale) @dataclass @@ -393,22 +415,22 @@ class RankTensors: def describe(self): s = "" s += "== Rank Tensors: \n" - s += f' - {_describe_tensor(self.hidden_states, "HS")} \n' - s += f' - {_describe_tensor(self.hidden_states_scale, "HS_scale")} \n' - s += f' - 
{_describe_tensor(self.topk_weights, "topk_weights")} \n' - s += f' - {_describe_tensor(self.topk_ids, "topk_ids")} \n' - s += f' - {_describe_tensor(self.expert_map, "expert_map")} \n' + s += f" - {_describe_tensor(self.hidden_states, 'HS')} \n" + s += f" - {_describe_tensor(self.hidden_states_scale, 'HS_scale')} \n" + s += f" - {_describe_tensor(self.topk_weights, 'topk_weights')} \n" + s += f" - {_describe_tensor(self.topk_ids, 'topk_ids')} \n" + s += f" - {_describe_tensor(self.expert_map, 'expert_map')} \n" return s @staticmethod def make_hidden_states( - config: Config) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + config: Config, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """ Return hidden_states """ m, k, dtype = (config.M, config.K, config.dtype) - a = (torch.randn( - (m, k), device=torch.cuda.current_device(), dtype=dtype) / 15.0) + a = torch.randn((m, k), device=torch.cuda.current_device(), dtype=dtype) / 15.0 if config.quant_dtype is None: return a, None @@ -419,36 +441,29 @@ def make_hidden_states( # first - so further quantize and dequantize will yield the same # values. if config.is_per_tensor_act_quant: - a_q, a_scales = ops.scaled_fp8_quant( - a, use_per_token_if_dynamic=False) + a_q, a_scales = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=False) return a_q.float().mul(a_scales).to(dtype), a_scales if config.is_per_act_token_quant: - a_q, a_scales = ops.scaled_fp8_quant(a, - use_per_token_if_dynamic=True) + a_q, a_scales = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=True) return a_q.float().mul(a_scales).to(dtype), None assert config.quant_block_shape is not None block_k = config.quant_block_shape[1] a_q, a_scales = per_token_cast_to_fp8(a, block_size=block_k) - return a_q.float().view( - (-1, block_k)).mul(a_scales.view(-1, 1)).view(m, k).to(dtype), None + return a_q.float().view((-1, block_k)).mul(a_scales.view(-1, 1)).view(m, k).to( + dtype + ), None @staticmethod def make(config: Config, pgi: ProcessGroupInfo): - dtype = config.dtype topk, m, _ = (config.topk, config.M, config.K) - hidden_states, hidden_states_scale = RankTensors.make_hidden_states( - config) - - num_local_experts, global_num_experts = (config.num_local_experts, - config.E) - score = torch.randn((m, global_num_experts), - device="cuda", - dtype=dtype) - topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk, - False) + hidden_states, hidden_states_scale = RankTensors.make_hidden_states(config) + + num_local_experts, global_num_experts = (config.num_local_experts, config.E) + score = torch.randn((m, global_num_experts), device="cuda", dtype=dtype) + topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk, False) topk_ids = topk_ids.to(config.topk_ids_dtype) # distribute topk_ids evenly @@ -458,14 +473,15 @@ def make(config: Config, pgi: ProcessGroupInfo): expert_map = None if config.world_size > 1: - expert_map = torch.full((global_num_experts, ), - fill_value=-1, - dtype=torch.int32) + expert_map = torch.full( + (global_num_experts,), fill_value=-1, dtype=torch.int32 + ) s = pgi.rank * num_local_experts e = s + num_local_experts expert_map[s:e] = torch.tensor(list(range(num_local_experts))) - expert_map = expert_map.to(device=torch.cuda.current_device(), - dtype=torch.int32) + expert_map = expert_map.to( + device=torch.cuda.current_device(), dtype=torch.int32 + ) return RankTensors( hidden_states=hidden_states, @@ -477,29 +493,30 @@ def make(config: Config, pgi: ProcessGroupInfo): ) -def reference_moe_impl(config: Config, weights: WeightTensors, - 
rank_tensors: RankTensors) -> torch.Tensor: - - return torch_experts(a=rank_tensors.hidden_states, - w1=weights.w1, - w2=weights.w2, - topk_weight=rank_tensors.topk_weights, - topk_ids=rank_tensors.topk_ids, - global_num_experts=config.E, - expert_map=None, - w1_scale=weights.w1_scale, - w2_scale=weights.w2_scale, - a1_scale=rank_tensors.hidden_states_scale, - quant_dtype=config.quant_dtype, - per_act_token_quant=config.is_per_act_token_quant, - block_shape=config.quant_block_shape, - apply_router_weights_on_input=config.topk == 1) +def reference_moe_impl( + config: Config, weights: WeightTensors, rank_tensors: RankTensors +) -> torch.Tensor: + return torch_experts( + a=rank_tensors.hidden_states, + w1=weights.w1, + w2=weights.w2, + topk_weight=rank_tensors.topk_weights, + topk_ids=rank_tensors.topk_ids, + global_num_experts=config.E, + expert_map=None, + w1_scale=weights.w1_scale, + w2_scale=weights.w2_scale, + a1_scale=rank_tensors.hidden_states_scale, + quant_dtype=config.quant_dtype, + per_act_token_quant=config.is_per_act_token_quant, + block_shape=config.quant_block_shape, + apply_router_weights_on_input=config.topk == 1, + ) def make_fused_experts( - config: Config, moe: FusedMoEConfig, - num_dispatchers: int) -> mk.FusedMoEPermuteExpertsUnpermute: - + config: Config, moe: FusedMoEConfig, num_dispatchers: int +) -> mk.FusedMoEPermuteExpertsUnpermute: use_fp8 = config.quant_dtype == torch.float8_e4m3fn batch_kwargs = { "max_num_tokens": moe.max_num_tokens, @@ -547,8 +564,7 @@ def make_fused_experts( experts = NaiveBatchedExperts(**kwargs) elif config.fused_experts_type == CutlassExpertsFp8: use_batched_format = config.is_batched_prepare_finalize() - num_experts = (moe.num_local_experts - if use_batched_format else moe.num_experts) + num_experts = moe.num_local_experts if use_batched_format else moe.num_experts kwargs = { "max_experts_per_worker": num_experts, "out_dtype": moe.in_dtype, @@ -556,7 +572,7 @@ def make_fused_experts( "per_out_ch_quant": config.is_per_out_ch_quant, "block_shape": config.quant_block_shape, "num_dispatchers": num_dispatchers, - "use_batched_format": use_batched_format + "use_batched_format": use_batched_format, } print(f"Making CutlassExpertsFp8 {kwargs} ...") experts = CutlassExpertsFp8(**kwargs) @@ -564,14 +580,15 @@ def make_fused_experts( return experts -def make_modular_kernel(config: Config, - vllm_config: VllmConfig) -> mk.FusedMoEModularKernel: - +def make_modular_kernel( + config: Config, vllm_config: VllmConfig +) -> mk.FusedMoEModularKernel: def next_power_of_2(x): import math + if x == 0: return 1 - return 2**math.ceil(math.log2(x)) + return 2 ** math.ceil(math.log2(x)) # make moe config moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( @@ -598,11 +615,11 @@ def next_power_of_2(x): else: prepare_finalize = MoEPrepareAndFinalizeNoEP() - fused_experts = make_fused_experts(config, moe, - prepare_finalize.num_dispatchers()) + fused_experts = make_fused_experts(config, moe, prepare_finalize.num_dispatchers()) modular_kernel = mk.FusedMoEModularKernel( - prepare_finalize=prepare_finalize, fused_experts=fused_experts) + prepare_finalize=prepare_finalize, fused_experts=fused_experts + ) return modular_kernel @@ -623,8 +640,7 @@ def run_modular_kernel( mk = make_modular_kernel(config, vllm_config) mk_kwargs = { - "hidden_states": rank_tensors.hidden_states.clone( - ), # impls might update the tensor in place + "hidden_states": rank_tensors.hidden_states.clone(), # impls might update the tensor in place "w1": rank_weights.w1, "w2": 
rank_weights.w2, "topk_weights": rank_tensors.topk_weights, diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py index 5dbfdfc153f9..6aa64d3f4929 100644 --- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py +++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py @@ -13,10 +13,18 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.platforms import current_platform -from .common import (Config, RankTensors, WeightTensors, reference_moe_impl, - run_modular_kernel) -from .mk_objects import (MK_FUSED_EXPERT_TYPES, - MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_QUANT_CONFIGS) +from .common import ( + Config, + RankTensors, + WeightTensors, + reference_moe_impl, + run_modular_kernel, +) +from .mk_objects import ( + MK_FUSED_EXPERT_TYPES, + MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, + MK_QUANT_CONFIGS, +) from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config @@ -37,8 +45,9 @@ def rank_worker( # sanity check from vllm import envs + if config.fused_moe_chunk_size is not None: - assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE) + assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE # get weights to this device weights.to_current_device() @@ -59,8 +68,7 @@ def rank_worker( rank_tensors = RankTensors.make(cfgx, pgi) # modular kernel out - mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, - rank_tensors) + mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, rank_tensors) with set_current_vllm_config(vllm_config): ref_out = reference_moe_impl(cfgx, weights, rank_tensors) @@ -69,28 +77,27 @@ def rank_worker( def make_feature_matrix(csv_file_path: str): - from dataclasses import asdict import pandas as pd - def add_to_results(config: Config, - success: Result, - results_df: Optional[pd.DataFrame] = None): + def add_to_results( + config: Config, success: Result, results_df: Optional[pd.DataFrame] = None + ): config_dict = asdict(config) - config_dict['prepare_finalize_type'] = config_dict[ - 'prepare_finalize_type'].__name__ - config_dict['fused_experts_type'] = config_dict[ - 'fused_experts_type'].__name__ - config_dict['per_tensor_act_quant'] = config.is_per_tensor_act_quant - quant_config_dict = config_dict['quant_config'] - del config_dict['quant_config'] + config_dict["prepare_finalize_type"] = config_dict[ + "prepare_finalize_type" + ].__name__ + config_dict["fused_experts_type"] = config_dict["fused_experts_type"].__name__ + config_dict["per_tensor_act_quant"] = config.is_per_tensor_act_quant + quant_config_dict = config_dict["quant_config"] + del config_dict["quant_config"] if quant_config_dict is None: quant_config = FusedMoEQuantConfig(None) quant_config_dict = asdict(quant_config) config_dict |= quant_config_dict - result_dict = config_dict | {'success': success.name} + result_dict = config_dict | {"success": success.name} result_df = pd.DataFrame([result_dict]) if results_df is None: @@ -111,22 +118,26 @@ def add_to_results(config: Config, Q_TYPES = MK_QUANT_CONFIGS combinations = list( - product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES)) + product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES) + ) results_df: Optional[pd.DataFrame] = None for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm( - combinations): #noqa: E501 - config = Config(Ms=[m], - K=k, - N=n, - E=e, - topks=topks, - dtype=dtype, - prepare_finalize_type=pf_type, - 
fused_experts_type=experts_type, - quant_config=quant_config, - world_size=2, - fused_moe_chunk_size=None) + combinations + ): # noqa: E501 + config = Config( + Ms=[m], + K=k, + N=n, + E=e, + topks=topks, + dtype=dtype, + prepare_finalize_type=pf_type, + fused_experts_type=experts_type, + quant_config=quant_config, + world_size=2, + fused_moe_chunk_size=None, + ) success = None if config.is_valid(): @@ -134,9 +145,14 @@ def add_to_results(config: Config, try: weights: WeightTensors = WeightTensors.make(config) vllm_config, env_dict = config.make_env_data() - parallel_launch_with_config(config.world_size, rank_worker, - vllm_config, env_dict, config, - weights) + parallel_launch_with_config( + config.world_size, + rank_worker, + vllm_config, + env_dict, + config, + weights, + ) success = Result.PASS except Exception as _: success = Result.FAIL @@ -149,25 +165,33 @@ def add_to_results(config: Config, results_df.to_csv(f"{csv_file_path}") -if __name__ == '__main__': +if __name__ == "__main__": import argparse from pathlib import Path - parser = argparse.ArgumentParser(description=( - "Make ModularKernel feature matrix \n" - "Example : python3 -m tests.kernels.moe.modular_kernel_tools.make_feature_matrix " #noqa: E501 - "-f ./feature_matrices/feature_matrix.csv")) - - parser.add_argument("-f", - "--feature-matrix-csv-file-path", - type=str, - required=True, - help="File name to Generate a .csv file") + + parser = argparse.ArgumentParser( + description=( + "Make ModularKernel feature matrix \n" + "Example : python3 -m tests.kernels.moe.modular_kernel_tools.make_feature_matrix " # noqa: E501 + "-f ./feature_matrices/feature_matrix.csv" + ) + ) + + parser.add_argument( + "-f", + "--feature-matrix-csv-file-path", + type=str, + required=True, + help="File name to Generate a .csv file", + ) args = parser.parse_args() csv_path = args.feature_matrix_csv_file_path - assert csv_path.endswith( - 'csv'), f"Need a file path ending with .csv, got {csv_path}" - assert Path(csv_path).parent.is_dir( - ), f"Cannot find parent directory for {Path(csv_path).parent}" + assert csv_path.endswith("csv"), ( + f"Need a file path ending with .csv, got {csv_path}" + ) + assert Path(csv_path).parent.is_dir(), ( + f"Cannot find parent directory for {Path(csv_path).parent}" + ) make_feature_matrix(args.feature_matrix_csv_file_path) diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index 73214066f7ea..2ae28baac6f8 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -5,43 +5,54 @@ # Fused experts and PrepareFinalize imports from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - BatchedDeepGemmExperts) + BatchedDeepGemmExperts, +) from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 - BatchedTritonOrDeepGemmExperts) + BatchedTritonOrDeepGemmExperts, +) from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts, NaiveBatchedExperts) + BatchedTritonExperts, + NaiveBatchedExperts, +) from vllm.model_executor.layers.fused_moe.layer import TritonExperts from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP) + 
MoEPrepareAndFinalizeNoEP, +) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( - TritonOrDeepGemmExperts) + TritonOrDeepGemmExperts, +) from vllm.utils import has_deep_ep, has_pplx if has_deep_ep(): from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 - DeepEPHTPrepareAndFinalize) + DeepEPHTPrepareAndFinalize, + ) from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 - DeepEPLLPrepareAndFinalize) + DeepEPLLPrepareAndFinalize, + ) if has_pplx(): from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( - PplxPrepareAndFinalize) + PplxPrepareAndFinalize, + ) MK_MULTI_GPU_PREPARE_FINALIZE_TYPES = [] if has_pplx(): MK_MULTI_GPU_PREPARE_FINALIZE_TYPES += [PplxPrepareAndFinalize] if has_deep_ep(): MK_MULTI_GPU_PREPARE_FINALIZE_TYPES += [ - DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize + DeepEPHTPrepareAndFinalize, + DeepEPLLPrepareAndFinalize, ] MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES = [MoEPrepareAndFinalizeNoEP] -MK_ALL_PREPARE_FINALIZE_TYPES = (MK_MULTI_GPU_PREPARE_FINALIZE_TYPES + - MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES) +MK_ALL_PREPARE_FINALIZE_TYPES = ( + MK_MULTI_GPU_PREPARE_FINALIZE_TYPES + MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES +) MK_FUSED_EXPERT_TYPES = [ BatchedDeepGemmExperts, @@ -57,30 +68,40 @@ MK_QUANT_CONFIGS = [ None, # per-channel / per-column weights and per-tensor activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=True, - per_act_token_quant=False, - block_shape=None), + FusedMoEQuantConfig( + quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=True, + per_act_token_quant=False, + block_shape=None, + ), # per-channel / per-column weights and per-token activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=True, - per_act_token_quant=True, - block_shape=None), + FusedMoEQuantConfig( + quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=True, + per_act_token_quant=True, + block_shape=None, + ), # per-tensor weights and per-tensor activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=False, - per_act_token_quant=False, - block_shape=None), + FusedMoEQuantConfig( + quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=False, + per_act_token_quant=False, + block_shape=None, + ), # per-tensor weights and per-token activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=False, - per_act_token_quant=True, - block_shape=None), + FusedMoEQuantConfig( + quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=False, + per_act_token_quant=True, + block_shape=None, + ), # block-quantized weights and 128 block per-token activations - FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn, - per_out_ch_quant=False, - per_act_token_quant=False, - block_shape=[128, 128]), + FusedMoEQuantConfig( + quant_dtype=torch.float8_e4m3fn, + per_out_ch_quant=False, + per_act_token_quant=False, + block_shape=[128, 128], + ), # TODO (varun) : Should we test the following combinations ? 
# block-quantized weights and per-token activations # block-quantized weights and per-tensor activations diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index 1f8d21a7a702..e05de1aa0231 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -6,13 +6,11 @@ from typing import Any, Callable, Optional import torch -from torch.multiprocessing import ( - spawn) # pyright: ignore[reportPrivateImportUsage] +from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage] from typing_extensions import Concatenate, ParamSpec from vllm.config import VllmConfig, set_current_vllm_config -from vllm.distributed import (init_distributed_environment, - initialize_model_parallel) +from vllm.distributed import init_distributed_environment, initialize_model_parallel from vllm.utils import get_open_port ## Parallel Processes Utils @@ -30,10 +28,11 @@ class ProcessGroupInfo: device: torch.device -def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int, - local_rank: int): - +def _set_vllm_config( + vllm_config: VllmConfig, world_size: int, rank: int, local_rank: int +): import tempfile + temp_file = tempfile.mkstemp()[1] set_current_vllm_config(vllm_config) @@ -47,13 +46,10 @@ def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int, ) initialize_model_parallel( - tensor_model_parallel_size=vllm_config.parallel_config. - tensor_parallel_size, - pipeline_model_parallel_size=vllm_config.parallel_config. - pipeline_parallel_size, + tensor_model_parallel_size=vllm_config.parallel_config.tensor_parallel_size, + pipeline_model_parallel_size=vllm_config.parallel_config.pipeline_parallel_size, ) - cpu_group = torch.distributed.new_group(list(range(world_size)), - backend="gloo") + cpu_group = torch.distributed.new_group(list(range(world_size)), backend="gloo") return cpu_group @@ -63,8 +59,7 @@ def _worker_parallel_launch( world_local_size: int, node_rank: int, init_method: str, - worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any, - P], None], + worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any, P], None], vllm_config: Optional[VllmConfig], env_dict: Optional[dict], *args: P.args, @@ -132,7 +127,8 @@ def parallel_launch_with_config( worker, vllm_config, env_dict, - ) + args, + ) + + args, nprocs=world_size, join=True, ) diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index dd16ffb2eabe..0de1762f6425 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -14,28 +14,31 @@ from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config -def do_profile(fn: Callable, - fn_kwargs: dict[Any, Any], - pgi: ProcessGroupInfo, - config: Config, - num_warmups: int = 5): +def do_profile( + fn: Callable, + fn_kwargs: dict[Any, Any], + pgi: ProcessGroupInfo, + config: Config, + num_warmups: int = 5, +): for _ in range(num_warmups): fn(**fn_kwargs) with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - with_stack=True, - record_shapes=True, + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + with_stack=True, + record_shapes=True, ) as tprof: fn(**fn_kwargs) 
torch.cuda.synchronize(torch.cuda.current_device()) # TODO (varun): Add a descriptive trace file name tprof.export_chrome_trace( - f"{config.torch_trace_dir_path}/m{config.M}_{pgi.rank}_trace.json") + f"{config.torch_trace_dir_path}/m{config.M}_{pgi.rank}_trace.json" + ) def profile_modular_kernel( @@ -82,8 +85,9 @@ def rank_worker( # sanity check from vllm import envs + if config.fused_moe_chunk_size is not None: - assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE) + assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE # get weights to this device weights.to_current_device() @@ -108,20 +112,25 @@ def rank_worker( def run(config: Config): weights: WeightTensors = WeightTensors.make(config) vllm_config, env_dict = config.make_env_data() - parallel_launch_with_config(config.world_size, rank_worker, vllm_config, - env_dict, config, weights) + parallel_launch_with_config( + config.world_size, rank_worker, vllm_config, env_dict, config, weights + ) -if __name__ == '__main__': +if __name__ == "__main__": from .cli_args import make_config, make_config_arg_parser - parser = make_config_arg_parser(description=( - "Run single prepare-finalize & fused-experts combination test" - "Example : python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel " #noqa: E501 - "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts" - )) + + parser = make_config_arg_parser( + description=( + "Run single prepare-finalize & fused-experts combination test" + "Example : python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel " # noqa: E501 + "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts" + ) + ) args = parser.parse_args() assert args.torch_trace_dir_path is not None, ( - "Please pass in a directory to store torch traces") + "Please pass in a directory to store torch traces" + ) config = make_config(args) run(config) diff --git a/tests/kernels/moe/modular_kernel_tools/utils.py b/tests/kernels/moe/modular_kernel_tools/utils.py index 09bb4a34f318..2a465fd9b4c3 100644 --- a/tests/kernels/moe/modular_kernel_tools/utils.py +++ b/tests/kernels/moe/modular_kernel_tools/utils.py @@ -8,12 +8,12 @@ def per_token_cast_to_fp8( - x: torch.Tensor, block_size: int) -> tuple[torch.Tensor, torch.Tensor]: + x: torch.Tensor, block_size: int +) -> tuple[torch.Tensor, torch.Tensor]: assert x.dim() == 2 m, n = x.shape pad_size = (block_size - (n % block_size)) % block_size - x = torch.nn.functional.pad(x, - (0, pad_size), value=0) if pad_size > 0 else x + x = torch.nn.functional.pad(x, (0, pad_size), value=0) if pad_size > 0 else x x_view = x.view(m, -1, block_size) x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn) @@ -21,8 +21,8 @@ def per_token_cast_to_fp8( def per_block_cast_to_fp8( - x: torch.Tensor, block_size_k: int, - block_size_n: int) -> tuple[torch.Tensor, torch.Tensor]: + x: torch.Tensor, block_size_k: int, block_size_n: int +) -> tuple[torch.Tensor, torch.Tensor]: assert x.dim() == 2 m, n = x.shape x_padded = torch.zeros( @@ -34,8 +34,9 @@ def per_block_cast_to_fp8( device=x.device, ) x_padded[:m, :n] = x - x_view = x_padded.view(-1, block_size_k, - x_padded.size(1) // block_size_k, block_size_n) + x_view = x_padded.view( + -1, block_size_k, x_padded.size(1) // block_size_k, block_size_n + ) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) 
x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() @@ -86,24 +87,23 @@ def make_block_quant_fp8_weights( w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn, device=device) w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn, device=device) - w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1), - device=device, - dtype=torch.float32) - w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2), - device=device, - dtype=torch.float32) + w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1), device=device, dtype=torch.float32) + w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2), device=device, dtype=torch.float32) - assert w1_s.shape == (e, (2 * n + (block_n - 1)) // block_n, - (k + (block_k - 1)) // block_k) + assert w1_s.shape == ( + e, + (2 * n + (block_n - 1)) // block_n, + (k + (block_k - 1)) // block_k, + ) assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2] for i in range(e): - w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i], - block_size_k=block_k, - block_size_n=block_n) - w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i], - block_size_k=block_k, - block_size_n=block_n) + w1[i], w1_s[i] = per_block_cast_to_fp8( + w1_bf16[i], block_size_k=block_k, block_size_n=block_n + ) + w2[i], w2_s[i] = per_block_cast_to_fp8( + w2_bf16[i], block_size_k=block_k, block_size_n=block_n + ) return w1, w2, w1_s, w2_s @@ -127,16 +127,14 @@ def make_quant_fp8_weights( n_b_scales = 2 * n if per_out_channel_quant else 1 k_b_scales = k if per_out_channel_quant else 1 - w1_scale = torch.empty((e, n_b_scales, 1), - device="cuda", - dtype=torch.float32) - w2_scale = torch.empty((e, k_b_scales, 1), - device="cuda", - dtype=torch.float32) + w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32) for expert in range(e): w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( - w1[expert], use_per_token_if_dynamic=per_out_channel_quant) + w1[expert], use_per_token_if_dynamic=per_out_channel_quant + ) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( - w2[expert], use_per_token_if_dynamic=per_out_channel_quant) + w2[expert], use_per_token_if_dynamic=per_out_channel_quant + ) return w1_q, w2_q, w1_scale, w2_scale diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py index 1ad361ae0733..9d087ad13b82 100644 --- a/tests/kernels/moe/parallel_utils.py +++ b/tests/kernels/moe/parallel_utils.py @@ -3,6 +3,7 @@ """ DeepEP test utilities """ + import dataclasses import os import traceback @@ -10,17 +11,18 @@ import torch from torch.distributed import ProcessGroup -from torch.multiprocessing import ( - spawn) # pyright: ignore[reportPrivateImportUsage] +from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage] from typing_extensions import Concatenate, ParamSpec from vllm.utils import get_open_port, has_deep_ep if has_deep_ep(): from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 - DeepEPHTPrepareAndFinalize) + DeepEPHTPrepareAndFinalize, + ) from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 - DeepEPLLPrepareAndFinalize) + DeepEPLLPrepareAndFinalize, + ) ## Parallel Processes Utils @@ -96,7 +98,8 @@ def parallel_launch( 0, f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{get_open_port()}", worker, - ) + args, + ) + + args, nprocs=world_size, join=True, ) @@ -118,48 +121,57 @@ class DeepEPLLArgs: use_fp8_dispatch: bool -def make_deepep_ht_a2a(pg: ProcessGroup, - pgi: 
ProcessGroupInfo, - dp_size: int, - ht_args: DeepEPHTArgs, - q_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None): - +def make_deepep_ht_a2a( + pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + ht_args: DeepEPHTArgs, + q_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None, +): import deep_ep # high throughput a2a num_nvl_bytes = 1024 * 1024 * 1024 # 1GB num_rdma_bytes, low_latency_mode, num_qps_per_rank = 0, False, 1 - buffer = deep_ep.Buffer(group=pg, - num_nvl_bytes=num_nvl_bytes, - num_rdma_bytes=num_rdma_bytes, - low_latency_mode=low_latency_mode, - num_qps_per_rank=num_qps_per_rank) - return DeepEPHTPrepareAndFinalize(buffer=buffer, - num_dispatchers=pgi.world_size, - dp_size=dp_size, - rank_expert_offset=pgi.rank * - ht_args.num_local_experts) - - -def make_deepep_ll_a2a(pg: ProcessGroup, - pgi: ProcessGroupInfo, - deepep_ll_args: DeepEPLLArgs, - q_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None): + buffer = deep_ep.Buffer( + group=pg, + num_nvl_bytes=num_nvl_bytes, + num_rdma_bytes=num_rdma_bytes, + low_latency_mode=low_latency_mode, + num_qps_per_rank=num_qps_per_rank, + ) + return DeepEPHTPrepareAndFinalize( + buffer=buffer, + num_dispatchers=pgi.world_size, + dp_size=dp_size, + rank_expert_offset=pgi.rank * ht_args.num_local_experts, + ) + +def make_deepep_ll_a2a( + pg: ProcessGroup, + pgi: ProcessGroupInfo, + deepep_ll_args: DeepEPLLArgs, + q_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None, +): import deep_ep # low-latency a2a num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( - deepep_ll_args.max_tokens_per_rank, deepep_ll_args.hidden_size, - pgi.world_size, deepep_ll_args.num_experts) + deepep_ll_args.max_tokens_per_rank, + deepep_ll_args.hidden_size, + pgi.world_size, + deepep_ll_args.num_experts, + ) - buffer = deep_ep.Buffer(group=pg, - num_rdma_bytes=num_rdma_bytes, - low_latency_mode=True, - num_qps_per_rank=deepep_ll_args.num_experts // - pgi.world_size) + buffer = deep_ep.Buffer( + group=pg, + num_rdma_bytes=num_rdma_bytes, + low_latency_mode=True, + num_qps_per_rank=deepep_ll_args.num_experts // pgi.world_size, + ) return DeepEPLLPrepareAndFinalize( buffer=buffer, @@ -169,17 +181,20 @@ def make_deepep_ll_a2a(pg: ProcessGroup, ) -def make_deepep_a2a(pg: ProcessGroup, - pgi: ProcessGroupInfo, - dp_size: int, - deepep_ht_args: Optional[DeepEPHTArgs], - deepep_ll_args: Optional[DeepEPLLArgs], - q_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None): +def make_deepep_a2a( + pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + deepep_ht_args: Optional[DeepEPHTArgs], + deepep_ll_args: Optional[DeepEPLLArgs], + q_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None, +): if deepep_ht_args is not None: assert deepep_ll_args is None - return make_deepep_ht_a2a(pg, pgi, dp_size, deepep_ht_args, q_dtype, - block_shape) + return make_deepep_ht_a2a( + pg, pgi, dp_size, deepep_ht_args, q_dtype, block_shape + ) assert deepep_ll_args is not None return make_deepep_ll_a2a(pg, pgi, deepep_ll_args, q_dtype, block_shape) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 69317405d48b..c62de471a603 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -7,14 +7,18 @@ import pytest import torch -from tests.kernels.moe.utils import (batched_moe, - make_quantized_test_activations, - make_test_weights, 
naive_batched_moe) +from tests.kernels.moe.utils import ( + batched_moe, + make_quantized_test_activations, + make_test_weights, + naive_batched_moe, +) from tests.kernels.quant_utils import native_batched_masked_quant_matmul from tests.kernels.utils import torch_experts from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - invoke_moe_batched_triton_kernel) + invoke_moe_batched_triton_kernel, +) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.platforms import current_platform from vllm.triton_utils import tl @@ -68,41 +72,54 @@ class BatchedMMTensors: @staticmethod def make_tensors(config: BatchedMMConfig): - A = torch.randn( - (config.num_experts, config.max_tokens_per_expert, config.K), + A = ( + torch.randn( + (config.num_experts, config.max_tokens_per_expert, config.K), + device="cuda", + dtype=config.in_dtype, + ) + / 10 + ) + B = torch.randn( + (config.num_experts, config.N, config.K), device="cuda", - dtype=config.in_dtype) / 10 - B = torch.randn((config.num_experts, config.N, config.K), - device="cuda", - dtype=config.in_dtype) + dtype=config.in_dtype, + ) C = torch.zeros( (config.num_experts, config.max_tokens_per_expert, config.N), device="cuda", - dtype=config.out_dtype) + dtype=config.out_dtype, + ) - num_expert_tokens = torch.randint(low=0, - high=config.max_tokens_per_expert, - size=(config.num_experts, ), - device="cuda", - dtype=torch.int32) + num_expert_tokens = torch.randint( + low=0, + high=config.max_tokens_per_expert, + size=(config.num_experts,), + device="cuda", + dtype=torch.int32, + ) return BatchedMMTensors(A, B, C, num_expert_tokens) @pytest.mark.parametrize("num_experts", [8, 16, 32]) -@pytest.mark.parametrize("max_tokens_per_expert", - [32, 64, 128, 192, 224, 256, 512]) +@pytest.mark.parametrize("max_tokens_per_expert", [32, 64, 128, 192, 224, 256, 512]) @pytest.mark.parametrize("K", [128, 256, 1024]) @pytest.mark.parametrize("N", [128, 256, 1024]) @pytest.mark.parametrize( - "dtype", - [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16]) + "dtype", [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16] +) @pytest.mark.parametrize("block_shape", [None, [128, 128]]) @pytest.mark.parametrize("per_act_token_quant", [False, True]) -def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, - N: int, dtype: torch.dtype, - block_shape: Optional[list[int]], - per_act_token_quant: bool): +def test_batched_mm( + num_experts: int, + max_tokens_per_expert: int, + K: int, + N: int, + dtype: torch.dtype, + block_shape: Optional[list[int]], + per_act_token_quant: bool, +): current_platform.seed_everything(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn @@ -120,11 +137,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, act_dtype = dtype quant_dtype = None - num_expert_tokens = torch.randint(low=0, - high=max_tokens_per_expert, - size=(num_experts, ), - device="cuda", - dtype=torch.int32) + num_expert_tokens = torch.randint( + low=0, + high=max_tokens_per_expert, + size=(num_experts,), + device="cuda", + dtype=torch.int32, + ) A, A_q, A_scale = make_quantized_test_activations( num_experts, @@ -154,7 +173,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, compute_tl_dtype = { torch.float16: tl.float16, torch.bfloat16: tl.bfloat16, - torch.float32: tl.float32 + torch.float32: tl.float32, }[test_output.dtype] assert A_q.dtype == B_q.dtype @@ -176,7 +195,7 @@ def 
test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, config={ "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32 + "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32, }, per_act_token_quant=per_act_token_quant, block_shape=block_shape, @@ -189,11 +208,16 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, num_expert_tokens, ) - q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output, - num_expert_tokens, - A_scale, B_scale, - block_shape, - per_act_token_quant) + q_ref_output = native_batched_masked_quant_matmul( + A_q, + B_q, + q_ref_output, + num_expert_tokens, + A_scale, + B_scale, + block_shape, + per_act_token_quant, + ) rtol, atol = { torch.float16: (6e-2, 6e-2), @@ -311,12 +335,6 @@ def test_fused_moe_batched_experts( block_shape=block_shape, ) - torch.testing.assert_close(batched_output, - baseline_output, - atol=3e-2, - rtol=2e-2) + torch.testing.assert_close(batched_output, baseline_output, atol=3e-2, rtol=2e-2) - torch.testing.assert_close(triton_output, - batched_output, - atol=2e-2, - rtol=2e-2) + torch.testing.assert_close(triton_output, batched_output, atol=2e-2, rtol=2e-2) diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 7dc6282326b6..9143103c1839 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -5,15 +5,21 @@ import torch from tests.kernels.moe.utils import make_test_weights -from tests.kernels.quant_utils import (native_per_token_group_quant_fp8, - native_w8a8_block_matmul) +from tests.kernels.quant_utils import ( + native_per_token_group_quant_fp8, + native_w8a8_block_matmul, +) from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( - _valid_deep_gemm_shape, deep_gemm_moe_fp8) + _valid_deep_gemm_shape, + deep_gemm_moe_fp8, +) from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_topk, modular_triton_fused_moe) + fused_topk, + modular_triton_fused_moe, +) from vllm.platforms import current_platform from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used @@ -24,8 +30,7 @@ from deep_gemm import get_m_alignment_for_contiguous_layout if current_platform.get_device_capability() < (9, 0): - pytest.skip("FP8 Triton requires CUDA 9.0 or higher", - allow_module_level=True) + pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) vllm_config = VllmConfig() vllm_config.scheduler_config.max_num_seqs = 128 @@ -97,8 +102,7 @@ SEEDS = [0] -def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, topk_weight, topk_ids, - block_shape): +def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, topk_weight, topk_ids, block_shape): """Fused moe with block-wise quantization using native torch.""" B, D = a.shape topk = topk_ids.size(1) @@ -114,23 +118,17 @@ def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, topk_weight, topk_ids, for i in range(w1.shape[0]): mask = topk_ids == i if mask.sum(): - inter_out = native_w8a8_block_matmul(a_q[mask], - w1[i], - a_s[mask], - w1_s[i], - block_shape, - output_dtype=a.dtype) + inter_out = native_w8a8_block_matmul( + a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype + ) act_out = SiluAndMul().forward_native(inter_out) - act_out_q, act_out_s = native_per_token_group_quant_fp8( - act_out, 
block_k) - out[mask] = native_w8a8_block_matmul(act_out_q, - w2[i], - act_out_s, - w2_s[i], - block_shape, - output_dtype=a.dtype) - return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + act_out_q, act_out_s = native_per_token_group_quant_fp8(act_out, block_k) + out[mask] = native_w8a8_block_matmul( + act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype + ) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) # Skip all tests if CUDA is not available @@ -149,8 +147,9 @@ def setup_cuda(): @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @torch.inference_mode() -def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, - monkeypatch): +def test_w8a8_block_fp8_fused_moe( + M, N, K, E, topk, block_size, dtype, seed, monkeypatch +): if topk > E: pytest.skip(f"Skipping test; topk={topk} > E={E}") @@ -161,20 +160,24 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - _, w1, w1_s, _, w2, w2_s = make_test_weights(E, - N, - K, - dtype, - torch.float8_e4m3fn, - per_act_token_quant=False, - block_shape=block_size) - - m_fused_moe = modular_triton_fused_moe(use_fp8_w8a8=True, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - per_act_token_quant=False, - block_shape=block_size) + _, w1, w1_s, _, w2, w2_s = make_test_weights( + E, + N, + K, + dtype, + torch.float8_e4m3fn, + per_act_token_quant=False, + block_shape=block_size, + ) + + m_fused_moe = modular_triton_fused_moe( + use_fp8_w8a8=True, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + per_act_token_quant=False, + block_shape=block_size, + ) topk_weights, topk_ids, _ = fused_topk(a, score.float(), topk, False) @@ -226,8 +229,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") @pytest.mark.skipif(is_blackwell_deep_gemm_used(), reason="Not E8M0 scale MOE") @torch.inference_mode() -def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, - monkeypatch): +def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch): if topk > E: pytest.skip(f"Skipping test: topk={topk} > E={E}") @@ -246,49 +248,53 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - _, w1, w1_s, _, w2, w2_s = make_test_weights(E, - N, - K, - dtype, - torch.float8_e4m3fn, - per_act_token_quant=False, - block_shape=block_size) + _, w1, w1_s, _, w2, w2_s = make_test_weights( + E, + N, + K, + dtype, + torch.float8_e4m3fn, + per_act_token_quant=False, + block_shape=block_size, + ) # Note: for now use_compile will error out if the problem size is # large enough to trigger chunking. I'm leaving the flag and # setup code in case we are able to revisit this later. use_compile = False - use_cudagraph = (chunk_size < M and N >= 1024 and K >= 1024 - and current_platform.is_cuda_alike()) + use_cudagraph = ( + chunk_size < M and N >= 1024 and K >= 1024 and current_platform.is_cuda_alike() + ) topk_weights, topk_ids, _ = fused_topk(a, score.float(), topk, False) # Set the context to avoid lots of warning spam. 
with set_current_vllm_config(vllm_config): - ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, topk_weights, - topk_ids, block_size) + ref_out = torch_w8a8_block_fp8_moe( + a, w1, w2, w1_s, w2_s, topk_weights, topk_ids, block_size + ) if use_compile: - deep_gemm_moe_fp8_fn = torch.compile(deep_gemm_moe_fp8, - backend="inductor", - fullgraph=True) + deep_gemm_moe_fp8_fn = torch.compile( + deep_gemm_moe_fp8, backend="inductor", fullgraph=True + ) torch._dynamo.mark_dynamic(a, 0) torch._dynamo.mark_dynamic(topk_weights, 0) torch._dynamo.mark_dynamic(topk_ids, 0) else: deep_gemm_moe_fp8_fn = deep_gemm_moe_fp8 - out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights, - topk_ids) + out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids) if use_cudagraph: out.fill_(0) stream = torch.cuda.Stream() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=stream): - out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights, - topk_ids) + out = deep_gemm_moe_fp8_fn( + a, w1, w2, w1_s, w2_s, topk_weights, topk_ids + ) torch.cuda.synchronize() graph.replay() torch.cuda.synchronize() diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py index 8e680c722935..078d110b1925 100644 --- a/tests/kernels/moe/test_block_int8.py +++ b/tests/kernels/moe/test_block_int8.py @@ -5,16 +5,17 @@ import torch from tests.kernels.moe.utils import make_test_weights -from tests.kernels.quant_utils import (native_per_token_group_quant_int8, - native_w8a8_block_matmul) +from tests.kernels.quant_utils import ( + native_per_token_group_quant_int8, + native_w8a8_block_matmul, +) from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.platforms import current_platform if current_platform.get_device_capability() < (7, 0): - pytest.skip("INT8 Triton requires CUDA 7.0 or higher", - allow_module_level=True) + pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) vllm_config = VllmConfig() vllm_config.scheduler_config.max_num_seqs = 128 @@ -77,24 +78,18 @@ def torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape): for i in range(w1.shape[0]): mask = topk_ids == i if mask.sum(): - inter_out = native_w8a8_block_matmul(a_q[mask], - w1[i], - a_s[mask], - w1_s[i], - block_shape, - output_dtype=a.dtype) + inter_out = native_w8a8_block_matmul( + a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype + ) act_out = SiluAndMul().forward_native(inter_out) - act_out_q, act_out_s = native_per_token_group_quant_int8( - act_out, block_k) + act_out_q, act_out_s = native_per_token_group_quant_int8(act_out, block_k) act_out = act_out.to(torch.float32) - out[mask] = native_w8a8_block_matmul(act_out_q, - w2[i], - act_out_s, - w2_s[i], - block_shape, - output_dtype=a.dtype) - return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + out[mask] = native_w8a8_block_matmul( + act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype + ) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) @pytest.fixture(autouse=True, scope="module") @@ -118,13 +113,9 @@ def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - _, w1, w1_s, _, w2, w2_s = make_test_weights(E, - N, - K, - dtype, - 
torch.int8, - per_act_token_quant=False, - block_shape=block_size) + _, w1, w1_s, _, w2, w2_s = make_test_weights( + E, N, K, dtype, torch.int8, per_act_token_quant=False, block_shape=block_size + ) # Set the context to avoid lots of warning spam. with set_current_vllm_config(vllm_config): @@ -140,8 +131,9 @@ def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): w2_scale=w2_s, block_shape=block_size, ) - ref_out = torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, - block_size) + ref_out = torch_w8a8_block_int8_moe( + a, w1, w2, w1_s, w2_s, score, topk, block_size + ) # Check results torch.testing.assert_close(out, ref_out, atol=0.065, rtol=0.065) diff --git a/tests/kernels/moe/test_count_expert_num_tokens.py b/tests/kernels/moe/test_count_expert_num_tokens.py index 0872836b6064..cf648dc36d61 100644 --- a/tests/kernels/moe/test_count_expert_num_tokens.py +++ b/tests/kernels/moe/test_count_expert_num_tokens.py @@ -15,7 +15,6 @@ @dataclasses.dataclass class TestTensors: - topk_ids: torch.Tensor expert_map: Optional[torch.Tensor] = None @@ -25,32 +24,31 @@ def to_device(self, device: str): self.expert_map = self.expert_map.to(device=device) @staticmethod - def make(num_tokens: int, num_topk: int, num_experts: int, device: str, - topk_ids_dtype: torch.dtype) -> "TestTensors": - + def make( + num_tokens: int, + num_topk: int, + num_experts: int, + device: str, + topk_ids_dtype: torch.dtype, + ) -> "TestTensors": # make topk ids - topk_ids = torch.empty((num_tokens, num_topk), - device=device, - dtype=torch.int64) + topk_ids = torch.empty((num_tokens, num_topk), device=device, dtype=torch.int64) for x in range(num_tokens): topk_ids[x] = torch.randperm(num_experts)[:num_topk] topk_ids = topk_ids.to(dtype=torch.int64) return TestTensors(topk_ids=topk_ids) - def with_ep_rank(self, ep_rank: int, num_global_experts: int, - num_local_experts: int, device: str): + def with_ep_rank( + self, ep_rank: int, num_global_experts: int, num_local_experts: int, device: str + ): # make an expert map - expert_map = torch.empty((num_global_experts), - device=device, - dtype=torch.int32) + expert_map = torch.empty((num_global_experts), device=device, dtype=torch.int32) expert_map.fill_(-1) s = ep_rank * num_local_experts e = s + num_local_experts - expert_map[s:e] = torch.tensor(list(range(num_local_experts)), - device=device) + expert_map[s:e] = torch.tensor(list(range(num_local_experts)), device=device) - return TestTensors(topk_ids=self.topk_ids.clone(), - expert_map=expert_map) + return TestTensors(topk_ids=self.topk_ids.clone(), expert_map=expert_map) def ref_impl(tt: TestTensors, expert_num_tokens: torch.Tensor): @@ -68,73 +66,81 @@ def ref_impl(tt: TestTensors, expert_num_tokens: torch.Tensor): expert_num_tokens[eid] += count -def do_test_compute_expert_num_tokens(num_tokens: int, num_topk: int, - num_experts: int, ep_size: int, - topk_ids_dtype: torch.dtype): - +def do_test_compute_expert_num_tokens( + num_tokens: int, + num_topk: int, + num_experts: int, + ep_size: int, + topk_ids_dtype: torch.dtype, +): assert num_topk <= num_experts - tt = TestTensors.make(num_tokens, - num_topk, - num_experts, - topk_ids_dtype=topk_ids_dtype, - device="cpu") + tt = TestTensors.make( + num_tokens, num_topk, num_experts, topk_ids_dtype=topk_ids_dtype, device="cpu" + ) num_global_experts = num_experts assert num_global_experts % ep_size == 0 num_local_experts = num_global_experts // ep_size for ep_rank in range(ep_size): - tt_rank = tt.with_ep_rank(ep_rank, num_global_experts, - 
num_local_experts, "cpu") + tt_rank = tt.with_ep_rank(ep_rank, num_global_experts, num_local_experts, "cpu") - ref_expert_num_tokens = torch.zeros((num_local_experts), - device="cpu", - dtype=torch.int32) + ref_expert_num_tokens = torch.zeros( + (num_local_experts), device="cpu", dtype=torch.int32 + ) ref_impl(tt_rank, ref_expert_num_tokens) ref_expert_num_tokens = ref_expert_num_tokens.to("cuda") tt_rank.to_device("cuda") # Test with expert_map triton_expert_num_tokens_w_emap = count_expert_num_tokens( - tt_rank.topk_ids, num_local_experts, tt_rank.expert_map) + tt_rank.topk_ids, num_local_experts, tt_rank.expert_map + ) # Test without expert map topk_ids = tt_rank.expert_map[tt_rank.topk_ids].to(topk_ids_dtype) triton_expert_num_tokens_wo_emap = count_expert_num_tokens( - topk_ids, num_local_experts, expert_map=None) + topk_ids, num_local_experts, expert_map=None + ) - torch.testing.assert_close(ref_expert_num_tokens, - triton_expert_num_tokens_w_emap, - atol=0, - rtol=0) - torch.testing.assert_close(ref_expert_num_tokens, - triton_expert_num_tokens_wo_emap, - atol=0, - rtol=0) + torch.testing.assert_close( + ref_expert_num_tokens, triton_expert_num_tokens_w_emap, atol=0, rtol=0 + ) + torch.testing.assert_close( + ref_expert_num_tokens, triton_expert_num_tokens_wo_emap, atol=0, rtol=0 + ) @pytest.mark.parametrize( - "num_tokens", [1, 4, 8, 11, 19, 128, 127, 405, 1024, 3333, 6666, 7317]) + "num_tokens", [1, 4, 8, 11, 19, 128, 127, 405, 1024, 3333, 6666, 7317] +) @pytest.mark.parametrize("num_topk", [2, 6, 8]) @pytest.mark.parametrize("num_experts", [64]) @pytest.mark.parametrize("ep_size", [1, 2, 4]) @pytest.mark.parametrize("topk_ids_dtype", [torch.int64]) -def test_compute_expert_num_tokens(num_tokens: int, num_topk: int, - num_experts: int, ep_size: int, - topk_ids_dtype: torch.dtype): - do_test_compute_expert_num_tokens(num_tokens, num_topk, num_experts, - ep_size, topk_ids_dtype) +def test_compute_expert_num_tokens( + num_tokens: int, + num_topk: int, + num_experts: int, + ep_size: int, + topk_ids_dtype: torch.dtype, +): + do_test_compute_expert_num_tokens( + num_tokens, num_topk, num_experts, ep_size, topk_ids_dtype + ) @pytest.mark.parametrize("numel", list(range(1, 8192, 11))) @pytest.mark.parametrize("num_experts", [32]) @pytest.mark.parametrize("ep_size", [2]) @pytest.mark.parametrize("topk_ids_dtype", [torch.int64]) -def test_compute_expert_num_tokens_from_numel(numel: int, num_experts: int, - ep_size: int, - topk_ids_dtype: torch.dtype): - do_test_compute_expert_num_tokens(num_tokens=numel, - num_topk=1, - num_experts=num_experts, - ep_size=ep_size, - topk_ids_dtype=topk_ids_dtype) +def test_compute_expert_num_tokens_from_numel( + numel: int, num_experts: int, ep_size: int, topk_ids_dtype: torch.dtype +): + do_test_compute_expert_num_tokens( + num_tokens=numel, + num_topk=1, + num_experts=num_experts, + ep_size=ep_size, + topk_ids_dtype=topk_ids_dtype, + ) diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py index 67984fe7319a..ba9f7edc0e45 100644 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py @@ -18,48 +18,50 @@ def cdiv(a, b): return (a + b - 1) // b -def per_token_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: +def per_token_cast_to_fp8(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: assert x.dim() == 2 m, n = x.shape pad_size = (128 - (n % 128)) % 128 - x = torch.nn.functional.pad(x, - (0, pad_size), value=0) if pad_size > 0 else 
x + x = torch.nn.functional.pad(x, (0, pad_size), value=0) if pad_size > 0 else x x_view = x.view(m, -1, 128) x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - fp8_data = (x_view * - (448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn) + fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn) return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) -def per_block_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: +def per_block_cast_to_fp8(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: assert x.dim() == 2 m, n = x.shape - x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128), - device=x.device, - dtype=x.dtype) + x_padded = torch.zeros( + (cdiv(m, 128) * 128, cdiv(n, 128) * 128), device=x.device, dtype=x.dtype + ) x_padded[:m, :n] = x x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - x_amax / 448.0).view(x_view.size(0), x_view.size(2)) - - -@pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [ - (4, 8192, 7168, 4096), - (4, 8192, 2048, 7168), - (8, 4096, 7168, 4096), - (8, 4096, 2048, 7168), - (32, 1024, 7168, 4096), - (32, 1024, 2048, 7168), -]) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view( + x_view.size(0), x_view.size(2) + ) + + +@pytest.mark.parametrize( + "num_groups, expected_m_per_group, k, n", + [ + (4, 8192, 7168, 4096), + (4, 8192, 2048, 7168), + (8, 4096, 7168, 4096), + (8, 4096, 2048, 7168), + (32, 1024, 7168, 4096), + (32, 1024, 2048, 7168), + ], +) @pytest.mark.parametrize("out_dtype", [torch.float16]) @pytest.mark.skipif( (lambda x: x is None or x.to_int() != 100)( - current_platform.get_device_capability()), - reason="Block Scaled Grouped GEMM is only supported on SM100.") + current_platform.get_device_capability() + ), + reason="Block Scaled Grouped GEMM is only supported on SM100.", +) def test_cutlass_grouped_gemm( num_groups: int, expected_m_per_group: int, @@ -70,8 +72,7 @@ def test_cutlass_grouped_gemm( device = "cuda" alignment = 128 group_ms = [ - int(expected_m_per_group * random.uniform(0.7, 1.3)) - for _ in range(num_groups) + int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups) ] m = sum([cdiv(m, alignment) * alignment for m in group_ms]) @@ -88,20 +89,22 @@ def test_cutlass_grouped_gemm( expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32) x_fp8 = per_token_cast_to_fp8(x) - y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn), - torch.empty((num_groups, cdiv(n, 128), k // 128), - device=device, - dtype=torch.float)) + y_fp8 = ( + torch.empty_like(y, dtype=torch.float8_e4m3fn), + torch.empty( + (num_groups, cdiv(n, 128), k // 128), device=device, dtype=torch.float + ), + ) for i in range(num_groups): y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i]) for i in range(num_groups): - a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]] - a_scale = x_fp8[1][ep_offset[i]:ep_offset[i + 1]] + a = x_fp8[0][ep_offset[i] : ep_offset[i + 1]] + a_scale = x_fp8[1][ep_offset[i] : ep_offset[i + 1]] b = y_fp8[0][i].t() b_scale = y_fp8[1][i].t() baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype) - ref_out[ep_offset[i]:ep_offset[i + 1]] = baseline + ref_out[ep_offset[i] : ep_offset[i + 1]] = baseline ops.cutlass_blockwise_scaled_grouped_mm( out, diff 
--git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 5fb49c2da4fe..d61a66723c47 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -10,11 +10,11 @@ from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - cutlass_moe_fp8, run_cutlass_moe_fp8) -from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts, - fused_topk) -from vllm.model_executor.layers.fused_moe.utils import ( - moe_kernel_quantize_input) + cutlass_moe_fp8, + run_cutlass_moe_fp8, +) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk +from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.platforms import current_platform NUM_EXPERTS = [40, 64] @@ -35,12 +35,11 @@ (224, 3072, 1536), (32768, 1024, 1024), # These sizes trigger wrong answers. - #(7232, 2048, 5120), - #(40000, 2048, 5120), + # (7232, 2048, 5120), + # (40000, 2048, 5120), ] -vllm_config = VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1)) +vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) vllm_config.scheduler_config.max_num_seqs = 128 vllm_config.scheduler_config.max_model_len = 8192 @@ -56,22 +55,25 @@ class MOETensors: c_strides2: torch.Tensor @staticmethod - def make_moe_tensors(m: int, k: int, n: int, e: int, - dtype: torch.dtype) -> "MOETensors": + def make_moe_tensors( + m: int, k: int, n: int, e: int, dtype: torch.dtype + ) -> "MOETensors": a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 - ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - return MOETensors(a=a, - w1=w1, - w2=w2, - ab_strides1=ab_strides1, - c_strides1=c_strides1, - ab_strides2=ab_strides2, - c_strides2=c_strides2) + ab_strides1 = torch.full((e,), k, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e,), 2 * n, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e,), n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e,), k, device="cuda", dtype=torch.int64) + return MOETensors( + a=a, + w1=w1, + w2=w2, + ab_strides1=ab_strides1, + c_strides1=c_strides1, + ab_strides2=ab_strides2, + c_strides2=c_strides2, + ) @dataclasses.dataclass @@ -89,9 +91,9 @@ class MOETensors8Bit(MOETensors): w2_d: Optional[torch.Tensor] = None # w2 -> w2_q -> w2_d @staticmethod - def make_moe_tensors_8bit(m: int, k: int, n: int, e: int, - per_act_token: bool, - per_out_channel: bool) -> "MOETensors8Bit": + def make_moe_tensors_8bit( + m: int, k: int, n: int, e: int, per_act_token: bool, per_out_channel: bool + ) -> "MOETensors8Bit": dtype = torch.half q_dtype = torch.float8_e4m3fn @@ -102,24 +104,21 @@ def make_moe_tensors_8bit(m: int, k: int, n: int, e: int, k_b_scales = k if per_out_channel else 1 # Get the right scale for tests. 
a_q, a_scale = ops.scaled_fp8_quant( - moe_tensors_fp16.a, None, use_per_token_if_dynamic=per_act_token) + moe_tensors_fp16.a, None, use_per_token_if_dynamic=per_act_token + ) w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype) w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype) - w1_scale = torch.empty((e, n_b_scales, 1), - device="cuda", - dtype=torch.float32) - w2_scale = torch.empty((e, k_b_scales, 1), - device="cuda", - dtype=torch.float32) + w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32) for expert in range(e): w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( - moe_tensors_fp16.w1[expert], - use_per_token_if_dynamic=per_out_channel) + moe_tensors_fp16.w1[expert], use_per_token_if_dynamic=per_out_channel + ) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( - moe_tensors_fp16.w2[expert], - use_per_token_if_dynamic=per_out_channel) + moe_tensors_fp16.w2[expert], use_per_token_if_dynamic=per_out_channel + ) # a_q -> a_d, w1_q -> w1_d, w2_q -> w2_d a_d = a_q.float().mul(a_scale).to(dtype) @@ -129,31 +128,39 @@ def make_moe_tensors_8bit(m: int, k: int, n: int, e: int, w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half() w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half() - return MOETensors8Bit(a=moe_tensors_fp16.a, - w1=moe_tensors_fp16.w1, - w2=moe_tensors_fp16.w2, - ab_strides1=moe_tensors_fp16.ab_strides1, - c_strides1=moe_tensors_fp16.c_strides1, - ab_strides2=moe_tensors_fp16.ab_strides2, - c_strides2=moe_tensors_fp16.c_strides2, - a_q=a_q, - w1_q=w1_q, - w2_q=w2_q, - a_scale=a_scale, - w1_scale=w1_scale, - w2_scale=w2_scale, - a_d=a_d, - w1_d=w1_d, - w2_d=w2_d) - - -def run_with_expert_maps(num_experts: int, num_local_experts: int, - **cutlass_moe_kwargs): - + return MOETensors8Bit( + a=moe_tensors_fp16.a, + w1=moe_tensors_fp16.w1, + w2=moe_tensors_fp16.w2, + ab_strides1=moe_tensors_fp16.ab_strides1, + c_strides1=moe_tensors_fp16.c_strides1, + ab_strides2=moe_tensors_fp16.ab_strides2, + c_strides2=moe_tensors_fp16.c_strides2, + a_q=a_q, + w1_q=w1_q, + w2_q=w2_q, + a_scale=a_scale, + w1_scale=w1_scale, + w2_scale=w2_scale, + a_d=a_d, + w1_d=w1_d, + w2_d=w2_d, + ) + + +def run_with_expert_maps( + num_experts: int, num_local_experts: int, **cutlass_moe_kwargs +): def slice_experts(): slice_params = [ - "w1_q", "w2_q", "ab_strides1", "ab_strides2", "c_strides1", - "c_strides2", "w1_scale", "w2_scale" + "w1_q", + "w2_q", + "ab_strides1", + "ab_strides2", + "c_strides1", + "c_strides2", + "w1_scale", + "w2_scale", ] full_tensors = { k: v @@ -167,9 +174,7 @@ def slice_experts(): # make expert map expert_map = [-1] * num_experts expert_map[s:e] = list(range(num_local_experts)) - expert_map = torch.tensor(expert_map, - dtype=torch.int32, - device="cuda") + expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda") # update cutlass moe arg with expert_map cutlass_moe_kwargs["expert_map"] = expert_map @@ -186,32 +191,40 @@ def slice_experts(): return out_tensor -def run_8_bit(moe_tensors: MOETensors8Bit, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - per_act_token: bool, - num_local_experts: Optional[int] = None) -> torch.Tensor: - assert not any([ - t is None for t in [ - moe_tensors.w1_q, moe_tensors.w2_q, moe_tensors.w1_scale, - moe_tensors.w2_scale, moe_tensors.a_scale +def run_8_bit( + moe_tensors: MOETensors8Bit, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + per_act_token: bool, + 
num_local_experts: Optional[int] = None, +) -> torch.Tensor: + assert not any( + [ + t is None + for t in [ + moe_tensors.w1_q, + moe_tensors.w2_q, + moe_tensors.w1_scale, + moe_tensors.w2_scale, + moe_tensors.a_scale, + ] ] - ]) + ) kwargs = { - 'a': moe_tensors.a, - 'w1_q': moe_tensors.w1_q, # type: ignore[union-attr] - 'w2_q': moe_tensors.w2_q, # type: ignore[union-attr] - 'topk_weights': topk_weights, - 'topk_ids': topk_ids, - 'w1_scale': moe_tensors.w1_scale, - 'w2_scale': moe_tensors.w2_scale, - 'ab_strides1': moe_tensors.ab_strides1, - 'ab_strides2': moe_tensors.ab_strides2, - 'c_strides1': moe_tensors.c_strides1, - 'c_strides2': moe_tensors.c_strides2, - 'per_act_token': per_act_token, - 'a1_scale': None #moe_tensors.a_scale + "a": moe_tensors.a, + "w1_q": moe_tensors.w1_q, # type: ignore[union-attr] + "w2_q": moe_tensors.w2_q, # type: ignore[union-attr] + "topk_weights": topk_weights, + "topk_ids": topk_ids, + "w1_scale": moe_tensors.w1_scale, + "w2_scale": moe_tensors.w2_scale, + "ab_strides1": moe_tensors.ab_strides1, + "ab_strides2": moe_tensors.ab_strides2, + "c_strides1": moe_tensors.c_strides1, + "c_strides2": moe_tensors.c_strides2, + "per_act_token": per_act_token, + "a1_scale": None, # moe_tensors.a_scale } num_experts = moe_tensors.w1.size(0) @@ -223,7 +236,8 @@ def run_8_bit(moe_tensors: MOETensors8Bit, return run_with_expert_maps( num_experts, num_local_experts, # type: ignore[arg-type] - **kwargs) + **kwargs, + ) @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @@ -233,8 +247,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit, @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( - current_platform.get_device_capability()), - reason="Grouped gemm is not supported on this GPU type.") + current_platform.get_device_capability() + ), + reason="Grouped gemm is not supported on this GPU type.", +) def test_cutlass_moe_8_bit_no_graph( m: int, n: int, @@ -249,34 +265,29 @@ def test_cutlass_moe_8_bit_no_graph( current_platform.seed_everything(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): - mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, - per_out_ch) + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) score = torch.randn((m, e), device="cuda", dtype=torch.half) - topk_weights, topk_ids, _ = fused_topk(mt.a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(mt.a, score, topk, renormalize=False) # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. - triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, - topk_ids) + triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, topk_ids) if ep_size is not None: assert e % ep_size == 0, "Cannot distribute experts evenly" number_local_experts = e // ep_size else: number_local_experts = None - cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token, - number_local_experts) + cutlass_output = run_8_bit( + mt, topk_weights, topk_ids, per_act_token, number_local_experts + ) # Note 5.5 only needed for larger problem sizes, 5 works ok for # the rest. 
- torch.testing.assert_close(triton_output, - cutlass_output, - atol=5.5e-2, - rtol=1e-2) + torch.testing.assert_close( + triton_output, cutlass_output, atol=5.5e-2, rtol=1e-2 + ) @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @@ -286,8 +297,10 @@ def test_cutlass_moe_8_bit_no_graph( @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( - current_platform.get_device_capability()), - reason="Grouped gemm is not supported on this GPU type.") + current_platform.get_device_capability() + ), + reason="Grouped gemm is not supported on this GPU type.", +) def test_cutlass_moe_8_bit_cuda_graph( m: int, n: int, @@ -303,34 +316,25 @@ def test_cutlass_moe_8_bit_cuda_graph( with set_current_vllm_config(vllm_config): dtype = torch.half - mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, - per_out_ch) + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids, _ = fused_topk(mt.a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(mt.a, score, topk, renormalize=False) # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. - triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, - topk_ids) + triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, topk_ids) stream = torch.cuda.Stream() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=stream): - cutlass_output = run_8_bit(mt, topk_weights, topk_ids, - per_act_token) + cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token) torch.cuda.synchronize() graph.replay() torch.cuda.synchronize() - torch.testing.assert_close(triton_output, - cutlass_output, - atol=9e-2, - rtol=1e-2) + torch.testing.assert_close(triton_output, cutlass_output, atol=9e-2, rtol=1e-2) @pytest.mark.parametrize("m", [64]) @@ -343,8 +347,10 @@ def test_cutlass_moe_8_bit_cuda_graph( @pytest.mark.parametrize("ep_size", [1, 2, 4, 8, 16]) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( - current_platform.get_device_capability()), - reason="Grouped gemm is not supported on this GPU type.") + current_platform.get_device_capability() + ), + reason="Grouped gemm is not supported on this GPU type.", +) def test_cutlass_moe_8_bit_EP( m: int, n: int, @@ -356,8 +362,9 @@ def test_cutlass_moe_8_bit_EP( ep_size: int, monkeypatch, ): - test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token, - per_out_channel, monkeypatch, ep_size) + test_cutlass_moe_8_bit_no_graph( + m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size + ) LARGE_MNK_FACTORS = [ @@ -374,8 +381,10 @@ def test_cutlass_moe_8_bit_EP( @pytest.mark.parametrize("ep_size", [8]) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( - current_platform.get_device_capability()), - reason="Grouped gemm is not supported on this GPU type.") + current_platform.get_device_capability() + ), + reason="Grouped gemm is not supported on this GPU type.", +) def test_cutlass_moe_8_bit_EP_large( m: int, n: int, @@ -387,8 +396,9 @@ def test_cutlass_moe_8_bit_EP_large( ep_size: int, monkeypatch, ): - test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token, - per_out_channel, monkeypatch, ep_size) + test_cutlass_moe_8_bit_no_graph( + m, n, k, e, topk, 
per_act_token, per_out_channel, monkeypatch, ep_size + ) @pytest.mark.parametrize("m,n,k,topk", [(1, 8192, 5120, 31)]) @@ -398,8 +408,10 @@ def test_cutlass_moe_8_bit_EP_large( @pytest.mark.parametrize("ep_size", [8]) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( - current_platform.get_device_capability()), - reason="Grouped gemm is not supported on this GPU type.") + current_platform.get_device_capability() + ), + reason="Grouped gemm is not supported on this GPU type.", +) def test_run_cutlass_moe_fp8( m: int, n: int, @@ -412,14 +424,12 @@ def test_run_cutlass_moe_fp8( ): current_platform.seed_everything(7) with set_current_vllm_config(vllm_config): - mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, - per_out_channel) + mt = MOETensors8Bit.make_moe_tensors_8bit( + m, k, n, e, per_act_token, per_out_channel + ) score = torch.randn((m, e), device="cuda", dtype=torch.half) - topk_weights, topk_ids, _ = fused_topk(mt.a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(mt.a, score, topk, renormalize=False) # we want to make sure there is at least one token that's generated in # this expert shard and at least one token that's NOT generated in this # expert shard @@ -430,12 +440,12 @@ def test_run_cutlass_moe_fp8( workspace2_shape = (m * topk, n) output_shape = (m * topk, k) - workspace13 = torch.empty(prod(workspace13_shape), - device="cuda", - dtype=mt.a.dtype) - workspace2 = torch.empty(prod(workspace2_shape), - device="cuda", - dtype=mt.a.dtype) + workspace13 = torch.empty( + prod(workspace13_shape), device="cuda", dtype=mt.a.dtype + ) + workspace2 = torch.empty( + prod(workspace2_shape), device="cuda", dtype=mt.a.dtype + ) num_local_experts = e // ep_size start, end = 0, num_local_experts @@ -443,36 +453,54 @@ def test_run_cutlass_moe_fp8( expert_map[start:end] = list(range(num_local_experts)) expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda") - ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + ab_strides1 = torch.full((e,), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e,), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e,), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e,), k, device="cuda", dtype=torch.int64) activation = lambda o, i: torch.ops._C.silu_and_mul(o, i) - a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale, - torch.float8_e4m3fn, - per_act_token) + a1q, a1q_scale = moe_kernel_quantize_input( + mt.a, mt.a_scale, torch.float8_e4m3fn, per_act_token + ) global_num_experts = -1 if mt.w1_q is None else mt.w1_q.size(0) func = lambda output: run_cutlass_moe_fp8( - output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation, - global_num_experts, expert_map, mt.w1_scale, mt.w2_scale, - a1q_scale, None, ab_strides1, ab_strides2, c_strides1, c_strides2, - workspace13, workspace2, None, mt.a.dtype, per_act_token, - per_out_channel, False) + output, + a1q, + mt.w1_q, + mt.w2_q, + topk_ids, + activation, + global_num_experts, + expert_map, + mt.w1_scale, + mt.w2_scale, + a1q_scale, + None, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, + workspace13, + workspace2, + None, + mt.a.dtype, + per_act_token, + per_out_channel, + False, + ) workspace13.random_() - 
output_random_workspace = torch.empty(output_shape, - device="cuda", - dtype=mt.a.dtype) + output_random_workspace = torch.empty( + output_shape, device="cuda", dtype=mt.a.dtype + ) func(output_random_workspace) workspace13.fill_(0) - output_zero_workspace = torch.zeros(output_shape, - device="cuda", - dtype=mt.a.dtype) + output_zero_workspace = torch.zeros( + output_shape, device="cuda", dtype=mt.a.dtype + ) func(output_zero_workspace) - torch.testing.assert_close(output_random_workspace, - output_zero_workspace, - atol=5e-3, - rtol=1e-3) + torch.testing.assert_close( + output_random_workspace, output_zero_workspace, atol=5e-3, rtol=1e-3 + ) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 074771e49a06..945a61d9e6d2 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -16,8 +16,7 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts -from vllm.model_executor.layers.fused_moe.modular_kernel import ( - FusedMoEModularKernel) +from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used @@ -27,18 +26,19 @@ if has_deep_ep(): from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 - DeepEPHTPrepareAndFinalize) + DeepEPHTPrepareAndFinalize, + ) from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 - DeepEPLLPrepareAndFinalize) + DeepEPLLPrepareAndFinalize, + ) from .parallel_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a if has_deep_gemm(): - from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - BatchedDeepGemmExperts) - from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( - DeepGemmExperts) + BatchedDeepGemmExperts, + ) + from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts requires_deep_ep = pytest.mark.skipif( not has_deep_ep(), @@ -55,9 +55,10 @@ def next_power_of_2(x): import math + if x == 0: return 1 - return 2**math.ceil(math.log2(x)) + return 2 ** math.ceil(math.log2(x)) def make_block_quant_fp8_weights( @@ -70,7 +71,8 @@ def make_block_quant_fp8_weights( Return weights w1q, w2q, w1_scale, w2_scale """ w1, w1q, w1_scale, w2, w2q, w2_scale = make_test_weights( - e, n, k, torch.bfloat16, torch.float8_e4m3fn, block_size) + e, n, k, torch.bfloat16, torch.float8_e4m3fn, block_size + ) return w1q, w2q, w1_scale, w2_scale @@ -98,15 +100,15 @@ class TestTensors: @staticmethod def make(config: TestConfig, rank) -> "TestTensors": - dtype = torch.bfloat16 topk, m, k = (config.topk, config.m, config.k) fp8_info = torch.finfo(torch.float8_e4m3fn) fp8_max, fp8_min = fp8_info.max, fp8_info.min - rank_tokens = torch.randn( - (m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0 + rank_tokens = ( + torch.randn((m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0 + ) rank_tokens = rank_tokens.clamp(min=fp8_min, max=fp8_max) rank_token_scales = None @@ -114,24 +116,31 @@ def make(config: TestConfig, rank) -> "TestTensors": low=0, high=config.num_experts, size=(m, topk), - device=torch.cuda.current_device()).to(dtype=torch.int64) - - topk_weights = torch.randn(topk_ids.shape, - dtype=torch.float32, - device=torch.cuda.current_device()) + 
device=torch.cuda.current_device(), + ).to(dtype=torch.int64) - return TestTensors(rank_tokens=rank_tokens, - rank_token_scales=rank_token_scales, - topk=topk_ids, - topk_weights=topk_weights, - config=config) + topk_weights = torch.randn( + topk_ids.shape, dtype=torch.float32, device=torch.cuda.current_device() + ) + return TestTensors( + rank_tokens=rank_tokens, + rank_token_scales=rank_token_scales, + topk=topk_ids, + topk_weights=topk_weights, + config=config, + ) -def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, - max_tokens_per_rank: int, dp_size: int, - hidden_size: int, q_dtype: Optional[torch.dtype], - test_config: TestConfig) -> FusedMoEModularKernel: +def make_ll_modular_kernel( + pg: ProcessGroup, + pgi: ProcessGroupInfo, + max_tokens_per_rank: int, + dp_size: int, + hidden_size: int, + q_dtype: Optional[torch.dtype], + test_config: TestConfig, +) -> FusedMoEModularKernel: assert test_config.low_latency assert test_config.use_fp8_dispatch is not None @@ -144,25 +153,30 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, max_tokens_per_rank=max_tokens_per_rank, hidden_size=hidden_size, num_experts=test_config.num_experts, - use_fp8_dispatch=test_config.use_fp8_dispatch), + use_fp8_dispatch=test_config.use_fp8_dispatch, + ), q_dtype=q_dtype, - block_shape=test_config.block_size) + block_shape=test_config.block_size, + ) fused_experts = BatchedDeepGemmExperts( max_num_tokens=max_tokens_per_rank, num_dispatchers=pgi.world_size // dp_size, block_shape=test_config.block_size, - per_act_token_quant=test_config.per_act_token_quant) - mk = FusedMoEModularKernel(prepare_finalize=a2a, - fused_experts=fused_experts) + per_act_token_quant=test_config.per_act_token_quant, + ) + mk = FusedMoEModularKernel(prepare_finalize=a2a, fused_experts=fused_experts) return mk -def make_ht_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, - dp_size: int, num_local_experts: int, - q_dtype: Optional[torch.dtype], - test_config: TestConfig) -> FusedMoEModularKernel: - +def make_ht_modular_kernel( + pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + num_local_experts: int, + q_dtype: Optional[torch.dtype], + test_config: TestConfig, +) -> FusedMoEModularKernel: assert not test_config.low_latency assert test_config.use_fp8_dispatch is None @@ -173,62 +187,68 @@ def make_ht_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, deepep_ht_args=DeepEPHTArgs(num_local_experts=num_local_experts), deepep_ll_args=None, q_dtype=q_dtype, - block_shape=test_config.block_size) + block_shape=test_config.block_size, + ) fused_experts = DeepGemmExperts() - mk = FusedMoEModularKernel(prepare_finalize=a2a, - fused_experts=fused_experts) + mk = FusedMoEModularKernel(prepare_finalize=a2a, fused_experts=fused_experts) return mk -def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, - num_local_experts: int, - test_tensors: TestTensors) -> FusedMoEModularKernel: - +def make_modular_kernel( + pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + num_local_experts: int, + test_tensors: TestTensors, +) -> FusedMoEModularKernel: q_dtype = torch.float8_e4m3fn test_config = test_tensors.config mk: FusedMoEModularKernel # Make modular kernel if test_config.low_latency: - max_tokens_per_rank = max( - 64, next_power_of_2(test_tensors.rank_tokens.size(0))) + max_tokens_per_rank = max(64, next_power_of_2(test_tensors.rank_tokens.size(0))) hidden_size = test_tensors.rank_tokens.size(-1) - mk = make_ll_modular_kernel(pg=pg, - pgi=pgi, - 
max_tokens_per_rank=max_tokens_per_rank, - dp_size=dp_size, - hidden_size=hidden_size, - q_dtype=q_dtype, - test_config=test_config) + mk = make_ll_modular_kernel( + pg=pg, + pgi=pgi, + max_tokens_per_rank=max_tokens_per_rank, + dp_size=dp_size, + hidden_size=hidden_size, + q_dtype=q_dtype, + test_config=test_config, + ) else: - mk = make_ht_modular_kernel(pg, pgi, dp_size, num_local_experts, - q_dtype, test_config) + mk = make_ht_modular_kernel( + pg, pgi, dp_size, num_local_experts, q_dtype, test_config + ) return mk -def deepep_deepgemm_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo, - dp_size: int, test_tensors: TestTensors, - w1: torch.Tensor, w2: torch.Tensor, - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor]) -> torch.Tensor: - +def deepep_deepgemm_moe_impl( + pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + test_tensors: TestTensors, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], +) -> torch.Tensor: test_config = test_tensors.config num_experts = test_config.num_experts num_local_experts = w1.size(0) def build_expert_map(): num_local_experts = w1.size(0) - expert_map = torch.full((num_experts, ), - fill_value=-1, - dtype=torch.int32) + expert_map = torch.full((num_experts,), fill_value=-1, dtype=torch.int32) s = pgi.rank * num_local_experts e = s + num_local_experts expert_map[s:e] = torch.tensor(list(range(num_local_experts))) - return expert_map.to(device=torch.cuda.current_device(), - dtype=torch.int32) + return expert_map.to(device=torch.cuda.current_device(), dtype=torch.int32) # Make modular kernel mk: FusedMoEModularKernel = make_modular_kernel( @@ -236,36 +256,44 @@ def build_expert_map(): pgi=pgi, dp_size=dp_size, num_local_experts=num_local_experts, - test_tensors=test_tensors) + test_tensors=test_tensors, + ) # Low-Latency kernels can't dispatch scales. 
- a1_scale = (None - if test_config.low_latency else test_tensors.rank_token_scales) - - out = mk.forward(hidden_states=test_tensors.rank_tokens, - w1=w1, - w2=w2, - topk_weights=test_tensors.topk_weights, - topk_ids=test_tensors.topk, - inplace=False, - activation="silu", - global_num_experts=num_experts, - expert_map=build_expert_map(), - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=None, - w2_zp=None, - a1_scale=a1_scale, - a2_scale=None, - apply_router_weight_on_input=False) - return out + a1_scale = None if test_config.low_latency else test_tensors.rank_token_scales + out = mk.forward( + hidden_states=test_tensors.rank_tokens, + w1=w1, + w2=w2, + topk_weights=test_tensors.topk_weights, + topk_ids=test_tensors.topk, + inplace=False, + activation="silu", + global_num_experts=num_experts, + expert_map=build_expert_map(), + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=None, + w2_zp=None, + a1_scale=a1_scale, + a2_scale=None, + apply_router_weight_on_input=False, + ) + return out -def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor, - topk_weights: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, - w1_scale: torch.Tensor, w2_scale: torch.Tensor, - a1_scale: torch.Tensor, block_shape: list[int]): +def triton_impl( + a: torch.Tensor, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor, + block_shape: list[int], +): return fused_experts( hidden_states=a, w1=w1, @@ -280,7 +308,8 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor, block_shape=block_shape, # Make sure this is set to False so we # dont end up comparing the same implementation. - allow_deep_gemm=False) + allow_deep_gemm=False, + ) def _test_deepep_deepgemm_moe( @@ -301,22 +330,21 @@ def _test_deepep_deepgemm_moe( pg = torch.distributed.new_group(list(range(pgi.world_size))) test_tensors = TestTensors.make(config, pgi.rank) - block_shape = [ - w1.size(1) // w1_scale.size(1), - w1.size(2) // w1_scale.size(2) - ] + block_shape = [w1.size(1) // w1_scale.size(1), w1.size(2) // w1_scale.size(2)] with set_current_vllm_config(VllmConfig()): # Reference - triton_moe = triton_impl(a=test_tensors.rank_tokens, - topk_ids=test_tensors.topk, - topk_weights=test_tensors.topk_weights, - w1=w1, - w2=w2, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=test_tensors.rank_token_scales, - block_shape=block_shape) + triton_moe = triton_impl( + a=test_tensors.rank_tokens, + topk_ids=test_tensors.topk, + topk_weights=test_tensors.topk_weights, + w1=w1, + w2=w2, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=test_tensors.rank_token_scales, + block_shape=block_shape, + ) # Slice experts for this rank. num_local_experts = config.num_experts // pgi.world_size @@ -369,10 +397,15 @@ def _test_deepep_deepgemm_moe( @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_used(), - reason="Skipping test for Blackwell DeepGEMM") -def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, - topk: int, world_dp_size: tuple[int, int]): +@pytest.mark.skipif( + is_blackwell_deep_gemm_used(), reason="Skipping test for Blackwell DeepGEMM" +) +def test_ht_deepep_deepgemm_moe( + mnk: tuple[int, int, int], + num_experts: int, + topk: int, + world_dp_size: tuple[int, int], +): """ Tests for High-Throughput DeepEP + DeepGemm integration. 
""" @@ -388,21 +421,32 @@ def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, block_size = [block_m, block_m] world_size, dp_size = world_dp_size - config = TestConfig(topk=topk, - m=m, - k=k, - n=n, - num_experts=num_experts, - per_act_token_quant=False, - block_size=block_size, - low_latency=False, - use_fp8_dispatch=None) + config = TestConfig( + topk=topk, + m=m, + k=k, + n=n, + num_experts=num_experts, + per_act_token_quant=False, + block_size=block_size, + low_latency=False, + use_fp8_dispatch=None, + ) w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights( - num_experts, n, k, block_size) + num_experts, n, k, block_size + ) - parallel_launch(world_size, _test_deepep_deepgemm_moe, dp_size, config, w1, - w2, w1_scale, w2_scale) + parallel_launch( + world_size, + _test_deepep_deepgemm_moe, + dp_size, + config, + w1, + w2, + w1_scale, + w2_scale, + ) MNKs = [ @@ -426,8 +470,9 @@ def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_used(), - reason="Skipping test for Blackwell DeepGEMM") +@pytest.mark.skipif( + is_blackwell_deep_gemm_used(), reason="Skipping test for Blackwell DeepGEMM" +) def test_ll_deepep_deepgemm_moe( mnk: tuple[int, int, int], num_experts: int, @@ -460,7 +505,16 @@ def test_ll_deepep_deepgemm_moe( ) w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights( - num_experts, n, k, block_size) + num_experts, n, k, block_size + ) - parallel_launch(world_size, _test_deepep_deepgemm_moe, dp_size, config, w1, - w2, w1_scale, w2_scale) + parallel_launch( + world_size, + _test_deepep_deepgemm_moe, + dp_size, + config, + w1, + w2, + w1_scale, + w2_scale, + ) diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 43804c410b6c..5f5a17dc6714 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -15,12 +15,11 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import TritonExperts -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts) -from vllm.model_executor.layers.fused_moe.modular_kernel import ( - FusedMoEModularKernel) +from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts +from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8) + per_token_group_quant_fp8, +) from vllm.platforms import current_platform from vllm.utils import has_deep_ep @@ -28,9 +27,11 @@ if has_deep_ep(): from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 - DeepEPHTPrepareAndFinalize) + DeepEPHTPrepareAndFinalize, + ) from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 - DeepEPLLPrepareAndFinalize) + DeepEPLLPrepareAndFinalize, + ) from .parallel_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a @@ -43,7 +44,7 @@ def make_weights( - e, n, k, dtype + e, n, k, dtype ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ Return weights w1, w2, w1_scale, w2_scale @@ -62,17 +63,15 @@ def make_weights( k_b_scales = k w1_q = torch.empty_like(w1, dtype=dtype) w2_q = torch.empty_like(w2, dtype=dtype) - w1_scale = torch.empty((e, n_b_scales, 
1), - device="cuda", - dtype=torch.float32) - w2_scale = torch.empty((e, k_b_scales, 1), - device="cuda", - dtype=torch.float32) + w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32) for expert in range(e): w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( - w1[expert], use_per_token_if_dynamic=True) + w1[expert], use_per_token_if_dynamic=True + ) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( - w2[expert], use_per_token_if_dynamic=True) + w2[expert], use_per_token_if_dynamic=True + ) return w1_q, w2_q, w1_scale, w2_scale @@ -98,24 +97,25 @@ class TestTensors: def make(config: TestConfig, low_latency_mode: bool) -> "TestTensors": # TODO (varun) - check that float16 works ? assert config.dtype in [torch.bfloat16, torch.float8_e4m3fn] - token_dtype = (torch.bfloat16 if config.dtype == torch.float8_e4m3fn - else config.dtype) - rank_tokens = torch.randn( - (config.m, config.k), device="cuda", dtype=token_dtype) / 10 + token_dtype = ( + torch.bfloat16 if config.dtype == torch.float8_e4m3fn else config.dtype + ) + rank_tokens = ( + torch.randn((config.m, config.k), device="cuda", dtype=token_dtype) / 10 + ) rank_token_scales = None - topk = torch.randint(low=0, - high=config.num_experts, - size=(config.m, config.topk), - device="cuda").to(dtype=torch.int64) - topk_weights = torch.randn(topk.shape, - dtype=torch.float32, - device="cuda") - return TestTensors(rank_tokens=rank_tokens, - rank_token_scales=rank_token_scales, - topk=topk, - topk_weights=topk_weights, - config=config) + topk = torch.randint( + low=0, high=config.num_experts, size=(config.m, config.topk), device="cuda" + ).to(dtype=torch.int64) + topk_weights = torch.randn(topk.shape, dtype=torch.float32, device="cuda") + return TestTensors( + rank_tokens=rank_tokens, + rank_token_scales=rank_token_scales, + topk=topk, + topk_weights=topk_weights, + config=config, + ) def make_modular_kernel( @@ -130,30 +130,35 @@ def make_modular_kernel( use_fp8_dispatch: bool, per_act_token_quant: bool, ) -> FusedMoEModularKernel: - is_quantized = q_dtype is not None ht_args: Optional[DeepEPHTArgs] = None ll_args: Optional[DeepEPLLArgs] = None if low_latency_mode: - ll_args = DeepEPLLArgs(max_tokens_per_rank=MAX_TOKENS_PER_RANK, - hidden_size=hidden_size, - num_experts=num_experts, - use_fp8_dispatch=use_fp8_dispatch) + ll_args = DeepEPLLArgs( + max_tokens_per_rank=MAX_TOKENS_PER_RANK, + hidden_size=hidden_size, + num_experts=num_experts, + use_fp8_dispatch=use_fp8_dispatch, + ) else: assert not use_fp8_dispatch, ( - "FP8 Dispatch is valid only for low-latency kernels") + "FP8 Dispatch is valid only for low-latency kernels" + ) ht_args = DeepEPHTArgs(num_local_experts=num_local_experts) - a2a : Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = \ - make_deepep_a2a(pg = pg, - pgi = pgi, - dp_size = dp_size, - q_dtype = q_dtype, - block_shape = None, - deepep_ht_args = ht_args, - deepep_ll_args = ll_args) + a2a: Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = ( + make_deepep_a2a( + pg=pg, + pgi=pgi, + dp_size=dp_size, + q_dtype=q_dtype, + block_shape=None, + deepep_ht_args=ht_args, + deepep_ll_args=ll_args, + ) + ) num_dispatchers = pgi.world_size // dp_size @@ -177,8 +182,7 @@ def make_modular_kernel( per_act_token_quant=per_act_token_quant, ) - mk = FusedMoEModularKernel(prepare_finalize=a2a, - fused_experts=fused_experts) + mk = FusedMoEModularKernel(prepare_finalize=a2a, fused_experts=fused_experts) 
return mk @@ -196,19 +200,15 @@ def deep_ep_moe_impl( use_fp8_dispatch: bool, per_act_token_quant: bool, ) -> torch.Tensor: - num_local_experts = w1.size(0) def build_expert_map(): num_local_experts = w1.size(0) - expert_map = torch.full((num_experts, ), - fill_value=-1, - dtype=torch.int32) + expert_map = torch.full((num_experts,), fill_value=-1, dtype=torch.int32) s = pgi.rank * num_local_experts e = s + num_local_experts expert_map[s:e] = torch.tensor(list(range(num_local_experts))) - return expert_map.to(device=torch.cuda.current_device(), - dtype=torch.int32) + return expert_map.to(device=torch.cuda.current_device(), dtype=torch.int32) hidden_size = test_tensors.rank_tokens.size(1) is_quantized = w1.dtype == torch.float8_e4m3fn @@ -218,8 +218,17 @@ def build_expert_map(): # Make modular kernel mk: FusedMoEModularKernel = make_modular_kernel( - pg, pgi, low_latency_mode, hidden_size, dp_size, num_experts, - num_local_experts, q_dtype, use_fp8_dispatch, per_act_token_quant) + pg, + pgi, + low_latency_mode, + hidden_size, + dp_size, + num_experts, + num_local_experts, + q_dtype, + use_fp8_dispatch, + per_act_token_quant, + ) out_hidden_states = torch.empty_like(test_tensors.rank_tokens) total_num_tokens = test_tensors.rank_tokens.size(0) @@ -229,35 +238,38 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False): topk_weights_chunk = test_tensors.topk_weights[chunk_start:chunk_end] topk_chunk = test_tensors.topk[chunk_start:chunk_end] rank_token_scales_chunk = test_tensors.rank_token_scales - if rank_token_scales_chunk is not None and rank_token_scales_chunk.size( - 0) == total_num_tokens: + if ( + rank_token_scales_chunk is not None + and rank_token_scales_chunk.size(0) == total_num_tokens + ): # per act token - rank_token_scales_chunk = rank_token_scales_chunk[ - chunk_start:chunk_end] - - out = mk.forward(hidden_states=rank_tokens_chunk, - w1=w1, - w2=w2, - topk_weights=topk_weights_chunk, - topk_ids=topk_chunk, - inplace=False, - activation="silu", - global_num_experts=num_experts, - expert_map=build_expert_map(), - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=None, - w2_zp=None, - a1_scale=rank_token_scales_chunk, - a2_scale=None, - apply_router_weight_on_input=False) + rank_token_scales_chunk = rank_token_scales_chunk[chunk_start:chunk_end] + + out = mk.forward( + hidden_states=rank_tokens_chunk, + w1=w1, + w2=w2, + topk_weights=topk_weights_chunk, + topk_ids=topk_chunk, + inplace=False, + activation="silu", + global_num_experts=num_experts, + expert_map=build_expert_map(), + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=None, + w2_zp=None, + a1_scale=rank_token_scales_chunk, + a2_scale=None, + apply_router_weight_on_input=False, + ) if not skip_result_store: - out_hidden_states[chunk_start:chunk_end, :].copy_( - out, non_blocking=True) + out_hidden_states[chunk_start:chunk_end, :].copy_(out, non_blocking=True) - max_num_tokens_per_dp = (MAX_TOKENS_PER_RANK - if low_latency_mode else total_num_tokens) + max_num_tokens_per_dp = ( + MAX_TOKENS_PER_RANK if low_latency_mode else total_num_tokens + ) for chunk_start_ in range(0, total_num_tokens, max_num_tokens_per_dp): chunk_start = chunk_start_ @@ -266,9 +278,9 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False): chunk_start = min(chunk_start, total_num_tokens - 1) chunk_end = min(chunk_end, total_num_tokens) - process_chunk(chunk_start, - chunk_end, - skip_result_store=chunk_start_ >= total_num_tokens) + process_chunk( + chunk_start, chunk_end, skip_result_store=chunk_start_ >= total_num_tokens + ) 
return out_hidden_states @@ -282,9 +294,11 @@ def torch_moe_impl( using_fp8_dispatch: bool, per_act_token_quant: bool, ): - - a, topk_ids, topk_weights = (test_tensors.rank_tokens, test_tensors.topk, - test_tensors.topk_weights) + a, topk_ids, topk_weights = ( + test_tensors.rank_tokens, + test_tensors.topk, + test_tensors.topk_weights, + ) if using_fp8_dispatch: # The DeepEP implementation is requested to dispatch using FP8. # For numerical stability for testing, emulate the fp8 dispatch by @@ -292,8 +306,11 @@ def torch_moe_impl( assert not per_act_token_quant a = test_tensors.rank_tokens aq, aq_scale = per_token_group_quant_fp8(a, 128) - a = (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1)).view( - a.shape).to(a.dtype) + a = ( + (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1)) + .view(a.shape) + .to(a.dtype) + ) is_quantized = w1.dtype == torch.float8_e4m3fn a_dtype = a.dtype @@ -314,8 +331,9 @@ def torch_moe_impl( e_w = topk_weights[i][j] w1_e = w1[e] w2_e = w2[e] - o_i += (SiluAndMul() - (a_i @ w1_e.transpose(0, 1)) @ w2_e.transpose(0, 1)) * e_w + o_i += ( + SiluAndMul()(a_i @ w1_e.transpose(0, 1)) @ w2_e.transpose(0, 1) + ) * e_w if is_quantized: out = out.to(dtype=a_dtype) @@ -335,28 +353,36 @@ def _deep_ep_moe( use_fp8_dispatch: bool, per_act_token_quant: bool, ): - if not low_latency_mode: assert not use_fp8_dispatch, ( - "FP8 dispatch interface is available only in low-latency mode") + "FP8 dispatch interface is available only in low-latency mode" + ) is_quantized = w1.dtype == torch.float8_e4m3fn w1 = w1.to(device=torch.cuda.current_device()) w2 = w2.to(device=torch.cuda.current_device()) if is_quantized: w1_scale = w1_scale.to( # type: ignore - device=torch.cuda.current_device()) + device=torch.cuda.current_device() + ) w2_scale = w2_scale.to( # type: ignore - device=torch.cuda.current_device()) + device=torch.cuda.current_device() + ) pg = torch.distributed.new_group(list(range(pgi.world_size))) test_tensors = TestTensors.make(config, low_latency_mode) with set_current_vllm_config(VllmConfig()): # Reference - torch_combined = torch_moe_impl(test_tensors, w1, w2, w1_scale, - w2_scale, use_fp8_dispatch, - per_act_token_quant) + torch_combined = torch_moe_impl( + test_tensors, + w1, + w2, + w1_scale, + w2_scale, + use_fp8_dispatch, + per_act_token_quant, + ) # Splice experts for this rank. 
num_local_experts = config.num_experts // pgi.world_size @@ -426,18 +452,23 @@ def test_deep_ep_moe( current_platform.seed_everything(7) world_size, dp_size = world_dp_size - config = TestConfig(dtype=dtype, - topk=topk, - m=m, - k=k, - n=n, - num_experts=num_experts) + config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts) w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype) - parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size, - config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch, - per_act_token_quant) + parallel_launch( + world_size, + _deep_ep_moe, + low_latency_mode, + dp_size, + config, + w1, + w2, + w1_scale, + w2_scale, + use_fp8_dispatch, + per_act_token_quant, + ) MNKs = [ @@ -460,16 +491,18 @@ def test_deep_ep_moe( @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) @requires_deep_ep -def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], - num_experts: int, topk: int, - world_dp_size: tuple[int, int], - use_fp8_dispatch: bool): - +def test_low_latency_deep_ep_moe( + dtype: torch.dtype, + mnk: tuple[int, int, int], + num_experts: int, + topk: int, + world_dp_size: tuple[int, int], + use_fp8_dispatch: bool, +): low_latency_mode = True m, n, k = mnk - if (low_latency_mode - and k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES): + if low_latency_mode and k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES: pytest.skip( f"Skipping test as hidden size {k} is not in list of supported " f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}" @@ -477,15 +510,20 @@ def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], current_platform.seed_everything(7) world_size, dp_size = world_dp_size - config = TestConfig(dtype=dtype, - topk=topk, - m=m, - k=k, - n=n, - num_experts=num_experts) + config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts) w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype) - parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size, - config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch, - False) + parallel_launch( + world_size, + _deep_ep_moe, + low_latency_mode, + dp_size, + config, + w1, + w2, + w1_scale, + w2_scale, + use_fp8_dispatch, + False, + ) diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index f7578e226917..360d809aed97 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -14,7 +14,8 @@ # vLLM fused-expert reference (Triton fallback + DeepGEMM option) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8) + per_token_group_quant_fp8, +) from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8 @@ -40,8 +41,10 @@ def make_block_quant_fp8_weights( w2 shape: (E, K, N) """ dtype = torch.bfloat16 - fp8_max, fp8_min = torch.finfo(torch.float8_e4m3fn).max, torch.finfo( - torch.float8_e4m3fn).min + fp8_max, fp8_min = ( + torch.finfo(torch.float8_e4m3fn).max, + torch.finfo(torch.float8_e4m3fn).min, + ) # bf16 reference weights w1_bf16 = torch.randn(e, 2 * n, k, device="cuda", dtype=dtype) / 10 @@ -57,16 +60,8 @@ def make_block_quant_fp8_weights( w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn) w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn) - w1_s = 
torch.empty(e, - n_tiles_w1, - k_tiles_w1, - device="cuda", - dtype=torch.float32) - w2_s = torch.empty(e, - n_tiles_w2, - k_tiles_w2, - device="cuda", - dtype=torch.float32) + w1_s = torch.empty(e, n_tiles_w1, k_tiles_w1, device="cuda", dtype=torch.float32) + w2_s = torch.empty(e, n_tiles_w2, k_tiles_w2, device="cuda", dtype=torch.float32) for i in range(e): w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i]) @@ -80,18 +75,17 @@ def run_single_case(m, n, k, topk, num_experts, block_size): Run one (M,N,K) configuration on a single GPU and assert DeepGEMM == Triton baseline within tolerance. """ - tokens_bf16 = torch.randn( - m, k, device="cuda", dtype=torch.bfloat16).clamp_min_(-1).clamp_max_(1) + tokens_bf16 = ( + torch.randn(m, k, device="cuda", dtype=torch.bfloat16) + .clamp_min_(-1) + .clamp_max_(1) + ) _, a1_scale = per_token_group_quant_fp8(tokens_bf16, block_size[1]) # expert weight tensors - w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(num_experts, n, k, - block_size) + w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(num_experts, n, k, block_size) - router_logits = torch.randn(m, - num_experts, - device="cuda", - dtype=torch.float32) + router_logits = torch.randn(m, num_experts, device="cuda", dtype=torch.float32) topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1) topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) @@ -150,12 +144,12 @@ def run_single_case(m, n, k, topk, num_experts, block_size): @pytest.mark.parametrize("num_experts", NUM_EXPERTS) @requires_deep_gemm def test_deepgemm_vs_triton(mnk, topk, num_experts, monkeypatch): - with monkeypatch.context() as m: m.setenv("VLLM_USE_DEEP_GEMM", "1") _fused_moe_mod = importlib.import_module( - "vllm.model_executor.layers.fused_moe.fused_moe") + "vllm.model_executor.layers.fused_moe.fused_moe" + ) call_counter = {"cnt": 0} @@ -165,8 +159,7 @@ def _spy_deep_gemm_moe_fp8(*args, **kwargs): call_counter["cnt"] += 1 return orig_fn(*args, **kwargs) - monkeypatch.setattr(_fused_moe_mod, "deep_gemm_moe_fp8", - _spy_deep_gemm_moe_fp8) + monkeypatch.setattr(_fused_moe_mod, "deep_gemm_moe_fp8", _spy_deep_gemm_moe_fp8) m, n, k = mnk @@ -183,6 +176,7 @@ def _spy_deep_gemm_moe_fp8(*args, **kwargs): ) # ensure that the DeepGEMM path was indeed taken. - assert call_counter["cnt"] == 1, \ - f"DeepGEMM path was not executed during the test. " \ + assert call_counter["cnt"] == 1, ( + f"DeepGEMM path was not executed during the test. 
" f"Call counter: {call_counter['cnt']}" + ) diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 6f2869c3a61d..2eef8fdcf508 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -11,27 +11,37 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.config import VllmConfig, current_platform, set_current_vllm_config from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 - BatchedTritonOrDeepGemmExperts) + BatchedTritonOrDeepGemmExperts, +) from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts) +from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts from vllm.model_executor.layers.fused_moe.layer import TritonExperts from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( - TritonOrDeepGemmExperts) + TritonOrDeepGemmExperts, +) from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx -from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors, - reference_moe_impl, - run_modular_kernel) +from .modular_kernel_tools.common import ( + Config, + RankTensors, + WeightTensors, + reference_moe_impl, + run_modular_kernel, +) from .modular_kernel_tools.mk_objects import ( - MK_FUSED_EXPERT_TYPES, MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, - MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES) -from .modular_kernel_tools.parallel_utils import (ProcessGroupInfo, - parallel_launch_with_config) + MK_FUSED_EXPERT_TYPES, + MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, + MK_QUANT_CONFIGS, + MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, +) +from .modular_kernel_tools.parallel_utils import ( + ProcessGroupInfo, + parallel_launch_with_config, +) # TODO (varun): These requirements are very strict and could be relaxed. -has_all_packages = (has_deep_ep() and has_deep_gemm() and has_pplx()) +has_all_packages = has_deep_ep() and has_deep_gemm() and has_pplx() meets_package_requirements = pytest.mark.skipif( not has_all_packages, @@ -50,8 +60,9 @@ def rank_worker( # sanity check from vllm import envs + if config.fused_moe_chunk_size is not None: - assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE) + assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE # get weights to this device weights.to_current_device() @@ -72,8 +83,7 @@ def rank_worker( rank_tensors = RankTensors.make(cfgx, pgi) # modular kernel out - mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, - rank_tensors) + mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, rank_tensors) with set_current_vllm_config(vllm_config): ref_out = reference_moe_impl(cfgx, weights, rank_tensors) @@ -88,8 +98,9 @@ def run(config: Config): weights: WeightTensors = WeightTensors.make(config) vllm_config, env_dict = config.make_env_data() - parallel_launch_with_config(config.world_size, rank_worker, vllm_config, - env_dict, config, weights) + parallel_launch_with_config( + config.world_size, rank_worker, vllm_config, env_dict, config, weights + ) Ms = [32, 64] @@ -104,14 +115,17 @@ def run(config: Config): def is_nyi_config(config: Config) -> bool: # We know these configs to be legitimate. but still fail. 
- if (config.fused_experts_type in [ - BatchedTritonExperts, BatchedTritonOrDeepGemmExperts, - TritonExperts, TritonOrDeepGemmExperts - ]): + if config.fused_experts_type in [ + BatchedTritonExperts, + BatchedTritonOrDeepGemmExperts, + TritonExperts, + TritonOrDeepGemmExperts, + ]: # The triton kernels expect both per-act-token-quant and # per-out-ch-quant or neither. - unsupported_quant_config = ((config.is_per_act_token_quant + - config.is_per_out_ch_quant) == 1) + unsupported_quant_config = ( + config.is_per_act_token_quant + config.is_per_out_ch_quant + ) == 1 return unsupported_quant_config # cutlass kernels dont support expert_maps yet. @@ -124,18 +138,23 @@ def is_nyi_config(config: Config) -> bool: @pytest.mark.parametrize("dtype", DTYPEs) @pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS) @pytest.mark.parametrize( - "combination", - product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) + "combination", product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES) +) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [2]) @meets_package_requirements def test_modular_kernel_combinations_multigpu( - k: int, n: int, e: int, dtype: torch.dtype, - quant_config: FusedMoEQuantConfig, - combination: tuple[mk.FusedMoEPrepareAndFinalize, - mk.FusedMoEPermuteExpertsUnpermute], - fused_moe_chunk_size: Optional[int], world_size: int): - + k: int, + n: int, + e: int, + dtype: torch.dtype, + quant_config: FusedMoEQuantConfig, + combination: tuple[ + mk.FusedMoEPrepareAndFinalize, mk.FusedMoEPermuteExpertsUnpermute + ], + fused_moe_chunk_size: Optional[int], + world_size: int, +): config = Config( Ms=Ms, K=k, @@ -165,17 +184,23 @@ def test_modular_kernel_combinations_multigpu( @pytest.mark.parametrize("dtype", DTYPEs) @pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS) @pytest.mark.parametrize( - "combination", - product(MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) + "combination", product(MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES) +) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [1]) @meets_package_requirements def test_modular_kernel_combinations_singlegpu( - k: int, n: int, e: int, dtype: torch.dtype, - quant_config: FusedMoEQuantConfig, - combination: tuple[mk.FusedMoEPrepareAndFinalize, - mk.FusedMoEPermuteExpertsUnpermute], - fused_moe_chunk_size: Optional[int], world_size: int): + k: int, + n: int, + e: int, + dtype: torch.dtype, + quant_config: FusedMoEQuantConfig, + combination: tuple[ + mk.FusedMoEPrepareAndFinalize, mk.FusedMoEPermuteExpertsUnpermute + ], + fused_moe_chunk_size: Optional[int], + world_size: int, +): config = Config( Ms=Ms, K=k, @@ -199,15 +224,17 @@ def test_modular_kernel_combinations_singlegpu( run(config) -if __name__ == '__main__': +if __name__ == "__main__": # Ability to test individual PrepareAndFinalize and FusedExperts combination - from .modular_kernel_tools.cli_args import (make_config, - make_config_arg_parser) - parser = make_config_arg_parser(description=( - "Run single prepare-finalize & fused-experts combination test" - "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations " #noqa: E501 - "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts" - )) + from .modular_kernel_tools.cli_args import make_config, make_config_arg_parser + + parser = make_config_arg_parser( + description=( + "Run single prepare-finalize & fused-experts 
combination test" + "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations " # noqa: E501 + "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts" + ) + ) args = parser.parse_args() config = make_config(args) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 0f1c78704642..54e5525ab0a9 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -4,6 +4,7 @@ Run `pytest tests/kernels/test_moe.py`. """ + import functools from typing import Callable, Optional, Union @@ -21,17 +22,23 @@ from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_topk, modular_triton_fused_moe) + fused_topk, + modular_triton_fused_moe, +) from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( - fused_moe as iterative_moe) + fused_moe as iterative_moe, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( - rand_marlin_weight_fp4_like) + rand_marlin_weight_fp4_like, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( - marlin_quant_fp8_torch) + marlin_quant_fp8_torch, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - awq_marlin_quantize, marlin_quantize) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - quantize_weights) + awq_marlin_quantize, + marlin_quantize, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_weights from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -64,13 +71,15 @@ def run_moe_test( if isinstance(baseline, torch.Tensor): baseline_output = baseline else: - baseline_output = baseline(a, - w1, - w2, - score, - topk, - global_num_experts=global_num_experts, - expert_map=expert_map) + baseline_output = baseline( + a, + w1, + w2, + score, + topk, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) # Pad the weight if moe padding is enabled if padding: @@ -82,34 +91,35 @@ def run_moe_test( torch._dynamo.mark_dynamic(a, 0) torch._dynamo.mark_dynamic(score, 0) - test_output = moe_fn(a, - w1, - w2, - score, - topk, - global_num_experts=global_num_experts, - expert_map=expert_map) + test_output = moe_fn( + a, + w1, + w2, + score, + topk, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) if use_cudagraph: test_output.fill_(0) stream = torch.cuda.Stream() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=stream): - test_output = moe_fn(a, - w1, - w2, - score, - topk, - global_num_experts=global_num_experts, - expert_map=expert_map) + test_output = moe_fn( + a, + w1, + w2, + score, + topk, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) torch.cuda.synchronize() graph.replay() torch.cuda.synchronize() - torch.testing.assert_close(test_output, - baseline_output, - atol=atol, - rtol=rtol) + torch.testing.assert_close(test_output, baseline_output, atol=atol, rtol=rtol) return baseline_output @@ -155,11 +165,8 @@ def test_fused_moe( if ep_size > 1: local_e = e // ep_size - e_ids = torch.randint(0, - e, (local_e, ), - device="cuda", - dtype=torch.int32) - e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32) + e_ids = torch.randint(0, e, (local_e,), device="cuda", dtype=torch.int32) + e_map = torch.full((e,), -1, device="cuda", dtype=torch.int32) e_map[e_ids] 
= torch.arange(local_e, device="cuda", dtype=torch.int32) w1 = w1[e_ids] w2 = w2[e_ids] @@ -170,13 +177,15 @@ def test_fused_moe( # Setup test functions # - m_fused_moe_fn = modular_triton_fused_moe(use_fp8_w8a8=False, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - use_mxfp4_w4a4=False, - per_act_token_quant=False, - block_shape=None) + m_fused_moe_fn = modular_triton_fused_moe( + use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + use_mxfp4_w4a4=False, + per_act_token_quant=False, + block_shape=None, + ) def m_fused_moe( a: torch.Tensor, @@ -188,13 +197,15 @@ def m_fused_moe( expert_map: Optional[torch.Tensor] = None, ) -> torch.Tensor: topk_weights, topk_ids, _ = fused_topk(a, score, topk, False) - return m_fused_moe_fn(a, - w1, - w2, - topk_weights, - topk_ids, - global_num_experts=global_num_experts, - expert_map=expert_map) + return m_fused_moe_fn( + a, + w1, + w2, + topk_weights, + topk_ids, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) fused_moe_fn = functools.partial(fused_moe, renormalize=False) @@ -218,19 +229,22 @@ def m_fused_moe( # setup code in case we are able to revisit this later. use_compile = False - use_cudagraph = (n >= 1024 and k >= 1024 - and current_platform.is_cuda_alike()) + use_cudagraph = n >= 1024 and k >= 1024 and current_platform.is_cuda_alike() with set_current_vllm_config(vllm_config): baseline_output = runner(torch_moe, iterative_moe) - runner(baseline_output, - fused_moe_fn, - use_compile=use_compile, - use_cudagraph=use_cudagraph) - runner(baseline_output, - m_fused_moe, - use_compile=use_compile, - use_cudagraph=use_cudagraph) + runner( + baseline_output, + fused_moe_fn, + use_compile=use_compile, + use_cudagraph=use_cudagraph, + ) + runner( + baseline_output, + m_fused_moe, + use_compile=use_compile, + use_cudagraph=use_cudagraph, + ) @pytest.mark.parametrize("m", [1, 32, 222]) @@ -243,9 +257,18 @@ def m_fused_moe( @pytest.mark.parametrize("group_size", [64, 128]) @pytest.mark.parametrize("has_zp", [True, False]) @pytest.mark.parametrize("weight_bits", [4, 8]) -def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, - ep_size: int, dtype: torch.dtype, group_size: int, - has_zp: bool, weight_bits: int): +def test_fused_moe_wn16( + m: int, + n: int, + k: int, + e: int, + topk: int, + ep_size: int, + dtype: torch.dtype, + group_size: int, + has_zp: bool, + weight_bits: int, +): a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 @@ -260,35 +283,40 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, w1_ref = w1.clone() w2_ref = w2.clone() - w1_qweight = torch.empty((e, 2 * n, k // pack_factor), - device="cuda", - dtype=torch.uint8) - w2_qweight = torch.empty((e, k, n // pack_factor), - device="cuda", - dtype=torch.uint8) - w1_scales = torch.empty((e, 2 * n, k // group_size), - device="cuda", - dtype=dtype) - w2_scales = torch.empty((e, k, n // group_size), - device="cuda", - dtype=dtype) - w1_qzeros = torch.empty((e, 2 * n // pack_factor, k // group_size), - device="cuda", - dtype=torch.uint8) - w2_qzeros = torch.empty((e, k // pack_factor, n // group_size), - device="cuda", - dtype=torch.uint8) + w1_qweight = torch.empty( + (e, 2 * n, k // pack_factor), device="cuda", dtype=torch.uint8 + ) + w2_qweight = torch.empty((e, k, n // pack_factor), device="cuda", dtype=torch.uint8) + w1_scales = torch.empty((e, 2 * 
n, k // group_size), device="cuda", dtype=dtype) + w2_scales = torch.empty((e, k, n // group_size), device="cuda", dtype=dtype) + w1_qzeros = torch.empty( + (e, 2 * n // pack_factor, k // group_size), device="cuda", dtype=torch.uint8 + ) + w2_qzeros = torch.empty( + (e, k // pack_factor, n // group_size), device="cuda", dtype=torch.uint8 + ) for i in range(e * 2): expert_id = i % e if i // e == 0: - w, w_ref, w_qweight, w_scales, w_qzeros = \ - w1, w1_ref, w1_qweight, w1_scales, w1_qzeros + w, w_ref, w_qweight, w_scales, w_qzeros = ( + w1, + w1_ref, + w1_qweight, + w1_scales, + w1_qzeros, + ) else: - w, w_ref, w_qweight, w_scales, w_qzeros = \ - w2, w2_ref, w2_qweight, w2_scales, w2_qzeros + w, w_ref, w_qweight, w_scales, w_qzeros = ( + w2, + w2_ref, + w2_qweight, + w2_scales, + w2_qzeros, + ) weight, qweight, scales, qzeros = quantize_weights( - w[expert_id].T, quant_type, group_size, has_zp, False) + w[expert_id].T, quant_type, group_size, has_zp, False + ) weight = weight.T qweight = qweight.T.contiguous().to(torch.uint8) scales = scales.T @@ -307,11 +335,8 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, if ep_size > 1: local_e = e // ep_size - e_ids = torch.randint(0, - e, (local_e, ), - device="cuda", - dtype=torch.int32) - e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32) + e_ids = torch.randint(0, e, (local_e,), device="cuda", dtype=torch.int32) + e_map = torch.full((e,), -1, device="cuda", dtype=torch.int32) e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32) w1_ref = w1_ref[e_ids] w2_ref = w2_ref[e_ids] @@ -325,45 +350,45 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, e_map = None with set_current_vllm_config(vllm_config): - triton_output = fused_moe(a, - w1_qweight, - w2_qweight, - score, - topk, - renormalize=False, - use_int4_w4a16=weight_bits == 4, - use_int8_w8a16=weight_bits == 8, - global_num_experts=e, - expert_map=e_map, - w1_scale=w1_scales, - w2_scale=w2_scales, - w1_zp=w1_qzeros if has_zp else None, - w2_zp=w2_qzeros if has_zp else None, - block_shape=[0, group_size]) - torch_output = torch_moe(a, - w1_ref, - w2_ref, - score, - topk, - expert_map=e_map) + triton_output = fused_moe( + a, + w1_qweight, + w2_qweight, + score, + topk, + renormalize=False, + use_int4_w4a16=weight_bits == 4, + use_int8_w8a16=weight_bits == 8, + global_num_experts=e, + expert_map=e_map, + w1_scale=w1_scales, + w2_scale=w2_scales, + w1_zp=w1_qzeros if has_zp else None, + w2_zp=w2_qzeros if has_zp else None, + block_shape=[0, group_size], + ) + torch_output = torch_moe(a, w1_ref, w2_ref, score, topk, expert_map=e_map) torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) -@pytest.mark.parametrize("dtype", - [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("padding", [True, False]) @pytest.mark.parametrize( - "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False] +) @torch.inference_mode() -def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, - monkeypatch): +def test_mixtral_moe( + dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, monkeypatch +): """Make sure our Mixtral MoE implementation agrees with the one from huggingface.""" # clear the cache before every test from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - is_rocm_aiter_moe_enabled) 
+ is_rocm_aiter_moe_enabled, + ) + is_rocm_aiter_moe_enabled.cache_clear() if use_rocm_aiter: monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") @@ -371,17 +396,16 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, if dtype == torch.float32: pytest.skip("AITER ROCm test skip for float32") - monkeypatch.setenv('RANK', "0") - monkeypatch.setenv('LOCAL_RANK', "0") - monkeypatch.setenv('WORLD_SIZE', "1") - monkeypatch.setenv('MASTER_ADDR', 'localhost') - monkeypatch.setenv('MASTER_PORT', '12345') + monkeypatch.setenv("RANK", "0") + monkeypatch.setenv("LOCAL_RANK", "0") + monkeypatch.setenv("WORLD_SIZE", "1") + monkeypatch.setenv("MASTER_ADDR", "localhost") + monkeypatch.setenv("MASTER_PORT", "12345") init_distributed_environment() # Instantiate our and huggingface's MoE blocks vllm_config.compilation_config.static_forward_context = dict() - with (set_current_vllm_config(vllm_config), - set_forward_context(None, vllm_config)): + with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): config = MixtralConfig() hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") vllm_moe = MixtralMoE( @@ -397,28 +421,31 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, # Load the weights vllm_moe.gate.weight.data[:] = hf_moe.gate.weight.data for i in range(config.num_local_experts): - weights = (hf_moe.experts[i].w1.weight.data, - hf_moe.experts[i].w3.weight.data) + weights = ( + hf_moe.experts[i].w1.weight.data, + hf_moe.experts[i].w3.weight.data, + ) vllm_moe.experts.w13_weight[i][:] = torch.cat(weights, dim=0) vllm_moe.experts.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data # Generate input batch of dimensions [batch_size, seq_len, hidden_dim] - hf_inputs = torch.randn( - (1, 64, config.hidden_size)).to(dtype).to("cuda") + hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda") # vLLM uses 1D query [num_tokens, hidden_dim] vllm_inputs = hf_inputs.flatten(0, 1) # Pad the weight if moe padding is enabled if padding: - vllm_moe.experts.w13_weight = Parameter(F.pad( - vllm_moe.experts.w13_weight, (0, 128), "constant", 0)[..., - 0:-128], - requires_grad=False) + vllm_moe.experts.w13_weight = Parameter( + F.pad(vllm_moe.experts.w13_weight, (0, 128), "constant", 0)[ + ..., 0:-128 + ], + requires_grad=False, + ) torch.cuda.empty_cache() - vllm_moe.experts.w2_weight = Parameter(F.pad( - vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., - 0:-128], - requires_grad=False) + vllm_moe.experts.w2_weight = Parameter( + F.pad(vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., 0:-128], + requires_grad=False, + ) torch.cuda.empty_cache() # Run forward passes for both MoE blocks @@ -434,19 +461,21 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, if use_rocm_aiter: # The values of rtol and atol are set based on the tests in ROCM AITER package. 
# noqa: E501 # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501 - torch.testing.assert_close(hf_states.flatten(0, 1), - vllm_states, - rtol=0.01, - atol=100) + torch.testing.assert_close( + hf_states.flatten(0, 1), vllm_states, rtol=0.01, atol=100 + ) else: - torch.testing.assert_close(hf_states.flatten(0, 1), - vllm_states, - rtol=mixtral_moe_tol[dtype], - atol=mixtral_moe_tol[dtype]) + torch.testing.assert_close( + hf_states.flatten(0, 1), + vllm_states, + rtol=mixtral_moe_tol[dtype], + atol=mixtral_moe_tol[dtype], + ) def marlin_moe_generate_valid_test_cases(): import itertools + m_list = [1, 123, 666] n_list = [128, 1024] k_list = [256, 2048] @@ -465,16 +494,24 @@ def marlin_moe_generate_valid_test_cases(): ] is_k_full_list = [True, False] - all_combinations = itertools.product(m_list, n_list, k_list, e_list, - topk_list, ep_size_list, dtype_list, - group_size_list, act_order_list, - quant_type_list, is_k_full_list) - - def is_invalid(m, n, k, e, topk, ep_size, dtype, group_size, act_order, - quant_type, is_k_full): + all_combinations = itertools.product( + m_list, + n_list, + k_list, + e_list, + topk_list, + ep_size_list, + dtype_list, + group_size_list, + act_order_list, + quant_type_list, + is_k_full_list, + ) - if quant_type == scalar_types.float8_e4m3fn and \ - group_size not in [-1, 128]: + def is_invalid( + m, n, k, e, topk, ep_size, dtype, group_size, act_order, quant_type, is_k_full + ): + if quant_type == scalar_types.float8_e4m3fn and group_size not in [-1, 128]: return False if quant_type == scalar_types.float4_e2m1f and group_size != 16: return False @@ -500,9 +537,10 @@ def is_invalid(m, n, k, e, topk, ep_size, dtype, group_size, act_order, @pytest.mark.flaky(reruns=2) -@pytest.mark.parametrize(("m, n, k, e, topk, ep_size, dtype, group_size," - "act_order, quant_type, is_k_full"), - marlin_moe_generate_valid_test_cases()) +@pytest.mark.parametrize( + ("m, n, k, e, topk, ep_size, dtype, group_size,act_order, quant_type, is_k_full"), + marlin_moe_generate_valid_test_cases(), +) @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_fused_marlin_moe( m: int, @@ -552,7 +590,7 @@ def test_fused_marlin_moe( if ep_size > 1: local_e = e // ep_size e_ids = torch.randperm(e, device="cuda", dtype=torch.int32)[:local_e] - e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32) + e_map = torch.full((e,), -1, device="cuda", dtype=torch.int32) e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32) w1 = w1[e_ids] w2 = w2[e_ids] @@ -569,22 +607,23 @@ def test_fused_marlin_moe( for i in range(w1.shape[0]): if quant_type == scalar_types.float4_e2m1f: - w_ref1, qweight1, scales1, global_scale1 = \ - rand_marlin_weight_fp4_like(w1[i], group_size) + w_ref1, qweight1, scales1, global_scale1 = rand_marlin_weight_fp4_like( + w1[i], group_size + ) w_ref1_l.append(w_ref1.T) qweight1_l.append(qweight1) scales1_l.append(scales1) global_scale1_l.append(global_scale1) elif quant_type == scalar_types.float8_e4m3fn: - w_ref1, qweight1, scales1 = marlin_quant_fp8_torch( - w1[i], group_size) + w_ref1, qweight1, scales1 = marlin_quant_fp8_torch(w1[i], group_size) w_ref1_l.append(w_ref1.T) qweight1_l.append(qweight1) scales1_l.append(scales1) elif has_zp: w_ref1, qweight1, scales1, zeros1 = awq_marlin_quantize( - w1[i].transpose(1, 0), quant_type, group_size) + w1[i].transpose(1, 0), quant_type, group_size + ) w_ref1_l.append(w_ref1.T) qweight1_l.append(qweight1) @@ -592,9 +631,9 @@ def 
test_fused_marlin_moe( zeros1_l.append(zeros1) else: test_perm = torch.randperm(k) - w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = \ - marlin_quantize(w1[i].transpose(1, 0), quant_type, - group_size, act_order, test_perm) + w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize( + w1[i].transpose(1, 0), quant_type, group_size, act_order, test_perm + ) w_ref1_l.append(w_ref1.T) qweight1_l.append(qweight1) @@ -620,22 +659,23 @@ def test_fused_marlin_moe( for i in range(w2.shape[0]): if quant_type == scalar_types.float4_e2m1f: - w_ref2, qweight2, scales2, global_scale2 = \ - rand_marlin_weight_fp4_like(w2[i], group_size) + w_ref2, qweight2, scales2, global_scale2 = rand_marlin_weight_fp4_like( + w2[i], group_size + ) w_ref2_l.append(w_ref2.T) qweight2_l.append(qweight2) scales2_l.append(scales2) global_scale2_l.append(global_scale2) elif quant_type == scalar_types.float8_e4m3fn: - w_ref2, qweight2, scales2 = marlin_quant_fp8_torch( - w2[i], group_size) + w_ref2, qweight2, scales2 = marlin_quant_fp8_torch(w2[i], group_size) w_ref2_l.append(w_ref2.T) qweight2_l.append(qweight2) scales2_l.append(scales2) elif has_zp: w_ref2, qweight2, scales2, zeros2 = awq_marlin_quantize( - w2[i].transpose(1, 0), quant_type, group_size) + w2[i].transpose(1, 0), quant_type, group_size + ) w_ref2_l.append(w_ref2.T) qweight2_l.append(qweight2) @@ -643,9 +683,9 @@ def test_fused_marlin_moe( zeros2_l.append(zeros2) else: test_perm = torch.randperm(n) - w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = \ - marlin_quantize(w2[i].transpose(1, 0), quant_type, - group_size, act_order, test_perm) + w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize( + w2[i].transpose(1, 0), quant_type, group_size, act_order, test_perm + ) w_ref2_l.append(w_ref2.T) qweight2_l.append(qweight2) @@ -666,12 +706,7 @@ def test_fused_marlin_moe( topk_weights, topk_ids, _ = fused_topk(a, score, topk, False) with set_current_vllm_config(vllm_config): - torch_output = torch_moe(a, - w_ref1, - w_ref2, - score, - topk, - expert_map=e_map) + torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, expert_map=e_map) marlin_output = torch.ops.vllm.fused_marlin_moe( a, @@ -693,7 +728,8 @@ def test_fused_marlin_moe( w1_zeros=zeros1, w2_zeros=zeros2, quant_type_id=quant_type.id, - is_k_full=is_k_full) + is_k_full=is_k_full, + ) torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0) @@ -701,34 +737,36 @@ def test_fused_marlin_moe( def test_moe_align_block_size_opcheck(): num_experts = 4 block_size = 4 - topk_ids = torch.randint(0, - num_experts, (3, 4), - dtype=torch.int32, - device='cuda') + topk_ids = torch.randint(0, num_experts, (3, 4), dtype=torch.int32, device="cuda") max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids = torch.empty((max_num_tokens_padded, ), - dtype=torch.int32, - device=topk_ids.device) + sorted_ids = torch.empty( + (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device + ) sorted_ids.fill_(topk_ids.numel()) max_num_m_blocks = max_num_tokens_padded // block_size - expert_ids = torch.empty((max_num_m_blocks, ), - dtype=torch.int32, - device=topk_ids.device) - num_tokens_post_pad = torch.empty((1), - dtype=torch.int32, - device=topk_ids.device) - - opcheck(torch.ops._moe_C.moe_align_block_size, - (topk_ids, num_experts, block_size, sorted_ids, expert_ids, - num_tokens_post_pad)) + expert_ids = torch.empty( + (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device + ) + num_tokens_post_pad = torch.empty((1), 
dtype=torch.int32, device=topk_ids.device) + + opcheck( + torch.ops._moe_C.moe_align_block_size, + ( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ), + ) @pytest.mark.parametrize("m", [1, 33, 64, 222]) @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("dtype", - [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype): input = torch.randn((m, topk, k), device="cuda", dtype=dtype) diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index e980422a7b97..e491b2d7898c 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ -7,7 +7,8 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( - moe_align_block_size_triton) + moe_align_block_size_triton, +) @pytest.mark.parametrize( @@ -26,28 +27,32 @@ ], # num_tokens [1, 4, 16, 64], # topk [64, 160, 256, 257, 260, 264], # num_experts - )), + ) + ), ) -def test_moe_align_block_size_compare_implementations(block_size, num_tokens, - topk, num_experts): - topk_ids = torch.stack([ - torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk] - for _ in range(num_tokens) - ]) +def test_moe_align_block_size_compare_implementations( + block_size, num_tokens, topk, num_experts +): + topk_ids = torch.stack( + [ + torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk] + for _ in range(num_tokens) + ] + ) max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids_cuda = torch.empty((max_num_tokens_padded, ), - dtype=torch.int32, - device=topk_ids.device) + sorted_ids_cuda = torch.empty( + (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device + ) sorted_ids_cuda.fill_(topk_ids.numel()) max_num_m_blocks = max_num_tokens_padded // block_size - expert_ids_cuda = torch.zeros((max_num_m_blocks, ), - dtype=torch.int32, - device=topk_ids.device) - num_tokens_post_pad_cuda = torch.empty((1), - dtype=torch.int32, - device=topk_ids.device) + expert_ids_cuda = torch.zeros( + (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device + ) + num_tokens_post_pad_cuda = torch.empty( + (1), dtype=torch.int32, device=topk_ids.device + ) sorted_ids_triton = torch.empty_like(sorted_ids_cuda) sorted_ids_triton.fill_(topk_ids.numel()) @@ -76,14 +81,15 @@ def test_moe_align_block_size_compare_implementations(block_size, num_tokens, f"Expert IDs mismatch for block_size={block_size}, " f"num_tokens={num_tokens}, topk={topk}\n" f"CUDA expert_ids: {expert_ids_cuda}\n" - f"Triton expert_ids: {expert_ids_triton}") + f"Triton expert_ids: {expert_ids_triton}" + ) - assert torch.allclose( - num_tokens_post_pad_cuda, num_tokens_post_pad_triton), ( - f"Num tokens post pad mismatch for block_size={block_size}, " - f"num_tokens={num_tokens}, topk={topk}\n" - f"CUDA num_tokens_post_pad: {num_tokens_post_pad_cuda}\n" - f"Triton num_tokens_post_pad: {num_tokens_post_pad_triton}") + assert torch.allclose(num_tokens_post_pad_cuda, num_tokens_post_pad_triton), ( + f"Num tokens post pad mismatch for block_size={block_size}, " + f"num_tokens={num_tokens}, topk={topk}\n" + f"CUDA num_tokens_post_pad: {num_tokens_post_pad_cuda}\n" + f"Triton 
num_tokens_post_pad: {num_tokens_post_pad_triton}" + ) if __name__ == "__main__": diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 7cc83b512c8b..403f018be61f 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -14,7 +14,10 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.layer import determine_expert_map from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( - moe_permute, moe_permute_unpermute_supported, moe_unpermute) + moe_permute, + moe_permute_unpermute_supported, + moe_unpermute, +) from vllm.platforms import current_platform NUM_EXPERTS = [16, 64] @@ -23,30 +26,32 @@ current_platform.seed_everything(0) -def torch_permute(hidden_states: torch.Tensor, - topk_ids: torch.Tensor, - token_expert_indices: torch.Tensor, - topk: int, - n_expert: int, - n_local_expert: int, - start_expert: int, - expert_map: Optional[torch.Tensor] = None, - align_block_size: Optional[int] = None, - fill_invalid_expert: int = -1) -> list[torch.Tensor]: +def torch_permute( + hidden_states: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indices: torch.Tensor, + topk: int, + n_expert: int, + n_local_expert: int, + start_expert: int, + expert_map: Optional[torch.Tensor] = None, + align_block_size: Optional[int] = None, + fill_invalid_expert: int = -1, +) -> list[torch.Tensor]: n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1] if expert_map is not None: - is_local_expert = (expert_map[topk_ids] != -1) - not_local_expert = (expert_map[topk_ids] == -1) - topk_ids = is_local_expert * ( - topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert) + is_local_expert = expert_map[topk_ids] != -1 + not_local_expert = expert_map[topk_ids] == -1 + topk_ids = is_local_expert * (topk_ids - start_expert) + not_local_expert * ( + topk_ids + n_expert + ) - sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(), - stable=True) + sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(), stable=True) dst_row_id2src_row_id_map = token_expert_indices.flatten()[sorted_indices] - expert_first_token_offset = torch.zeros(n_local_expert + 1, - dtype=torch.int64, - device="cuda") + expert_first_token_offset = torch.zeros( + n_local_expert + 1, dtype=torch.int64, device="cuda" + ) idx = 0 for i in range(0, n_local_expert): cnt = 0 @@ -58,101 +63,116 @@ def torch_permute(hidden_states: torch.Tensor, _, src2dst_idx = torch.sort(dst_row_id2src_row_id_map) valid_row_idx = [] if align_block_size is None: - - permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map % - n_token, ...] + permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map % n_token, ...] 
permuted_row_size = permuted_hidden_states.shape[0] - m_indices = torch.empty(permuted_row_size, - device="cuda", - dtype=torch.int32).fill_(fill_invalid_expert) + m_indices = torch.empty( + permuted_row_size, device="cuda", dtype=torch.int32 + ).fill_(fill_invalid_expert) for i in range(1, n_local_expert + 1): first_token_offset = expert_first_token_offset[i - 1] last_token_offset = expert_first_token_offset[i] m_indices[first_token_offset:last_token_offset] = i - 1 src_row_id2dst_row_id_map = torch.arange( - 0, n_token * topk, device="cuda", - dtype=torch.int32)[src2dst_idx].reshape((n_token, topk)) + 0, n_token * topk, device="cuda", dtype=torch.int32 + )[src2dst_idx].reshape((n_token, topk)) valid_row_idx += [i for i in range(expert_first_token_offset[-1])] return [ - permuted_hidden_states, expert_first_token_offset, - src_row_id2dst_row_id_map, m_indices, valid_row_idx + permuted_hidden_states, + expert_first_token_offset, + src_row_id2dst_row_id_map, + m_indices, + valid_row_idx, ] else: - permuted_row_size = (topk * n_token + n_expert * - (align_block_size - 1) + align_block_size - - 1) // align_block_size * align_block_size - permuted_hidden_states = torch.empty((permuted_row_size, n_hidden), - device="cuda", - dtype=hidden_states.dtype) - align_src_row_id2dst_row_id = torch.empty(n_token * topk, - device="cuda", - dtype=torch.int32) - align_expert_first_token_offset = torch.zeros_like( - expert_first_token_offset) - m_indices = torch.empty(permuted_row_size, - device="cuda", - dtype=torch.int32).fill_(fill_invalid_expert) + permuted_row_size = ( + (topk * n_token + n_expert * (align_block_size - 1) + align_block_size - 1) + // align_block_size + * align_block_size + ) + permuted_hidden_states = torch.empty( + (permuted_row_size, n_hidden), device="cuda", dtype=hidden_states.dtype + ) + align_src_row_id2dst_row_id = torch.empty( + n_token * topk, device="cuda", dtype=torch.int32 + ) + align_expert_first_token_offset = torch.zeros_like(expert_first_token_offset) + m_indices = torch.empty( + permuted_row_size, device="cuda", dtype=torch.int32 + ).fill_(fill_invalid_expert) # get align_permuted_hidden_states, # valid row_idx and align_expert_first_token_offset for i in range(1, n_local_expert + 1): first_token_offset = expert_first_token_offset[i - 1] last_token_offset = expert_first_token_offset[i] n_token_in_expert = last_token_offset - first_token_offset - align_expert_first_token_offset[ - i] = align_expert_first_token_offset[ - i - 1] + (n_token_in_expert + align_block_size - - 1) // align_block_size * align_block_size + align_expert_first_token_offset[i] = ( + align_expert_first_token_offset[i - 1] + + (n_token_in_expert + align_block_size - 1) + // align_block_size + * align_block_size + ) align_first_token_offset = align_expert_first_token_offset[i - 1] align_last_token_offset = align_expert_first_token_offset[i] - dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[ - first_token_offset:first_token_offset + - n_token_in_expert] % n_token + dst_row_id2src_row_id_in_expert = ( + dst_row_id2src_row_id_map[ + first_token_offset : first_token_offset + n_token_in_expert + ] + % n_token + ) # store token in current expert with align_first_token_offset - permuted_hidden_states[align_first_token_offset:\ - align_first_token_offset+n_token_in_expert,\ - ...] = hidden_states[\ - dst_row_id2src_row_id_in_expert, ...] 
+ permuted_hidden_states[ + align_first_token_offset : align_first_token_offset + n_token_in_expert, + ..., + ] = hidden_states[dst_row_id2src_row_id_in_expert, ...] # set current expert m_indices m_indices[align_first_token_offset:align_last_token_offset] = i - 1 valid_row_idx += [ - i for i in range(align_first_token_offset, - align_first_token_offset + n_token_in_expert) + i + for i in range( + align_first_token_offset, + align_first_token_offset + n_token_in_expert, + ) ] # get align_src_row_id2dst_row_id for i in range(n_token * topk): eid = sorted_topk_ids[i] - if (eid >= n_local_expert): + if eid >= n_local_expert: # check token not in local expert - align_src_row_id2dst_row_id[ - i] = align_expert_first_token_offset[-1] + align_src_row_id2dst_row_id[i] = align_expert_first_token_offset[-1] continue first_token_offset = expert_first_token_offset[eid] align_first_token_offset = align_expert_first_token_offset[eid] token_offset = i - first_token_offset - align_src_row_id2dst_row_id[ - i] = align_first_token_offset + token_offset - align_src_row_id2dst_row_id = align_src_row_id2dst_row_id[\ - src2dst_idx].reshape((n_token, topk)) + align_src_row_id2dst_row_id[i] = align_first_token_offset + token_offset + align_src_row_id2dst_row_id = align_src_row_id2dst_row_id[src2dst_idx].reshape( + (n_token, topk) + ) return [ - permuted_hidden_states, align_expert_first_token_offset, - align_src_row_id2dst_row_id, m_indices, valid_row_idx + permuted_hidden_states, + align_expert_first_token_offset, + align_src_row_id2dst_row_id, + m_indices, + valid_row_idx, ] -def torch_unpermute(permuted_hidden_states: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - token_expert_indices: torch.Tensor, - src_row_id2dst_row_id_map: torch.Tensor, - valid_row_idx: torch.Tensor, topk: int, - n_expert: int) -> torch.Tensor: +def torch_unpermute( + permuted_hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indices: torch.Tensor, + src_row_id2dst_row_id_map: torch.Tensor, + valid_row_idx: torch.Tensor, + topk: int, + n_expert: int, +) -> torch.Tensor: # ignore invalid row - mask = torch.zeros(permuted_hidden_states.shape[0], - dtype=bool, - device="cuda") + mask = torch.zeros(permuted_hidden_states.shape[0], dtype=bool, device="cuda") mask[valid_row_idx] = True permuted_hidden_states[~mask] = 0 - idx = src_row_id2dst_row_id_map.flatten()[ - token_expert_indices.flatten()].reshape(token_expert_indices.shape) + idx = src_row_id2dst_row_id_map.flatten()[token_expert_indices.flatten()].reshape( + token_expert_indices.shape + ) output = permuted_hidden_states[idx, ...] 
* topk_weights[..., None] output = output.sum(dim=1).to(permuted_hidden_states.dtype) return output @@ -165,25 +185,31 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor, @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("ep_size", EP_SIZE) @pytest.mark.parametrize("align_block_size", [None, 128]) -def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, - n_expert: int, ep_size: int, dtype: torch.dtype, - align_block_size: Optional[int]): +def test_moe_permute_unpermute( + n_token: int, + n_hidden: int, + topk: int, + n_expert: int, + ep_size: int, + dtype: torch.dtype, + align_block_size: Optional[int], +): if not moe_permute_unpermute_supported(): pytest.skip("moe_permute_unpermute is not supported on this platform.") fill_invalid_expert = 0 ep_rank = np.random.randint(0, ep_size) expert_map = None n_local_expert = n_expert - if (ep_size != 1): - n_local_expert, expert_map = determine_expert_map( - ep_size, ep_rank, n_expert) + if ep_size != 1: + n_local_expert, expert_map = determine_expert_map(ep_size, ep_rank, n_expert) expert_map = expert_map.cuda() start_expert = n_local_expert * ep_rank current_platform.seed_everything(0) hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype) gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype) topk_weights, topk_ids, token_expert_indices = fused_topk( - hidden_states, gating_output, topk, False) + hidden_states, gating_output, topk, False + ) gold0, gold1, gold2, gold3, valid_row_idx = torch_permute( hidden_states, topk_ids, @@ -194,12 +220,21 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, start_expert, expert_map=expert_map, align_block_size=align_block_size, - fill_invalid_expert=fill_invalid_expert) + fill_invalid_expert=fill_invalid_expert, + ) result0, result1, result2, result3 = moe_permute( - hidden_states, topk_weights, topk_ids, token_expert_indices, topk, - n_expert, n_local_expert, expert_map, align_block_size, - fill_invalid_expert) + hidden_states, + topk_weights, + topk_ids, + token_expert_indices, + topk, + n_expert, + n_local_expert, + expert_map, + align_block_size, + fill_invalid_expert, + ) # check expert_first_token_offset torch.testing.assert_close(gold1, result1, atol=0, rtol=0) @@ -208,19 +243,33 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, # check mindice torch.testing.assert_close(gold3, result3, atol=0, rtol=0) # check permuted_hidden_states, only valid token - torch.testing.assert_close(gold0[valid_row_idx], - result0[valid_row_idx], - atol=0, - rtol=0) + torch.testing.assert_close( + gold0[valid_row_idx], result0[valid_row_idx], atol=0, rtol=0 + ) # add a random tensor to simulate group gemm result0 = 0.5 * result0 + torch.randn_like(result0) - result4 = moe_unpermute(result0, topk_weights, topk_ids, result2, result1, - topk, n_expert, n_local_expert) - gold4 = torch_unpermute(result0, topk_weights, topk_ids, - token_expert_indices, result2, valid_row_idx, topk, - n_local_expert) + result4 = moe_unpermute( + result0, + topk_weights, + topk_ids, + result2, + result1, + topk, + n_expert, + n_local_expert, + ) + gold4 = torch_unpermute( + result0, + topk_weights, + topk_ids, + token_expert_indices, + result2, + valid_row_idx, + topk, + n_local_expert, + ) # check unpermuted hidden torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0) diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index 824b072a9f93..5cf8e1bd6e94 
100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -9,9 +9,9 @@ import torch from packaging import version -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( - "quark") is not None and version.parse( - importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') +QUARK_MXFP4_AVAILABLE = importlib.util.find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark") +) >= version.parse("0.8.99") @dataclass @@ -20,22 +20,25 @@ class ModelCase: tp: int -@pytest.mark.parametrize('model_case', [ - ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1), - ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8), - ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1) -]) -@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, - reason="amd-quark>=0.9 is not available") +@pytest.mark.parametrize( + "model_case", + [ + ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1), + ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8), + ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1), + ], +) +@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available") def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): if torch.cuda.device_count() < model_case.tp: - pytest.skip(f"This test requires >={model_case.tp} gpus, got only " - f"{torch.cuda.device_count()}") - - with vllm_runner(model_case.model_id, - tensor_parallel_size=model_case.tp, - load_format="dummy") as llm: - + pytest.skip( + f"This test requires >={model_case.tp} gpus, got only " + f"{torch.cuda.device_count()}" + ) + + with vllm_runner( + model_case.model_id, tensor_parallel_size=model_case.tp, load_format="dummy" + ) as llm: # TODO: llm.apply_model(check_model) currently relies on V0 internals. # Re-enable this later. 
# def check_model(model): @@ -52,6 +55,5 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4": # llm.apply_model(check_model) - output = llm.generate_greedy("Today I am in the French Alps and", - max_tokens=20) - assert output \ No newline at end of file + output = llm.generate_greedy("Today I am in the French Alps and", max_tokens=20) + assert output diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index 3f5412e75821..f51e081984a5 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -3,9 +3,11 @@ import pytest import torch -from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, - FLOAT8_E4M3_MAX, - dequantize_nvfp4_to_dtype) +from tests.kernels.quantization.nvfp4_utils import ( + FLOAT4_E2M1_MAX, + FLOAT8_E4M3_MAX, + dequantize_nvfp4_to_dtype, +) from tests.kernels.utils import torch_moe from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config @@ -14,8 +16,9 @@ from vllm.platforms import current_platform if not current_platform.has_device_capability(100): - pytest.skip("Nvfp4 Requires compute capability of 10 or above.", - allow_module_level=True) + pytest.skip( + "Nvfp4 Requires compute capability of 10 or above.", allow_module_level=True + ) MNK_FACTORS = [ (2, 1024, 1024), @@ -36,36 +39,34 @@ @pytest.mark.parametrize("topk", [1, 6, 8]) @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16]) @torch.inference_mode() -def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, - dtype: torch.dtype): +def test_cutlass_fp4_moe_no_graph( + m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype +): current_platform.seed_everything(7) with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 quant_blocksize = 16 round_up = lambda x, y: (x + y - 1) // y * y sf_w1_2n = round_up(2 * n, 128) sf_w1_k = round_up(k // quant_blocksize, 4) - w1_blockscale = torch.empty((e, sf_w1_2n, sf_w1_k), - device="cuda", - dtype=torch.float8_e4m3fn) + w1_blockscale = torch.empty( + (e, sf_w1_2n, sf_w1_k), device="cuda", dtype=torch.float8_e4m3fn + ) w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 sf_w2_k = round_up(k, 128) sf_w2_n = round_up(n // quant_blocksize, 4) - w2_blockscale = torch.empty((e, sf_w2_k, sf_w2_n), - device="cuda", - dtype=torch.float8_e4m3fn) + w2_blockscale = torch.empty( + (e, sf_w2_k, sf_w2_n), device="cuda", dtype=torch.float8_e4m3fn + ) - w1_q = torch.empty((e, 2 * n, k // 2), - device="cuda", - dtype=torch.uint8) + w1_q = torch.empty((e, 2 * n, k // 2), device="cuda", dtype=torch.uint8) w2_q = torch.empty((e, k, n // 2), device="cuda", dtype=torch.uint8) - w1_gs = torch.empty((e, ), device="cuda", dtype=torch.float32) - w2_gs = torch.empty((e, ), device="cuda", dtype=torch.float32) + w1_gs = torch.empty((e,), device="cuda", dtype=torch.float32) + w2_gs = torch.empty((e,), device="cuda", dtype=torch.float32) for expert in range(e): w1_amax = torch.abs(w1).max().to(torch.float32) @@ -74,19 +75,18 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax w1_q[expert], 
w1_blockscale[expert] = ops.scaled_fp4_quant( - w1[expert], w1_gs[expert]) + w1[expert], w1_gs[expert] + ) w2_q[expert], w2_blockscale[expert] = ops.scaled_fp4_quant( - w2[expert], w2_gs[expert]) + w2[expert], w2_gs[expert] + ) score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids, _ = fused_topk(a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) - a1_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) - a2_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) + a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32) + a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32) cutlass_output = cutlass_moe_fp4( a=a, @@ -108,40 +108,44 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, ) # Reference check: - a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / - torch.amax(a.flatten(), dim=-1)).to(torch.float32) + a_global_scale = ( + (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a.flatten(), dim=-1) + ).to(torch.float32) a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a, a_global_scale) _, m_k = a_fp4.shape - a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4, - a_scale_interleaved, - a_global_scale, - dtype=a.dtype, - device=a.device, - block_size=quant_blocksize) + a_in_dtype = dequantize_nvfp4_to_dtype( + a_fp4, + a_scale_interleaved, + a_global_scale, + dtype=a.dtype, + device=a.device, + block_size=quant_blocksize, + ) w1_d = torch.empty((e, 2 * n, k), device="cuda", dtype=dtype) w2_d = torch.empty((e, k, n), device="cuda", dtype=dtype) for idx in range(0, e): - w1_d[idx] = dequantize_nvfp4_to_dtype(w1_q[idx], - w1_blockscale[idx], - w1_gs[idx], - dtype=w1.dtype, - device=w1.device, - block_size=quant_blocksize) - w2_d[idx] = dequantize_nvfp4_to_dtype(w2_q[idx], - w2_blockscale[idx], - w2_gs[idx], - dtype=w2.dtype, - device=w2.device, - block_size=quant_blocksize) + w1_d[idx] = dequantize_nvfp4_to_dtype( + w1_q[idx], + w1_blockscale[idx], + w1_gs[idx], + dtype=w1.dtype, + device=w1.device, + block_size=quant_blocksize, + ) + w2_d[idx] = dequantize_nvfp4_to_dtype( + w2_q[idx], + w2_blockscale[idx], + w2_gs[idx], + dtype=w2.dtype, + device=w2.device, + block_size=quant_blocksize, + ) torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk) - torch.testing.assert_close(torch_output, - cutlass_output, - atol=1e-1, - rtol=1e-1) + torch.testing.assert_close(torch_output, cutlass_output, atol=1e-1, rtol=1e-1) if __name__ == "__main__": diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 77adc89ea9da..2403450d1bdf 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -11,8 +11,7 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk -from vllm.model_executor.layers.fused_moe.modular_kernel import ( - FusedMoEModularKernel) +from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform from vllm.utils import cdiv @@ -20,9 +19,13 @@ try: from pplx_kernels import AllToAll - from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, - nvshmem_finalize, nvshmem_get_unique_id, - nvshmem_init) + from pplx_kernels.nvshmem import ( + nvshmem_alloc_empty_unique_id, + nvshmem_finalize, + nvshmem_get_unique_id, + nvshmem_init, 
+ ) + has_pplx = True except ImportError: has_pplx = False @@ -46,12 +49,12 @@ def chunk_by_rank(t, r, w): chunk = rank_chunk(num, r, w) rem = num % w if rem == 0 or r < rem: - return t[(r * chunk):(r + 1) * chunk].contiguous() + return t[(r * chunk) : (r + 1) * chunk].contiguous() else: long_chunks = (num // w + 1) * rem short_chunks = (r - rem) * chunk start = long_chunks + short_chunks - return t[start:start + chunk].contiguous() + return t[start : start + chunk].contiguous() def pplx_cutlass_moe( @@ -71,7 +74,9 @@ def pplx_cutlass_moe( group_name: Optional[str], ): from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( - PplxPrepareAndFinalize) + PplxPrepareAndFinalize, + ) + assert torch.cuda.current_device() == pgi.local_rank num_tokens, hidden_dim = a.shape @@ -122,35 +127,34 @@ def pplx_cutlass_moe( ata, max_num_tokens=max_num_tokens, num_local_experts=num_local_experts, - num_dispatchers=num_dispatchers) - - ab_strides1 = torch.full((num_local_experts, ), - hidden_dim, - device="cuda", - dtype=torch.int64) - ab_strides2 = torch.full((num_local_experts, ), - intermediate_dim, - device="cuda", - dtype=torch.int64) - c_strides1 = torch.full((num_local_experts, ), - 2 * intermediate_dim, - device="cuda", - dtype=torch.int64) - c_strides2 = torch.full((num_local_experts, ), - hidden_dim, - device="cuda", - dtype=torch.int64) - - experts = CutlassExpertsFp8(num_local_experts, - out_dtype, - per_act_token, - per_out_ch, - ab_strides1, - ab_strides2, - c_strides1, - c_strides2, - num_dispatchers=num_dispatchers, - use_batched_format=True) + num_dispatchers=num_dispatchers, + ) + + ab_strides1 = torch.full( + (num_local_experts,), hidden_dim, device="cuda", dtype=torch.int64 + ) + ab_strides2 = torch.full( + (num_local_experts,), intermediate_dim, device="cuda", dtype=torch.int64 + ) + c_strides1 = torch.full( + (num_local_experts,), 2 * intermediate_dim, device="cuda", dtype=torch.int64 + ) + c_strides2 = torch.full( + (num_local_experts,), hidden_dim, device="cuda", dtype=torch.int64 + ) + + experts = CutlassExpertsFp8( + num_local_experts, + out_dtype, + per_act_token, + per_out_ch, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, + num_dispatchers=num_dispatchers, + use_batched_format=True, + ) fused_cutlass_experts = FusedMoEModularKernel( prepare_finalize, @@ -158,10 +162,10 @@ def pplx_cutlass_moe( ) a_chunk = chunk_by_rank(a, rank, world_size).to(device) - chunk_topk_weight = chunk_by_rank(topk_weights, rank, - world_size).to(device) - chunk_topk_ids = chunk_by_rank(topk_ids, rank, - world_size).to(torch.uint32).to(device) + chunk_topk_weight = chunk_by_rank(topk_weights, rank, world_size).to(device) + chunk_topk_ids = ( + chunk_by_rank(topk_ids, rank, world_size).to(torch.uint32).to(device) + ) out = fused_cutlass_experts( a_chunk, @@ -170,11 +174,13 @@ def pplx_cutlass_moe( chunk_topk_weight, chunk_topk_ids, global_num_experts=num_experts, - expert_map=None, #TODO + expert_map=None, # TODO w1_scale=chunk_by_rank(w1_scale, rank, world_size), w2_scale=chunk_by_rank(w2_scale, rank, world_size), a1_scale=chunk_by_rank(a1_scale, rank, world_size) - if per_act_token else a1_scale[rank]) + if per_act_token + else a1_scale[rank], + ) torch.cuda.synchronize() @@ -209,35 +215,48 @@ def _pplx_moe( ): try: if use_internode: - uid = nvshmem_get_unique_id( - ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + uid = ( + nvshmem_get_unique_id() + if pgi.rank == 0 + else nvshmem_alloc_empty_unique_id() + ) torch.distributed.broadcast(uid, src=0) nvshmem_init(uid, 
pgi.rank, pgi.world_size) else: group_ranks = list(range(pgi.world_size)) - cpu_group = torch.distributed.new_group(group_ranks, - backend="gloo") + cpu_group = torch.distributed.new_group(group_ranks, backend="gloo") group_name = cpu_group.group_name with set_current_vllm_config(vllm_config): - torch_output = torch_experts(a_full, w1_full, w2_full, - topk_weights, topk_ids) - pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale, - w2_scale, topk_weights, topk_ids, - a1_scale, out_dtype, per_act_token, - per_out_ch, group_name) - - torch_output = chunk_by_rank(torch_output, pgi.rank, - pgi.world_size).to(pplx_output.device) + torch_output = torch_experts( + a_full, w1_full, w2_full, topk_weights, topk_ids + ) + pplx_output = pplx_cutlass_moe( + pgi, + dp_size, + a, + w1, + w2, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + a1_scale, + out_dtype, + per_act_token, + per_out_ch, + group_name, + ) + + torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to( + pplx_output.device + ) # Uncomment if more debugging is needed # print("PPLX OUT:", pplx_output) # print("TORCH OUT:", torch_output) - torch.testing.assert_close(pplx_output, - torch_output, - atol=0.05, - rtol=0) + torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0) finally: if use_internode: nvshmem_finalize() @@ -250,12 +269,14 @@ def _pplx_moe( @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) -@pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]]) +@pytest.mark.parametrize("world_dp_size", [[2, 1]]) # , [4, 2]]) @pytest.mark.parametrize("use_internode", [False]) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( - current_platform.get_device_capability()), - reason="Grouped gemm is not supported on this GPU type.") + current_platform.get_device_capability() + ), + reason="Grouped gemm is not supported on this GPU type.", +) @requires_pplx def test_cutlass_moe_pplx( m: int, @@ -271,7 +292,6 @@ def test_cutlass_moe_pplx( current_platform.seed_everything(7) with set_current_vllm_config(vllm_config): - dtype = torch.half a = torch.randn((m, k), device="cuda", dtype=dtype) / 10.0 @@ -281,22 +301,18 @@ def test_cutlass_moe_pplx( n_b_scales = 2 * n if per_out_ch else 1 k_b_scales = k if per_out_ch else 1 - w1_q = torch.empty((e, 2 * n, k), - device="cuda", - dtype=torch.float8_e4m3fn) + w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn) w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn) - w1_scale = torch.empty((e, n_b_scales, 1), - device="cuda", - dtype=torch.float32) - w2_scale = torch.empty((e, k_b_scales, 1), - device="cuda", - dtype=torch.float32) + w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32) for expert in range(e): w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( - w1[expert], use_per_token_if_dynamic=per_out_ch) + w1[expert], use_per_token_if_dynamic=per_out_ch + ) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( - w2[expert], use_per_token_if_dynamic=per_out_ch) + w2[expert], use_per_token_if_dynamic=per_out_ch + ) w1_d = torch.empty_like(w1) w2_d = torch.empty_like(w2) @@ -305,19 +321,35 @@ def test_cutlass_moe_pplx( w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half() score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, 
topk_ids, _ = fused_topk(a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) world_size, dp_size = world_dp_size - a_scale1 = torch.randn( - (m if per_act_token else 1, 1), device="cuda", - dtype=torch.float32) / 10.0 + a_scale1 = ( + torch.randn( + (m if per_act_token else 1, 1), device="cuda", dtype=torch.float32 + ) + / 10.0 + ) if not per_act_token: a_scale1 = a_scale1.repeat(world_size, 1) - parallel_launch(world_size, _pplx_moe, dp_size, a, w1_q, w2_q, - w1_scale, w2_scale, topk_weights, topk_ids, a_scale1, - dtype, a, w1_d, w2_d, per_act_token, per_out_ch, - use_internode) + parallel_launch( + world_size, + _pplx_moe, + dp_size, + a, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + a_scale1, + dtype, + a, + w1_d, + w2_d, + per_act_token, + per_out_ch, + use_internode, + ) diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index f7a661b4bc7b..5237703d4389 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -4,6 +4,7 @@ Run `pytest tests/kernels/test_pplx_moe.py`. """ + import itertools import textwrap import traceback @@ -14,9 +15,13 @@ try: from pplx_kernels import AllToAll - from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, - nvshmem_finalize, nvshmem_get_unique_id, - nvshmem_init) + from pplx_kernels.nvshmem import ( + nvshmem_alloc_empty_unique_id, + nvshmem_finalize, + nvshmem_get_unique_id, + nvshmem_init, + ) + has_pplx = True except ImportError: has_pplx = False @@ -27,13 +32,12 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe import fused_topk, override_config from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts) +from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config -from vllm.model_executor.layers.fused_moe.modular_kernel import ( - FusedMoEModularKernel) +from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( - TopKWeightAndReduceDelegate) + TopKWeightAndReduceDelegate, +) from vllm.platforms import current_platform from vllm.utils import round_up @@ -46,7 +50,7 @@ PPLX_COMBOS = [ # TODO: figure out why this fails, seems to be test problem - #(1, 128, 128), + # (1, 128, 128), (2, 128, 512), (3, 1024, 2048), (4, 128, 128), @@ -78,17 +82,16 @@ def torch_prepare( num_tokens, hidden_dim = a.shape topk = topk_ids.shape[1] - tokens_per_expert = torch.bincount(topk_ids.view(-1), - minlength=num_experts) + tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts) assert tokens_per_expert.numel() == num_experts if max_num_tokens is None: max_num_tokens = int(tokens_per_expert.max().item()) - b_a = torch.zeros((num_experts, max_num_tokens, hidden_dim), - dtype=a.dtype, - device=a.device) + b_a = torch.zeros( + (num_experts, max_num_tokens, hidden_dim), dtype=a.dtype, device=a.device + ) token_counts = torch.zeros(num_experts, dtype=torch.int, device=a.device) @@ -96,28 +99,29 @@ def torch_prepare( for j in range(topk): expert_id = topk_ids[token, j] idx = token_counts[expert_id] - b_a[expert_id, idx:idx + 1, :] = a[token, :] + b_a[expert_id, idx : idx + 1, :] = a[token, :] token_counts[expert_id] = 
token_counts[expert_id] + 1 return b_a, tokens_per_expert -def torch_finalize(b_out: torch.Tensor, topk_weight: torch.Tensor, - topk_ids: torch.Tensor) -> torch.Tensor: +def torch_finalize( + b_out: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor +) -> torch.Tensor: num_tokens = topk_ids.shape[0] num_experts = b_out.shape[0] K = b_out.shape[-1] out = torch.zeros((num_tokens, K), dtype=b_out.dtype, device=b_out.device) - expert_counts = torch.zeros(num_experts, - dtype=torch.int, - device=b_out.device) + expert_counts = torch.zeros(num_experts, dtype=torch.int, device=b_out.device) for token in range(num_tokens): expert_ids = topk_ids[token] for i in range(expert_ids.numel()): expert_id = expert_ids[i] idx = expert_counts[expert_id] - out[token, :] = out[token, :] + b_out[expert_id, idx:idx + - 1, :] * topk_weight[token, i] + out[token, :] = ( + out[token, :] + + b_out[expert_id, idx : idx + 1, :] * topk_weight[token, i] + ) expert_counts[expert_id] = expert_counts[expert_id] + 1 return out @@ -136,17 +140,18 @@ def torch_batched_moe( num_tokens, topk = topk_ids.shape _, max_num_tokens, K = b_a.shape assert num_experts == b_a.shape[0] and w2.shape[1] == K - out = torch.zeros((num_experts, max_num_tokens, K), - dtype=b_a.dtype, - device=b_a.device) - tmp = torch.empty((max_num_tokens, w1.shape[1] // 2), - dtype=b_a.dtype, - device=b_a.device) + out = torch.zeros( + (num_experts, max_num_tokens, K), dtype=b_a.dtype, device=b_a.device + ) + tmp = torch.empty( + (max_num_tokens, w1.shape[1] // 2), dtype=b_a.dtype, device=b_a.device + ) for expert in range(num_experts): num = tokens_per_expert[expert] if num > 0: torch.ops._C.silu_and_mul( - tmp[:num], b_a[expert, :num, :] @ w1[expert].transpose(0, 1)) + tmp[:num], b_a[expert, :num, :] @ w1[expert].transpose(0, 1) + ) out[expert, :num, :] = tmp[:num] @ w2[expert].transpose(0, 1) return torch_finalize(out, topk_weight, topk_ids) @@ -175,20 +180,16 @@ def test_fused_moe_batched_experts( with set_current_vllm_config(vllm_config): topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) - baseline_output = torch_experts(a, w1, w2, topk_weight, - topk_ids) # only for baseline + baseline_output = torch_experts( + a, w1, w2, topk_weight, topk_ids + ) # only for baseline torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids) batched_output = naive_batched_moe( - a, w1, w2, topk_weight, topk_ids) # pick torch_experts or this + a, w1, w2, topk_weight, topk_ids + ) # pick torch_experts or this - torch.testing.assert_close(baseline_output, - torch_output, - atol=2e-2, - rtol=0) - torch.testing.assert_close(baseline_output, - batched_output, - atol=2e-2, - rtol=0) + torch.testing.assert_close(baseline_output, torch_output, atol=2e-2, rtol=0) + torch.testing.assert_close(baseline_output, batched_output, atol=2e-2, rtol=0) def create_pplx_prepare_finalize( @@ -206,7 +207,9 @@ def create_pplx_prepare_finalize( group_name: Optional[str], ): from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( - PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes) + PplxPrepareAndFinalize, + pplx_hidden_dim_scale_bytes, + ) max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1) num_local_experts = rank_chunk(num_experts, 0, world_size) @@ -255,28 +258,31 @@ def rank_chunk(num: int, r: int, w: int) -> int: def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor: chunk = rank_chunk(t.shape[0], r, w) - return t[(r * chunk):(r + 1) * chunk] + return t[(r * chunk) : (r + 1) * chunk] -def maybe_chunk_by_rank(t: 
Optional[torch.Tensor], r: int, - w: int) -> Optional[torch.Tensor]: +def maybe_chunk_by_rank( + t: Optional[torch.Tensor], r: int, w: int +) -> Optional[torch.Tensor]: if t is not None: return chunk_by_rank(t, r, w) else: return t -def chunk_scales_by_rank(t: Optional[torch.Tensor], r: int, - w: int) -> Optional[torch.Tensor]: +def chunk_scales_by_rank( + t: Optional[torch.Tensor], r: int, w: int +) -> Optional[torch.Tensor]: if t is not None and t.numel() > 1: chunk = rank_chunk(t.shape[0], r, w) - return t[(r * chunk):(r + 1) * chunk] + return t[(r * chunk) : (r + 1) * chunk] else: return t -def chunk_scales(t: Optional[torch.Tensor], start: int, - end: int) -> Optional[torch.Tensor]: +def chunk_scales( + t: Optional[torch.Tensor], start: int, end: int +) -> Optional[torch.Tensor]: if t is not None and t.numel() > 1: return t[start:end] else: @@ -339,8 +345,7 @@ def pplx_prepare_finalize( device=device, ) - if (quant_dtype is not None and not per_act_token_quant - and block_shape is None): + if quant_dtype is not None and not per_act_token_quant and block_shape is None: a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) else: @@ -364,8 +369,7 @@ def pplx_prepare_finalize( ), ) - b_a = dummy_work( - dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype)) + b_a = dummy_work(dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype)) prepare_finalize.finalize( out, @@ -399,15 +403,17 @@ def _pplx_prepare_finalize( ): try: if use_internode: - uid = nvshmem_get_unique_id( - ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + uid = ( + nvshmem_get_unique_id() + if pgi.rank == 0 + else nvshmem_alloc_empty_unique_id() + ) torch.distributed.broadcast(uid, src=0) nvshmem_init(uid, pgi.rank, pgi.world_size) group_name = None else: group_ranks = list(range(pgi.world_size)) - cpu_group = torch.distributed.new_group(group_ranks, - backend="gloo") + cpu_group = torch.distributed.new_group(group_ranks, backend="gloo") group_name = cpu_group.group_name topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) @@ -415,22 +421,28 @@ def _pplx_prepare_finalize( a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0) - torch_output = (a_rep.view(m, topk, k) * - topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum( - dim=1) - - pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, - topk_ids, num_experts, quant_dtype, - block_shape, per_act_token_quant, - group_name) + torch_output = ( + a_rep.view(m, topk, k) * topk_weight.view(m, topk, 1).to(a_rep.dtype) + ).sum(dim=1) + + pplx_output = pplx_prepare_finalize( + pgi, + dp_size, + a, + topk_weight, + topk_ids, + num_experts, + quant_dtype, + block_shape, + per_act_token_quant, + group_name, + ) - torch_output = chunk_by_rank(torch_output, pgi.rank, - pgi.world_size).to(pgi.device) + torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to( + pgi.device + ) - torch.testing.assert_close(pplx_output, - torch_output, - atol=3e-2, - rtol=3e-2) + torch.testing.assert_close(pplx_output, torch_output, atol=3e-2, rtol=3e-2) finally: if use_internode: nvshmem_finalize() @@ -479,9 +491,19 @@ def test_pplx_prepare_finalize_slow( a = torch.randn((m, k), device=device, dtype=act_dtype) / 10 score = torch.randn((m, e), device=device, dtype=act_dtype) - parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score, - topk, e, quant_dtype, block_shape, per_act_token_quant, - use_internode) + parallel_launch( + world_size, + 
_pplx_prepare_finalize, + dp_size, + a, + score, + topk, + e, + quant_dtype, + block_shape, + per_act_token_quant, + use_internode, + ) def pplx_moe( @@ -504,7 +526,6 @@ def pplx_moe( use_compile: bool = False, use_cudagraphs: bool = True, ) -> torch.Tensor: - num_tokens, hidden_dim = a.shape num_experts = w1.shape[0] topk = topk_ids.shape[1] @@ -557,41 +578,45 @@ def pplx_moe( # large enough to trigger chunking. I'm leaving the flag and # setup code in case we are able to revisit this later. if use_compile: - _fused_experts = torch.compile(fused_experts, - backend='inductor', - fullgraph=True) + _fused_experts = torch.compile( + fused_experts, backend="inductor", fullgraph=True + ) torch._dynamo.mark_dynamic(a_chunk, 0) torch._dynamo.mark_dynamic(chunk_topk_weight, 0) torch._dynamo.mark_dynamic(chunk_topk_ids, 0) else: _fused_experts = fused_experts - out = _fused_experts(a_chunk, - w1_chunk, - w2_chunk, - chunk_topk_weight, - chunk_topk_ids, - w1_scale=w1_scale_chunk, - w2_scale=w2_scale_chunk, - a1_scale=a1_scale_chunk, - a2_scale=a2_scale_chunk, - global_num_experts=num_experts) + out = _fused_experts( + a_chunk, + w1_chunk, + w2_chunk, + chunk_topk_weight, + chunk_topk_ids, + w1_scale=w1_scale_chunk, + w2_scale=w2_scale_chunk, + a1_scale=a1_scale_chunk, + a2_scale=a2_scale_chunk, + global_num_experts=num_experts, + ) if use_cudagraphs: out.fill_(0) stream = torch.cuda.Stream() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=stream): - out = _fused_experts(a_chunk, - w1_chunk, - w2_chunk, - chunk_topk_weight, - chunk_topk_ids, - w1_scale=w1_scale_chunk, - w2_scale=w2_scale_chunk, - a1_scale=a1_scale_chunk, - a2_scale=a2_scale_chunk, - global_num_experts=num_experts) + out = _fused_experts( + a_chunk, + w1_chunk, + w2_chunk, + chunk_topk_weight, + chunk_topk_ids, + w1_scale=w1_scale_chunk, + w2_scale=w2_scale_chunk, + a1_scale=a1_scale_chunk, + a2_scale=a2_scale_chunk, + global_num_experts=num_experts, + ) torch.cuda.synchronize() graph.replay() @@ -621,15 +646,17 @@ def _pplx_moe( ): try: if use_internode: - uid = nvshmem_get_unique_id( - ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + uid = ( + nvshmem_get_unique_id() + if pgi.rank == 0 + else nvshmem_alloc_empty_unique_id() + ) torch.distributed.broadcast(uid, src=0) nvshmem_init(uid, pgi.rank, pgi.world_size) group_name = None else: group_ranks = list(range(pgi.world_size)) - cpu_group = torch.distributed.new_group(group_ranks, - backend="gloo") + cpu_group = torch.distributed.new_group(group_ranks, backend="gloo") group_name = cpu_group.group_name m, k = a.shape @@ -647,8 +674,7 @@ def _pplx_moe( w1_s = w1_s.to(device) if w1_s is not None else None w2_s = w2_s.to(device) if w2_s is not None else None - if (quant_dtype is not None and not per_act_token_quant - and block_shape is None): + if quant_dtype is not None and not per_act_token_quant and block_shape is None: a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32) else: @@ -708,17 +734,14 @@ def _pplx_moe( ) chunked_batch_output = chunk_by_rank( - batched_output, pgi.rank, pgi.world_size).to(pplx_output.device) + batched_output, pgi.rank, pgi.world_size + ).to(pplx_output.device) - torch.testing.assert_close(batched_output, - torch_output, - atol=3e-2, - rtol=3e-2) + torch.testing.assert_close(batched_output, torch_output, atol=3e-2, rtol=3e-2) - torch.testing.assert_close(pplx_output, - chunked_batch_output, - atol=3e-2, - rtol=3e-2) + torch.testing.assert_close( + 
pplx_output, chunked_batch_output, atol=3e-2, rtol=3e-2 + ) finally: if use_internode: nvshmem_finalize() @@ -773,14 +796,32 @@ def test_pplx_moe_slow( per_act_token_quant=per_act_token_quant, ) - parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk, e, - w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape, - use_internode) - + parallel_launch( + world_size, + _pplx_moe, + dp_size, + a, + w1, + w2, + score, + topk, + e, + w1_s, + w2_s, + quant_dtype, + per_act_token_quant, + block_shape, + use_internode, + ) -def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, - make_weights: bool, test_fn: Callable): +def _pplx_test_loop( + pgi: ProcessGroupInfo, + dp_size: int, + use_internode: bool, + make_weights: bool, + test_fn: Callable, +): def format_result(msg, ex=None): if ex is not None: x = str(ex) @@ -795,8 +836,9 @@ def format_result(msg, ex=None): print(f"PASSED {msg}") current_platform.seed_everything(7) - combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, - [False, True], [None, [128, 128]]) + combos = itertools.product( + PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]] + ) exceptions = [] count = 0 for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos: @@ -810,15 +852,14 @@ def format_result(msg, ex=None): use_fp8_w8a8 = False quant_dtype = None - test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, " - f"dtype={dtype}, per_act_token={per_act_token_quant}, " - f"block_shape={block_shape}") + test_desc = ( + f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, " + f"dtype={dtype}, per_act_token={per_act_token_quant}, " + f"block_shape={block_shape}" + ) - if not use_fp8_w8a8 and (per_act_token_quant - or block_shape is not None): - print( - f"{test_desc} - Skip quantization test for non-quantized type." - ) + if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None): + print(f"{test_desc} - Skip quantization test for non-quantized type.") continue if per_act_token_quant and block_shape is not None: @@ -865,10 +906,10 @@ def format_result(msg, ex=None): if len(exceptions) > 0: raise RuntimeError( f"{len(exceptions)} of {count} tests failed in child process, " - f"rank={pgi.rank}.") + f"rank={pgi.rank}." 
+ ) else: - print(f"{count} of {count} tests passed in child process, " - f"rank={pgi.rank}.") + print(f"{count} of {count} tests passed in child process, rank={pgi.rank}.") @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @@ -880,8 +921,14 @@ def test_pplx_prepare_finalize( ): current_platform.seed_everything(7) world_size, dp_size = world_dp_size - parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size, - use_internode, False, _pplx_prepare_finalize) + parallel_launch( + world_size * dp_size, + _pplx_test_loop, + dp_size, + use_internode, + False, + _pplx_prepare_finalize, + ) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @@ -893,5 +940,6 @@ def test_pplx_moe( ): current_platform.seed_everything(7) world_size, dp_size = world_dp_size - parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True, - _pplx_moe) + parallel_launch( + world_size, _pplx_test_loop, dp_size, use_internode, True, _pplx_moe + ) diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py index 1c51c530c193..d4724d749fc9 100644 --- a/tests/kernels/moe/test_rocm_aiter_topk.py +++ b/tests/kernels/moe/test_rocm_aiter_topk.py @@ -24,13 +24,14 @@ pytestmark = pytest.mark.skipif( not (current_platform.is_rocm() and aiter_available), - reason="AITER ops are only available on ROCm with aiter package installed") + reason="AITER ops are only available on ROCm with aiter package installed", +) def test_rocm_aiter_biased_grouped_topk_custom_op_registration(): """Test that the custom op is correctly registered.""" # Check if the op exists in torch.ops.vllm - assert hasattr(torch.ops.vllm, 'rocm_aiter_biased_grouped_topk') + assert hasattr(torch.ops.vllm, "rocm_aiter_biased_grouped_topk") # Check if the op is callable assert callable(torch.ops.vllm.rocm_aiter_biased_grouped_topk) @@ -39,7 +40,7 @@ def test_rocm_aiter_biased_grouped_topk_custom_op_registration(): def test_rocm_aiter_grouped_topk_custom_op_registration(): """Test that the custom op is correctly registered.""" # Check if the op exists in torch.ops.vllm - assert hasattr(torch.ops.vllm, 'rocm_aiter_grouped_topk') + assert hasattr(torch.ops.vllm, "rocm_aiter_grouped_topk") # Check if the op is callable assert callable(torch.ops.vllm.rocm_aiter_grouped_topk) @@ -56,25 +57,29 @@ def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility(): renormalize = True scale_factor = 1.0 - gating_output = torch.randn((token, expert), - dtype=torch.bfloat16, - device="cuda") - e_score_correction_bias = torch.randn((expert, ), - dtype=torch.bfloat16, - device="cuda") + gating_output = torch.randn((token, expert), dtype=torch.bfloat16, device="cuda") + e_score_correction_bias = torch.randn( + (expert,), dtype=torch.bfloat16, device="cuda" + ) device = gating_output.device topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device) - topk_weights = torch.empty((token, topk), - dtype=torch.float32, - device=device) + topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device) # Define a function that uses the op - def biased_grouped_topk_fn(gating_output, e_score_correction_bias, - topk_weights, topk_ids): + def biased_grouped_topk_fn( + gating_output, e_score_correction_bias, topk_weights, topk_ids + ): return torch.ops.vllm.rocm_aiter_biased_grouped_topk( - gating_output, e_score_correction_bias, topk_weights, topk_ids, - num_expert_group, topk_group, renormalize, scale_factor) + gating_output, + e_score_correction_bias, + topk_weights, + topk_ids, + num_expert_group, + 
topk_group, + renormalize, + scale_factor, + ) # Verify the op's fake implementation torch.library.opcheck( @@ -84,51 +89,49 @@ def biased_grouped_topk_fn(gating_output, e_score_correction_bias, "num_expert_group": num_expert_group, "topk_group": topk_group, "need_renorm": renormalize, - "routed_scaling_factor": scale_factor + "routed_scaling_factor": scale_factor, }, - test_utils=("test_faketensor")) + test_utils=("test_faketensor"), + ) # Compile the function with appropriate settings - compiled_fn = torch.compile(biased_grouped_topk_fn, - fullgraph=True, - backend="inductor", - mode="reduce-overhead", - dynamic=False) - - topk_weights_original = torch.empty((token, topk), - dtype=torch.float32, - device=device) - topk_ids_original = torch.empty((token, topk), - dtype=torch.int32, - device=device) - - topk_weights_compiled = torch.empty((token, topk), - dtype=torch.float32, - device=device) - topk_ids_compiled = torch.empty((token, topk), - dtype=torch.int32, - device=device) + compiled_fn = torch.compile( + biased_grouped_topk_fn, + fullgraph=True, + backend="inductor", + mode="reduce-overhead", + dynamic=False, + ) + + topk_weights_original = torch.empty( + (token, topk), dtype=torch.float32, device=device + ) + topk_ids_original = torch.empty((token, topk), dtype=torch.int32, device=device) + + topk_weights_compiled = torch.empty( + (token, topk), dtype=torch.float32, device=device + ) + topk_ids_compiled = torch.empty((token, topk), dtype=torch.int32, device=device) # Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode) - biased_grouped_topk_fn(gating_output, e_score_correction_bias, - topk_weights_original, topk_ids_original) - compiled_fn(gating_output, e_score_correction_bias, topk_weights_compiled, - topk_ids_compiled) + biased_grouped_topk_fn( + gating_output, e_score_correction_bias, topk_weights_original, topk_ids_original + ) + compiled_fn( + gating_output, e_score_correction_bias, topk_weights_compiled, topk_ids_compiled + ) # Sort the results for comparison since the order might not be deterministic topk_ids_original, indices_original = torch.sort(topk_ids_original) - topk_weights_original = torch.gather(topk_weights_original, 1, - indices_original) + topk_weights_original = torch.gather(topk_weights_original, 1, indices_original) topk_ids_compiled, indices_compiled = torch.sort(topk_ids_compiled) - topk_weights_compiled = torch.gather(topk_weights_compiled, 1, - indices_compiled) + topk_weights_compiled = torch.gather(topk_weights_compiled, 1, indices_compiled) # Verify results match - assert torch.allclose(topk_weights_original, - topk_weights_compiled, - rtol=1e-2, - atol=1e-2) + assert torch.allclose( + topk_weights_original, topk_weights_compiled, rtol=1e-2, atol=1e-2 + ) assert torch.allclose(topk_ids_original, topk_ids_compiled) @@ -144,73 +147,73 @@ def test_rocm_aiter_grouped_topk_torch_compile_compatibility(): scoring_func = "softmax" scale_factor = 1.0 - gating_output = torch.randn((token, expert), - dtype=torch.bfloat16, - device="cuda") + gating_output = torch.randn((token, expert), dtype=torch.bfloat16, device="cuda") device = gating_output.device topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device) - topk_weights = torch.empty((token, topk), - dtype=torch.float32, - device=device) + topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device) # Define a function that uses the op def grouped_topk_fn(gating_output, topk_weights, topk_ids, scoring_func): return torch.ops.vllm.rocm_aiter_grouped_topk( - 
gating_output, topk_weights, topk_ids, num_expert_group, - topk_group, renormalize, scoring_func, scale_factor) + gating_output, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + renormalize, + scoring_func, + scale_factor, + ) # Verify the op's fake implementation - torch.library.opcheck(torch.ops.vllm.rocm_aiter_grouped_topk, - (gating_output, topk_weights, topk_ids), - kwargs={ - "num_expert_group": num_expert_group, - "topk_group": topk_group, - "need_renorm": renormalize, - "scoring_func": scoring_func, - "routed_scaling_factor": scale_factor - }, - test_utils=("test_faketensor")) + torch.library.opcheck( + torch.ops.vllm.rocm_aiter_grouped_topk, + (gating_output, topk_weights, topk_ids), + kwargs={ + "num_expert_group": num_expert_group, + "topk_group": topk_group, + "need_renorm": renormalize, + "scoring_func": scoring_func, + "routed_scaling_factor": scale_factor, + }, + test_utils=("test_faketensor"), + ) # Compile the function with appropriate settings - compiled_fn = torch.compile(grouped_topk_fn, - fullgraph=True, - backend="inductor", - mode="reduce-overhead", - dynamic=False) - - topk_weights_original = torch.empty((token, topk), - dtype=torch.float32, - device=device) - topk_ids_original = torch.empty((token, topk), - dtype=torch.int32, - device=device) - - topk_weights_compiled = torch.empty((token, topk), - dtype=torch.float32, - device=device) - topk_ids_compiled = torch.empty((token, topk), - dtype=torch.int32, - device=device) + compiled_fn = torch.compile( + grouped_topk_fn, + fullgraph=True, + backend="inductor", + mode="reduce-overhead", + dynamic=False, + ) + + topk_weights_original = torch.empty( + (token, topk), dtype=torch.float32, device=device + ) + topk_ids_original = torch.empty((token, topk), dtype=torch.int32, device=device) + + topk_weights_compiled = torch.empty( + (token, topk), dtype=torch.float32, device=device + ) + topk_ids_compiled = torch.empty((token, topk), dtype=torch.int32, device=device) # Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode) - grouped_topk_fn(gating_output, topk_weights_original, topk_ids_original, - scoring_func) - compiled_fn(gating_output, topk_weights_compiled, topk_ids_compiled, - scoring_func) + grouped_topk_fn( + gating_output, topk_weights_original, topk_ids_original, scoring_func + ) + compiled_fn(gating_output, topk_weights_compiled, topk_ids_compiled, scoring_func) # Sort the results for comparison since the order might not be deterministic topk_ids_original, indices_original = torch.sort(topk_ids_original) - topk_weights_original = torch.gather(topk_weights_original, 1, - indices_original) + topk_weights_original = torch.gather(topk_weights_original, 1, indices_original) topk_ids_compiled, indices_compiled = torch.sort(topk_ids_compiled) - topk_weights_compiled = torch.gather(topk_weights_compiled, 1, - indices_compiled) + topk_weights_compiled = torch.gather(topk_weights_compiled, 1, indices_compiled) # Verify results match - assert torch.allclose(topk_weights_original, - topk_weights_compiled, - rtol=1e-2, - atol=1e-2) + assert torch.allclose( + topk_weights_original, topk_weights_compiled, rtol=1e-2, atol=1e-2 + ) assert torch.allclose(topk_ids_original, topk_ids_compiled) diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index 673a0aa36794..59ceb1e7e374 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -5,7 +5,8 @@ import 
torch from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - silu_mul_fp8_quant_deep_gemm) + silu_mul_fp8_quant_deep_gemm, +) from vllm.platforms import current_platform # (E, T, H, group_size, seed) @@ -28,16 +29,15 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed): tokens_per_expert = torch.randint( low=0, high=T, - size=(E, ), + size=(E,), dtype=torch.int32, device="cuda", ) # Run the Triton kernel - y_q, y_s = silu_mul_fp8_quant_deep_gemm(y, - tokens_per_expert, - group_size=group_size, - eps=1e-10) + y_q, y_s = silu_mul_fp8_quant_deep_gemm( + y, tokens_per_expert, group_size=group_size, eps=1e-10 + ) # Reference implementation fp8_info = torch.finfo(torch.float8_e4m3fn) @@ -54,9 +54,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed): # Compute reference scales and quantized output, skipping padded tokens for e in range(E): nt = tokens_per_expert[e].item() - ref_s = torch.empty((T, H // group_size), - dtype=torch.float32, - device="cuda") + ref_s = torch.empty((T, H // group_size), dtype=torch.float32, device="cuda") ref_q = torch.empty((T, H), dtype=torch.float8_e4m3fn, device="cuda") for t in range(nt): data = merged[e, t] diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index dfd0f35c8da3..3bc1ac7b36c7 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -14,8 +14,7 @@ from vllm.platforms import current_platform if current_platform.get_device_capability() < (9, 0): - pytest.skip("FP8 Triton requires CUDA 9.0 or higher", - allow_module_level=True) + pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) vllm_config = VllmConfig() vllm_config.scheduler_config.max_num_seqs = 128 @@ -29,14 +28,13 @@ def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): B = B.to(torch.float32) assert A.shape[-1] == B.shape[-1], "Dimension mismatch" - assert B.ndim == 2 and B.is_contiguous( - ), "B must be a 2D contiguous tensor" + assert B.ndim == 2 and B.is_contiguous(), "B must be a 2D contiguous tensor" # Reshape input M = A.numel() // A.shape[-1] B = B.t() # Transpose weight matrix N, K = B.shape - origin_C_shape = A.shape[:-1] + (K, ) + origin_C_shape = A.shape[:-1] + (K,) A = A.reshape(M, N) # As is per-token [M, 1], Bs is per-column [1, K] @@ -86,17 +84,17 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk): act_out = SiluAndMul().forward_native(inter_out) # Quantize activation output with per-token act_out_q, act_out_s = ops.scaled_fp8_quant( - act_out, use_per_token_if_dynamic=True) + act_out, use_per_token_if_dynamic=True + ) # Second MLP layer - out[mask] = native_w8a8_per_token_matmul(act_out_q, - w2[i], - act_out_s, - w2_s[i], - output_dtype=a.dtype) + out[mask] = native_w8a8_per_token_matmul( + act_out_q, w2[i], act_out_s, w2_s[i], output_dtype=a.dtype + ) # Apply routing weights and sum - return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) @pytest.fixture(autouse=True, scope="module") @@ -114,8 +112,10 @@ def setup_cuda(): SEEDS = [0] -@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed", - itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS)) +@pytest.mark.parametrize( + "M, N, K, E, topk, dtype, seed", + itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS), +) @torch.inference_mode() def 
test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): torch.manual_seed(seed) @@ -131,12 +131,10 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): # Generate int8 weights w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 - w1 = (w1_fp32 * fp8_max).clamp(min=fp8_min, - max=fp8_max).to(torch.float8_e4m3fn) + w1 = (w1_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 - w2 = (w2_fp32 * fp8_max).clamp(min=fp8_min, - max=fp8_max).to(torch.float8_e4m3fn) + w2 = (w2_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) # Generate scale for each column (per-column quantization) w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * factor_for_scale @@ -160,7 +158,7 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): ) # Check results - rel_diff = (torch.mean( - torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / - torch.mean(torch.abs(ref_out.to(torch.float32)))) + rel_diff = torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)) + ) / torch.mean(torch.abs(ref_out.to(torch.float32))) assert rel_diff < 0.05 diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index df89ad7e6da6..a063876c7546 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -5,15 +5,15 @@ import torch import vllm._custom_ops as ops -from tests.kernels.quant_utils import (per_block_cast_to_fp8, - per_block_cast_to_int8) +from tests.kernels.quant_utils import per_block_cast_to_fp8, per_block_cast_to_int8 from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts) -from vllm.model_executor.layers.fused_moe.modular_kernel import ( - FusedMoEModularKernel) -from vllm.model_executor.layers.fused_moe.utils import ( - moe_kernel_quantize_input) + BatchedPrepareAndFinalize, + BatchedTritonExperts, + NaiveBatchedExperts, +) +from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel +from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.utils import round_up @@ -31,18 +31,20 @@ def triton_moe( per_act_token_quant=False, block_shape: Optional[list[int]] = None, ) -> torch.Tensor: - return fused_experts(a, - w1, - w2, - topk_weight, - topk_ids, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - per_channel_quant=per_act_token_quant, - use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn, - block_shape=block_shape) + return fused_experts( + a, + w1, + w2, + topk_weight, + topk_ids, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_channel_quant=per_act_token_quant, + use_fp8_w8a8=quant_dtype == torch.float8_e4m3fn, + block_shape=block_shape, + ) def batched_moe( @@ -62,10 +64,9 @@ def batched_moe( max_num_tokens = round_up(a.shape[0], 64) fused_experts = FusedMoEModularKernel( - BatchedPrepareAndFinalize(max_num_tokens, - num_dispatchers=1, - num_local_experts=w1.shape[0], - rank=0), + BatchedPrepareAndFinalize( + max_num_tokens, num_dispatchers=1, num_local_experts=w1.shape[0], rank=0 + ), BatchedTritonExperts( max_num_tokens=max_num_tokens, num_dispatchers=1, @@ -75,15 +76,17 @@ def batched_moe( ), ) - return fused_experts(a, - w1, - w2, - topk_weight, - topk_ids, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale) + 
return fused_experts( + a, + w1, + w2, + topk_weight, + topk_ids, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) def naive_batched_moe( @@ -103,10 +106,9 @@ def naive_batched_moe( max_num_tokens = round_up(a.shape[0], 64) fused_experts = FusedMoEModularKernel( - BatchedPrepareAndFinalize(max_num_tokens, - num_dispatchers=1, - num_local_experts=w1.shape[0], - rank=0), + BatchedPrepareAndFinalize( + max_num_tokens, num_dispatchers=1, num_local_experts=w1.shape[0], rank=0 + ), NaiveBatchedExperts( max_num_tokens=max_num_tokens, num_dispatchers=1, @@ -116,19 +118,22 @@ def naive_batched_moe( ), ) - return fused_experts(a, - w1, - w2, - topk_weight, - topk_ids, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale) + return fused_experts( + a, + w1, + w2, + topk_weight, + topk_ids, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) -def chunk_scales(scales: Optional[torch.Tensor], start: int, - end: int) -> Optional[torch.Tensor]: +def chunk_scales( + scales: Optional[torch.Tensor], start: int, end: int +) -> Optional[torch.Tensor]: if scales is not None: if scales.numel() == 1: return scales @@ -151,13 +156,15 @@ def make_quantized_test_activations( a_scale = None if quant_dtype is not None: - assert (quant_dtype == torch.float8_e4m3fn - or quant_dtype == torch.int8), "only fp8/int8 supported" + assert quant_dtype == torch.float8_e4m3fn or quant_dtype == torch.int8, ( + "only fp8/int8 supported" + ) a_q = torch.zeros_like(a, dtype=quant_dtype) a_scale_l = [None] * E for e in range(E): a_q[e], a_scale_l[e] = moe_kernel_quantize_input( - a[e], None, quant_dtype, per_act_token_quant, block_shape) + a[e], None, quant_dtype, per_act_token_quant, block_shape + ) a_scale = torch.stack(a_scale_l) if not per_act_token_quant and block_shape is None: @@ -173,8 +180,9 @@ def moe_quantize_weights( per_token_quant: bool, block_shape: Optional[list[int]], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - assert (quant_dtype == torch.float8_e4m3fn - or quant_dtype == torch.int8), "only fp8/int8 supported" + assert quant_dtype == torch.float8_e4m3fn or quant_dtype == torch.int8, ( + "only fp8/int8 supported" + ) if block_shape is not None: assert not per_token_quant @@ -185,10 +193,12 @@ def moe_quantize_weights( else: if quant_dtype == torch.int8: w, w_s = ops.scaled_int8_quant( - w, w_s, use_per_token_if_dynamic=per_token_quant) + w, w_s, use_per_token_if_dynamic=per_token_quant + ) else: w, w_s = ops.scaled_fp8_quant( - w, w_s, use_per_token_if_dynamic=per_token_quant) + w, w_s, use_per_token_if_dynamic=per_token_quant + ) return w, w_s @@ -209,7 +219,8 @@ def make_test_weight( w_s_l = [None] * e for idx in range(e): w_l[idx], w_s_l[idx] = moe_quantize_weights( - w_16[idx], None, quant_dtype, per_act_token_quant, block_shape) + w_16[idx], None, quant_dtype, per_act_token_quant, block_shape + ) w = torch.stack(w_l) w_s = torch.stack(w_s_l) @@ -237,11 +248,19 @@ def make_test_weights( quant_dtype: Optional[torch.dtype] = None, block_shape: Optional[list[int]] = None, per_act_token_quant: bool = False, -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], torch.Tensor, - torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[ + torch.Tensor, + torch.Tensor, + Optional[torch.Tensor], + torch.Tensor, + torch.Tensor, + Optional[torch.Tensor], +]: return ( - *make_test_weight(e, 2 * n, k, in_dtype, quant_dtype, block_shape, - per_act_token_quant), - *make_test_weight(e, k, n, in_dtype, quant_dtype, 
block_shape, - per_act_token_quant), + *make_test_weight( + e, 2 * n, k, in_dtype, quant_dtype, block_shape, per_act_token_quant + ), + *make_test_weight( + e, k, n, in_dtype, quant_dtype, block_shape, per_act_token_quant + ), ) diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 6f43d1111c98..abd622ef5264 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -5,8 +5,7 @@ import torch -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - group_broadcast) +from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast from vllm.platforms import current_platform from vllm.utils import round_up @@ -17,25 +16,31 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: - return torch.as_tensor(x, dtype=torch.float32, device='cuda') + return torch.as_tensor(x, dtype=torch.float32, device="cuda") -def ref_dynamic_per_token_quant(x: torch.tensor, - quant_dtype: torch.dtype, - scale_ub: Optional[torch.tensor] = None) \ - -> tuple[torch.tensor, torch.tensor]: +def ref_dynamic_per_token_quant( + x: torch.tensor, quant_dtype: torch.dtype, scale_ub: Optional[torch.tensor] = None +) -> tuple[torch.tensor, torch.tensor]: assert quant_dtype in [torch.int8, FP8_DTYPE] if scale_ub is not None: assert quant_dtype == FP8_DTYPE - qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \ - else torch.finfo(quant_dtype) - qtype_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ - and current_platform.is_fp8_fnuz() \ - else qtype_traits.max - qtype_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ - and current_platform.is_fp8_fnuz() \ - else qtype_traits.min + qtype_traits = ( + torch.iinfo(quant_dtype) + if quant_dtype == torch.int8 + else torch.finfo(quant_dtype) + ) + qtype_traits_max = ( + ROCM_FP8FNUZ_MAX + if current_platform.is_rocm() and current_platform.is_fp8_fnuz() + else qtype_traits.max + ) + qtype_traits_min = ( + -ROCM_FP8FNUZ_MAX + if current_platform.is_rocm() and current_platform.is_fp8_fnuz() + else qtype_traits.min + ) qtype_max = as_float32_tensor(qtype_traits_max) s_1 = as_float32_tensor(1.0) s_512 = as_float32_tensor(512.0) @@ -56,15 +61,13 @@ def ref_dynamic_per_token_quant(x: torch.tensor, iscales = as_float32_tensor(s_1 / scales) torch_out = as_float32_tensor(x) * iscales torch_out = torch_out.round() - torch_out = torch_out.clamp(qtype_traits_min, - qtype_traits_max).to(quant_dtype) + torch_out = torch_out.clamp(qtype_traits_min, qtype_traits_max).to(quant_dtype) else: assert quant_dtype == FP8_DTYPE min_scaling_factor = s_1 / (qtype_max * s_512) scales = scales.clamp(min=min_scaling_factor) torch_out = as_float32_tensor(x) / scales - torch_out = torch_out.clamp(qtype_traits_min, - qtype_traits_max).to(quant_dtype) + torch_out = torch_out.clamp(qtype_traits_min, qtype_traits_max).to(quant_dtype) return torch_out, scales @@ -72,16 +75,20 @@ def ref_dynamic_per_token_quant(x: torch.tensor, # The int8 version is very similar. 
Incorporate the int8 version, like in # ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant # kernel -def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ - -> tuple[torch.tensor, torch.tensor]: - +def ref_dynamic_per_tensor_fp8_quant( + x: torch.tensor, +) -> tuple[torch.tensor, torch.tensor]: fp8_traits = torch.finfo(FP8_DTYPE) - fp8_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ - and current_platform.is_fp8_fnuz() \ - else fp8_traits.max - fp8_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ - and current_platform.is_fp8_fnuz() \ - else fp8_traits.min + fp8_traits_max = ( + ROCM_FP8FNUZ_MAX + if current_platform.is_rocm() and current_platform.is_fp8_fnuz() + else fp8_traits.max + ) + fp8_traits_min = ( + -ROCM_FP8FNUZ_MAX + if current_platform.is_rocm() and current_platform.is_fp8_fnuz() + else fp8_traits.min + ) fp8_max = as_float32_tensor(fp8_traits_max) one = as_float32_tensor(1.0) @@ -92,9 +99,12 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ x_max = as_float32_tensor(x.abs().max()) ref_scale = x_max / fp8_max ref_iscale = one / ref_scale - ref_out = (as_float32_tensor(x) * ref_iscale).clamp( - fp8_traits_min, fp8_traits_max).to(FP8_DTYPE) - return ref_out, ref_scale.view((1, )) + ref_out = ( + (as_float32_tensor(x) * ref_iscale) + .clamp(fp8_traits_min, fp8_traits_max) + .to(FP8_DTYPE) + ) + return ref_out, ref_scale.view((1,)) def native_w8a8_block_matmul( @@ -126,7 +136,7 @@ def native_w8a8_block_matmul( M = A.numel() // A.shape[-1] N, K = B.shape - origin_C_shape = A.shape[:-1] + (N, ) + origin_C_shape = A.shape[:-1] + (N,) A = A.reshape(M, A.shape[-1]) As = As.reshape(M, As.shape[-1]) n_tiles = (N + block_n - 1) // block_n @@ -137,19 +147,19 @@ def native_w8a8_block_matmul( C_shape = (M, N) C = torch.zeros(C_shape, dtype=compute_type, device=A.device) - A_tiles = [ - A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) - ] - B_tiles = [[ - B[ - j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), - ] for i in range(k_tiles) - ] for j in range(n_tiles)] - C_tiles = [ - C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) + A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)] + B_tiles = [ + [ + B[ + j * block_n : min((j + 1) * block_n, N), + i * block_k : min((i + 1) * block_k, K), + ] + for i in range(k_tiles) + ] + for j in range(n_tiles) ] - As_tiles = [As[:, i:i + 1] for i in range(k_tiles)] + C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)] + As_tiles = [As[:, i : i + 1] for i in range(k_tiles)] for i in range(k_tiles): for j in range(n_tiles): @@ -163,14 +173,14 @@ def native_w8a8_block_matmul( return C -def native_per_token_group_quant_fp8(x, - group_size, - eps=1e-10, - dtype=torch.float8_e4m3fn): +def native_per_token_group_quant_fp8( + x, group_size, eps=1e-10, dtype=torch.float8_e4m3fn +): """Function to perform per-token-group quantization on an input tensor `x` using native torch.""" - assert x.shape[-1] % group_size == 0, ("the last dimension of `x` must " - "be divisible by `group_size`") + assert x.shape[-1] % group_size == 0, ( + "the last dimension of `x` must be divisible by `group_size`" + ) assert x.is_contiguous(), "`x` is not contiguous" finfo = torch.finfo(dtype) @@ -178,28 +188,25 @@ def native_per_token_group_quant_fp8(x, fp8_max = finfo.max x_ = x.reshape(x.numel() // group_size, group_size) - amax = x_.abs().max(dim=-1, - 
keepdim=True)[0].clamp(min=eps).to(torch.float32) + amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32) x_s = amax / fp8_max x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype) x_q = x_q.reshape(x.shape) - x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, )) + x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,)) return x_q, x_s -def native_per_token_group_quant_int8(x, - group_size, - eps=1e-10, - dtype=torch.int8): +def native_per_token_group_quant_int8(x, group_size, eps=1e-10, dtype=torch.int8): """Function to perform per-token-group quantization on an input tensor `x` using native torch. It converts the tensor values into int8 values and returns the quantized tensor along with the scaling factor used for quantization. """ - assert (x.shape[-1] % group_size == 0 - ), "the last dimension of `x` must be divisible by `group_size`" + assert x.shape[-1] % group_size == 0, ( + "the last dimension of `x` must be divisible by `group_size`" + ) assert x.is_contiguous(), "`x` is not contiguous" iinfo = torch.iinfo(dtype) @@ -208,13 +215,13 @@ def native_per_token_group_quant_int8(x, x_ = x.reshape(x.numel() // group_size, group_size) # Use float32 for scale calculation for stability - amax = x_.abs().max(dim=-1, - keepdim=True)[0].clamp(min=eps).to(torch.float32) + amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32) x_s = amax / int8_max - x_q = (x_.to(torch.float32) / x_s).round().clamp( - min=int8_min, max=int8_max).to(dtype) # Round before clamping + x_q = ( + (x_.to(torch.float32) / x_s).round().clamp(min=int8_min, max=int8_max).to(dtype) + ) # Round before clamping x_q = x_q.reshape(x.shape) - x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, )) + x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,)) return x_q, x_s @@ -229,9 +236,9 @@ def per_block_cast_to_fp8( block_m, block_n = block_shape assert x.dim() == 2 m, n = x.shape - x_padded = torch.zeros((round_up(m, block_m), round_up(n, block_n)), - dtype=x.dtype, - device=x.device) + x_padded = torch.zeros( + (round_up(m, block_m), round_up(n, block_n)), dtype=x.dtype, device=x.device + ) x_padded[:m, :n] = x x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) @@ -248,9 +255,9 @@ def per_block_cast_to_int8( block_m, block_n = block_shape assert x.dim() == 2 m, n = x.shape - x_padded = torch.zeros((round_up(m, block_m), round_up(n, block_n)), - dtype=x.dtype, - device=x.device) + x_padded = torch.zeros( + (round_up(m, block_m), round_up(n, block_n)), dtype=x.dtype, device=x.device + ) x_padded[:m, :n] = x x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) @@ -288,8 +295,9 @@ def batched_dequant( assert t.shape[0] == scale.shape[0] out = torch.empty_like(t, dtype=out_dtype) for e in range(t.shape[0]): - out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant, - out_dtype) + out[e] = dequant( + t[e], scale[e], block_shape, per_act_token_quant, out_dtype + ) return out return t.to(out_dtype) @@ -313,15 +321,17 @@ def native_batched_masked_quant_matmul( num_tokens = num_expert_tokens_cpu[e] if A.dtype.itemsize == 1 and block_shape is not None: assert A_scale is not None and B_scale is not None - tmp = native_w8a8_block_matmul(A[e], B[e], A_scale[e], B_scale[e], - block_shape, C.dtype) + tmp = native_w8a8_block_matmul( + A[e], B[e], A_scale[e], 
B_scale[e], block_shape, C.dtype + ) C[e, :num_tokens, :] = tmp[:num_tokens, :] elif A.dtype.itemsize == 1 and block_shape is None: assert A_scale is not None and B_scale is not None A_dq = dequant(A[e], A_scale[e], block_shape, per_act_token_quant) B_dq = dequant(B[e], B_scale[e], block_shape, per_act_token_quant) - C[e, :num_tokens, :] = ( - A_dq[:num_tokens] @ B_dq.transpose(0, 1)).to(C.dtype) + C[e, :num_tokens, :] = (A_dq[:num_tokens] @ B_dq.transpose(0, 1)).to( + C.dtype + ) else: assert A_scale is None assert B_scale is None diff --git a/tests/kernels/quantization/nvfp4_utils.py b/tests/kernels/quantization/nvfp4_utils.py index 1095975ab2b4..db7feea10a5f 100644 --- a/tests/kernels/quantization/nvfp4_utils.py +++ b/tests/kernels/quantization/nvfp4_utils.py @@ -7,8 +7,9 @@ FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max -kE2M1ToFloat = torch.tensor([0., 0.5, 1., 1.5, 2., 3., 4., 6.], - dtype=torch.float32) +kE2M1ToFloat = torch.tensor( + [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32 +) def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size): @@ -21,12 +22,9 @@ def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size): return out[0:m, 0:k] -def dequantize_nvfp4_to_dtype(tensor_fp4, - tensor_sf, - global_scale, - dtype, - device, - block_size=16): +def dequantize_nvfp4_to_dtype( + tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16 +): """Dequantize the fp4 tensor back to high precision.""" # Two fp4 values are packed into one uint8. assert tensor_fp4.dtype == torch.uint8 diff --git a/tests/kernels/quantization/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py index 3de9cb364468..de0cd0874746 100644 --- a/tests/kernels/quantization/test_allspark_gemm.py +++ b/tests/kernels/quantization/test_allspark_gemm.py @@ -2,28 +2,29 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch - from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck + from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.allspark_utils import ( - ALLSPARK_AMPERE_K_ALIGN, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, - ALLSPARK_AMPERE_N_ALIGN) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - quantize_weights) + ALLSPARK_AMPERE_K_ALIGN, + ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, + ALLSPARK_AMPERE_N_ALIGN, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_weights from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -def is_gptq_allspark_supported(min_capability: int, - max_capability: int) -> bool: +def is_gptq_allspark_supported(min_capability: int, max_capability: int) -> bool: if not current_platform.is_cuda(): return False capability = current_platform.get_device_capability() assert capability is not None - return capability.to_int() >= min_capability \ - and capability.to_int() <= max_capability + return ( + capability.to_int() >= min_capability and capability.to_int() <= max_capability + ) MNK_FACTORS = [ @@ -43,7 +44,8 @@ def is_gptq_allspark_supported(min_capability: int, def compute_max_diff(output, output_ref): return torch.mean(torch.abs(output - output_ref)) / torch.mean( - torch.abs(output_ref)) + torch.abs(output_ref) + ) def rand_data(shape, dtype=torch.float16): @@ -52,7 +54,8 @@ def rand_data(shape, dtype=torch.float16): @pytest.mark.skipif( not is_gptq_allspark_supported(80, 89), - 
reason="AllSpark Ampere kernel is not supported on this GPU type.") + reason="AllSpark Ampere kernel is not supported on this GPU type.", +) @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) @pytest.mark.parametrize("group_size", [-1]) @pytest.mark.parametrize("has_zp", HAS_ZP_OPTS) @@ -67,8 +70,9 @@ def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype): weight = rand_data((k, n), dtype=dtype) # Quantize (and apply act_order if provided) - w_ref, qw, s, zp = quantize_weights(weight, scalar_types.uint8b128, - group_size, has_zp) + w_ref, qw, s, zp = quantize_weights( + weight, scalar_types.uint8b128, group_size, has_zp + ) qw = qw.to(torch.uint8) if has_zp: @@ -79,20 +83,42 @@ def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype): n_32align = (n + 32 - 1) // 32 * 32 - qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( - qw, s, zp, has_zp) - opcheck(torch.ops._C.rearrange_kn_weight_as_n32k16_order, - (qw, s, zp, has_zp, qw_reorder, s_reorder, zp_reorder, k, n, - n_32align)) - - opcheck(torch.ops._C.allspark_w8a16_gemm, - (input, qw_reorder, s_reorder, zp_reorder, n, group_size, sm_count, - sm_version, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, has_zp, True), - test_utils=DEFAULT_OPCHECK_TEST_UTILS) - output = ops.allspark_w8a16_gemm(input, qw_reorder, s_reorder, zp_reorder, - n, group_size, sm_count, sm_version, - ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, - has_zp, True) + qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(qw, s, zp, has_zp) + opcheck( + torch.ops._C.rearrange_kn_weight_as_n32k16_order, + (qw, s, zp, has_zp, qw_reorder, s_reorder, zp_reorder, k, n, n_32align), + ) + + opcheck( + torch.ops._C.allspark_w8a16_gemm, + ( + input, + qw_reorder, + s_reorder, + zp_reorder, + n, + group_size, + sm_count, + sm_version, + ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, + has_zp, + True, + ), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) + output = ops.allspark_w8a16_gemm( + input, + qw_reorder, + s_reorder, + zp_reorder, + n, + group_size, + sm_count, + sm_version, + ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, + has_zp, + True, + ) output_ref = torch.matmul(input, w_ref) torch.cuda.synchronize() diff --git a/tests/kernels/quantization/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py index 427db3e60292..20eddf741781 100644 --- a/tests/kernels/quantization/test_aqlm.py +++ b/tests/kernels/quantization/test_aqlm.py @@ -2,39 +2,36 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch - from tests.kernels.utils import opcheck + from vllm import _custom_ops as ops # noqa: F401 def test_aqlm_dequant_opcheck(): - codes = torch.randint(-32768, - 32767, (22016, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((2, 65536, 1, 8), - device='cuda', - dtype=torch.float16) + codes = torch.randint( + -32768, 32767, (22016, 512, 1), device="cuda", dtype=torch.int16 + ) + codebooks = torch.rand((2, 65536, 1, 8), device="cuda", dtype=torch.float16) codebook_partition_sizes = [11008, 11008] - opcheck(torch.ops._C.aqlm_dequant, - (codes, codebooks, codebook_partition_sizes)) + opcheck(torch.ops._C.aqlm_dequant, (codes, codebooks, codebook_partition_sizes)) def test_aqlm_gemm_opcheck(): - input = torch.rand((4, 4096), device='cuda', dtype=torch.float16) - codes = torch.randint(-32768, - 32767, (12288, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((3, 65536, 1, 8), - device='cuda', - dtype=torch.float16) - scales = torch.rand((12288, 1, 1, 1), device='cuda', dtype=torch.float16) + input = 
torch.rand((4, 4096), device="cuda", dtype=torch.float16) + codes = torch.randint( + -32768, 32767, (12288, 512, 1), device="cuda", dtype=torch.int16 + ) + codebooks = torch.rand((3, 65536, 1, 8), device="cuda", dtype=torch.float16) + scales = torch.rand((12288, 1, 1, 1), device="cuda", dtype=torch.float16) codebook_partition_sizes = [4096, 4096, 4096] bias = None - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, scales, codebook_partition_sizes, None)) - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, scales, codebook_partition_sizes, bias)) + opcheck( + torch.ops._C.aqlm_gemm, + (input, codes, codebooks, scales, codebook_partition_sizes, None), + ) + opcheck( + torch.ops._C.aqlm_gemm, + (input, codes, codebooks, scales, codebook_partition_sizes, bias), + ) diff --git a/tests/kernels/quantization/test_awq.py b/tests/kernels/quantization/test_awq.py index bc0868123d82..94be8805a0f3 100644 --- a/tests/kernels/quantization/test_awq.py +++ b/tests/kernels/quantization/test_awq.py @@ -3,45 +3,47 @@ import pytest import torch - from tests.kernels.utils import opcheck + from vllm import _custom_ops as ops # noqa: F401 -@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"), - reason="AWQ is not supported on this GPU type.") +@pytest.mark.skipif( + not hasattr(torch.ops._C, "awq_dequantize"), + reason="AWQ is not supported on this GPU type.", +) def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_TRITON_AWQ", "0") - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) - zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) + qweight = torch.randint( + -2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32 + ) + scales = torch.rand((64, 2048), device="cuda", dtype=torch.float16) + zeros = torch.empty((64, 256), device="cuda", dtype=torch.int32) split_k_iters = 0 thx = 0 thy = 0 - opcheck(torch.ops._C.awq_dequantize, - (qweight, scales, zeros, split_k_iters, thx, thy)) + opcheck( + torch.ops._C.awq_dequantize, + (qweight, scales, zeros, split_k_iters, thx, thy), + ) @pytest.mark.skip(reason="Not working; needs investigation.") -@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"), - reason="AWQ is not supported on this GPU type.") +@pytest.mark.skipif( + not hasattr(torch.ops._C, "awq_gemm"), + reason="AWQ is not supported on this GPU type.", +) def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_TRITON_AWQ", "0") - input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.randint(-2000000000, - 2000000000, (64, 256), - device='cuda', - dtype=torch.int32) - qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) + input = torch.rand((2, 8192), device="cuda", dtype=torch.float16) + qweight = torch.randint( + -2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32 + ) + scales = torch.randint( + -2000000000, 2000000000, (64, 256), device="cuda", dtype=torch.int32 + ) + qzeros = torch.empty((64, 2048), device="cuda", dtype=torch.float16) split_k_iters = 8 - opcheck(torch.ops._C.awq_gemm, - (input, qweight, qzeros, scales, split_k_iters)) + opcheck(torch.ops._C.awq_gemm, (input, qweight, qzeros, scales, split_k_iters)) diff --git 
a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py index 96797e85bd12..b74c7b84120f 100644 --- a/tests/kernels/quantization/test_awq_triton.py +++ b/tests/kernels/quantization/test_awq_triton.py @@ -4,11 +4,15 @@ Run `pytest tests/kernels/test_awq_triton.py`. """ + import pytest import torch from vllm.model_executor.layers.quantization.awq_triton import ( - AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton) + AWQ_TRITON_SUPPORTED_GROUP_SIZES, + awq_dequantize_triton, + awq_gemm_triton, +) from vllm.platforms import current_platform device = "cuda" @@ -33,23 +37,24 @@ def reverse_awq_order(t: torch.Tensor): # qweights - [R , C // 8], int32 # scales - [R // G, C ], float16 # zeros - [R // G, C // 8], int32 -def awq_dequantize_torch(qweight: torch.Tensor, scales: torch.Tensor, - qzeros: torch.Tensor, - group_size: int) -> torch.Tensor: - +def awq_dequantize_torch( + qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor, group_size: int +) -> torch.Tensor: if group_size == -1: group_size = qweight.shape[0] bits = 4 shifts = torch.arange(0, 32, bits, device=qzeros.device) - iweights = torch.bitwise_right_shift(qweight[:, :, None], - shifts[None, None, :]).to(torch.int8) + iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to( + torch.int8 + ) iweights = iweights.view(iweights.shape[0], -1) - zeros = torch.bitwise_right_shift(qzeros[:, :, None], - shifts[None, None, :]).to(torch.int8) + zeros = torch.bitwise_right_shift(qzeros[:, :, None], shifts[None, None, :]).to( + torch.int8 + ) zeros = zeros.view(qzeros.shape[0], -1) zeros = reverse_awq_order(zeros) @@ -70,7 +75,6 @@ def awq_dequantize_torch(qweight: torch.Tensor, scales: torch.Tensor, @pytest.mark.parametrize("qweight_cols", [448, 576, 4736, 16, 32, 64, 128]) @pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES) def test_dequantize(qweight_rows, qweight_cols, group_size): - if group_size == -1: group_size = qweight_rows @@ -84,25 +88,27 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): current_platform.seed_everything(0) - qweight = torch.randint(0, - torch.iinfo(torch.int32).max, - (qweight_rows, qweight_cols), - dtype=qweight_dtype, - device=device) - scales = torch.rand(scales_rows, - scales_cols, - dtype=scales_dtype, - device=device) - zeros = torch.randint(0, - torch.iinfo(torch.int32).max, - (zeros_rows, zeros_cols), - dtype=zeros_dtype, - device=device) + qweight = torch.randint( + 0, + torch.iinfo(torch.int32).max, + (qweight_rows, qweight_cols), + dtype=qweight_dtype, + device=device, + ) + scales = torch.rand(scales_rows, scales_cols, dtype=scales_dtype, device=device) + zeros = torch.randint( + 0, + torch.iinfo(torch.int32).max, + (zeros_rows, zeros_cols), + dtype=zeros_dtype, + device=device, + ) iweights_triton = awq_dequantize_triton(qweight, scales, zeros) - assert (not torch.any(torch.isinf(iweights_triton)) - and not torch.any(torch.isnan(iweights_triton))) + assert not torch.any(torch.isinf(iweights_triton)) and not torch.any( + torch.isnan(iweights_triton) + ) iweights_torch = awq_dequantize_torch(qweight, scales, zeros, group_size) @@ -119,7 +125,6 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): @pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("splitK", [1, 8]) def test_gemm(N, K, M, splitK, group_size): - if group_size == -1: group_size = K @@ -138,35 +143,29 @@ def test_gemm(N, K, M, splitK, group_size): 
current_platform.seed_everything(0) - input = torch.rand((input_rows, input_cols), - dtype=input_dtype, - device=device) - qweight = torch.randint(0, - torch.iinfo(torch.int32).max, - (qweight_rows, qweight_cols), - device=device) - qzeros = torch.randint(0, - torch.iinfo(torch.int32).max, - (qzeros_rows, qzeros_cols), - device=device) - scales = torch.rand((scales_rows, scales_cols), - dtype=scales_dtype, - device=device) - - output_triton = awq_gemm_triton(input, qweight, scales, qzeros, - split_k_iters) - - assert (not torch.any(torch.isinf(output_triton)) - and not torch.any(torch.isnan(output_triton))) + input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device) + qweight = torch.randint( + 0, torch.iinfo(torch.int32).max, (qweight_rows, qweight_cols), device=device + ) + qzeros = torch.randint( + 0, torch.iinfo(torch.int32).max, (qzeros_rows, qzeros_cols), device=device + ) + scales = torch.rand((scales_rows, scales_cols), dtype=scales_dtype, device=device) + + output_triton = awq_gemm_triton(input, qweight, scales, qzeros, split_k_iters) + + assert not torch.any(torch.isinf(output_triton)) and not torch.any( + torch.isnan(output_triton) + ) dequantized_weights = awq_dequantize_triton(qweight, scales, qzeros) output_torch = torch.matmul(input, dequantized_weights) - assert (not torch.any(torch.isinf(output_torch)) - and not torch.any(torch.isnan(output_torch))) + assert not torch.any(torch.isinf(output_torch)) and not torch.any( + torch.isnan(output_torch) + ) - torch.testing.assert_close(output_triton.cpu(), - output_torch.cpu(), - atol=1e-1, - rtol=1e-1) + torch.testing.assert_close( + output_triton.cpu(), output_torch.cpu(), atol=1e-1, rtol=1e-1 + ) diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index 26aa8d652e63..b33ee5bd8a26 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -6,20 +6,23 @@ import pytest import torch +from tests.kernels.quant_utils import ( + native_per_token_group_quant_fp8, + native_w8a8_block_matmul, +) -from tests.kernels.quant_utils import (native_per_token_group_quant_fp8, - native_w8a8_block_matmul) from vllm.config import VllmConfig from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - get_col_major_tma_aligned_tensor, per_token_group_quant_fp8, - w8a8_block_fp8_matmul) + get_col_major_tma_aligned_tensor, + per_token_group_quant_fp8, + w8a8_block_fp8_matmul, +) from vllm.platforms import current_platform from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8 if current_platform.get_device_capability() < (9, 0): - pytest.skip("FP8 Triton requires CUDA 9.0 or higher", - allow_module_level=True) + pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) vllm_config = VllmConfig() vllm_config.scheduler_config.max_num_seqs = 128 @@ -50,7 +53,8 @@ def setup_cuda(): @pytest.mark.parametrize( "num_tokens,d,dtype,group_size,seed", - itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) + itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS), +) @torch.inference_mode() def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): torch.manual_seed(seed) @@ -59,15 +63,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size) out, scale = per_token_group_quant_fp8(x, group_size) - assert 
torch.allclose(out.to(torch.float32), - ref_out.to(torch.float32), - rtol=0.15) + assert torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.15) assert torch.allclose(scale, ref_scale) @pytest.mark.parametrize( "M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS), +) @torch.inference_mode() def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) @@ -88,21 +91,20 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale - ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, - out_dtype) + ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) - rel_diff = (torch.mean( - torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / - torch.mean(torch.abs(ref_out.to(torch.float32)))) + rel_diff = torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)) + ) / torch.mean(torch.abs(ref_out.to(torch.float32))) assert rel_diff < 0.001 @pytest.mark.parametrize( "M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) -@pytest.mark.skipif(not has_deep_gemm(), - reason="DeepGemm kernels not available.") + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS), +) +@pytest.mark.skipif(not has_deep_gemm(), reason="DeepGemm kernels not available.") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): # only aligned sizes @@ -122,20 +124,20 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): As = As_fp8.to(torch.float32) Bs = Bs_fp8.to(torch.float32) - ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, - out_dtype) + ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) # Transpose earlier so that the testing will not trigger transposing kernels As_fp8 = get_col_major_tma_aligned_tensor(As_fp8) - out = torch.zeros((M, N), device='cuda', dtype=out_dtype) + out = torch.zeros((M, N), device="cuda", dtype=out_dtype) - assert As_fp8.shape == (M, (K + 127) // - 128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}" + assert As_fp8.shape == (M, (K + 127) // 128), ( + f"{As_fp8.shape} != {(M, (K + 127) // 128)}" + ) fp8_gemm_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out) - rel_diff = (torch.mean( - torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / - torch.mean(torch.abs(ref_out.to(torch.float32)))) + rel_diff = torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)) + ) / torch.mean(torch.abs(ref_out.to(torch.float32))) assert rel_diff < 0.001 diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py index fac82cf9c8b5..9ca05563a194 100644 --- a/tests/kernels/quantization/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -6,16 +6,16 @@ import pytest import torch - from tests.kernels.quant_utils import native_w8a8_block_matmul + from vllm.config import VllmConfig from vllm.model_executor.layers.quantization.utils.int8_utils import ( - w8a8_block_int8_matmul) + w8a8_block_int8_matmul, +) from vllm.platforms import current_platform if current_platform.get_device_capability() < (7, 0): - pytest.skip("INT8 Triton 
requires CUDA 7.0 or higher", - allow_module_level=True) + pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) vllm_config = VllmConfig() vllm_config.scheduler_config.max_num_seqs = 128 @@ -36,8 +36,10 @@ def setup_cuda(): torch.set_default_device("cuda") -@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, DTYPES, SEEDS)) +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, DTYPES, SEEDS), +) @torch.inference_mode() def test_w8a8_block_int8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) @@ -58,11 +60,10 @@ def test_w8a8_block_int8_matmul(M, N, K, block_size, out_dtype, seed): As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale - ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, - out_dtype) + ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) out = w8a8_block_int8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) - rel_diff = (torch.mean( - torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / - torch.mean(torch.abs(ref_out.to(torch.float32)))) + rel_diff = torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)) + ) / torch.mean(torch.abs(ref_out.to(torch.float32))) assert rel_diff < 0.001 diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py index 878f66647e19..e88f82d3e5cb 100644 --- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -7,16 +7,15 @@ import pytest import torch - from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8 + from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - sparse_cutlass_supported) + sparse_cutlass_supported, +) from vllm.platforms import current_platform -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] @@ -40,9 +39,7 @@ def prune_to_2_4(tensor): # Create binary mask mask = torch.zeros_like(reshaped) - mask.scatter_(dim=1, - index=indices, - src=torch.ones_like(indices, dtype=mask.dtype)) + mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype)) # Apply mask and reshape back pruned = reshaped * mask @@ -55,32 +52,31 @@ def prune_to_2_4(tensor): # This function checks that applying an identity matrix multiplication # to the compressed weights yields the original uncompressed weights. -def check_compress_decompress_invariance(dtype: torch.dtype, b: torch.Tensor, - b_compressed: torch.Tensor, - b_metadata: torch.Tensor): - +def check_compress_decompress_invariance( + dtype: torch.dtype, + b: torch.Tensor, + b_compressed: torch.Tensor, + b_metadata: torch.Tensor, +): # For float16 and bfloat16, cutlass_scaled_sparse_mm's output must be the # same dtype as its inputs. This line addresses that constraint while # arbitrarily using bfloat16 for the int8/fp8 cases. 
out_dtype = torch.float16 if dtype is torch.float16 else torch.bfloat16 - eye = torch.eye(b.shape[0], device='cuda', dtype=dtype) - eye_scale = torch.ones(1, device='cuda', dtype=torch.float32) - b_decomp = ops.cutlass_scaled_sparse_mm(eye, - b_compressed, - b_metadata, - eye_scale, - eye_scale, - out_dtype=out_dtype) + eye = torch.eye(b.shape[0], device="cuda", dtype=dtype) + eye_scale = torch.ones(1, device="cuda", dtype=torch.float32) + b_decomp = ops.cutlass_scaled_sparse_mm( + eye, b_compressed, b_metadata, eye_scale, eye_scale, out_dtype=out_dtype + ) torch.testing.assert_close(b.to(dtype=out_dtype), b_decomp) def make_rand_sparse_tensors( - dtype: torch.dtype, m: int, n: int, k: int + dtype: torch.dtype, m: int, n: int, k: int ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device='cuda') - b = torch.randn((n, k), device='cuda').t() + a = torch.randn((m, k), device="cuda") + b = torch.randn((n, k), device="cuda").t() if dtype == torch.int8: # ensure A and B aren't all zeros after rounding @@ -107,32 +103,25 @@ def make_rand_sparse_tensors( return b_compressed, e, a, b -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.", +) # Test working with a subset of A and B for sparse matmul def test_cutlass_sparse_subset(): - big_m = 1024 m, n, k = 512, 512, 512 # Create tensors - b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, - big_m, n, k) + b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, big_m, n, k) a = whole_a[0:m, 0:k] scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - out = ops.cutlass_scaled_sparse_mm(a, - b_comp, - e, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - baseline = baseline_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=torch.bfloat16) + out = ops.cutlass_scaled_sparse_mm( + a, b_comp, e, scale_a, scale_b, out_dtype=torch.bfloat16 + ) + baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16) torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) @@ -161,105 +150,87 @@ def test_cutlass_sparse_subset(): # Test working with a subset of A and B for sparse matmul -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.", +) @pytest.mark.parametrize("m, n, k", MNK_FACTORS) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("use_bias", [True, False]) -def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: type[torch.dtype], - use_bias: bool): - +def test_cutlass_sparse_gemm( + m: int, k: int, n: int, dtype: type[torch.dtype], use_bias: bool +): # Create tensors b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) scale_a = torch.ones((1, 1), device="cuda", dtype=torch.float32) scale_b = torch.ones((1, 1), device="cuda", dtype=torch.float32) - bias = torch.rand((n, ), device="cuda", dtype=dtype) if use_bias else None + bias = torch.rand((n,), device="cuda", dtype=dtype) if use_bias else None - out = ops.cutlass_scaled_sparse_mm(a, - b_comp, - e, - scale_a, - scale_b, - out_dtype=dtype, - bias=bias) + out = ops.cutlass_scaled_sparse_mm( + a, b_comp, e, scale_a, 
scale_b, out_dtype=dtype, bias=bias + ) - baseline = baseline_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=dtype, - bias=bias) + baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype=dtype, bias=bias) torch.testing.assert_close(out, baseline, rtol=1e-2, atol=3e-1) -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.", +) @pytest.mark.parametrize("m, k, n", MNK_FACTORS) -@pytest.mark.skipif(not current_platform.has_device_capability(89), - reason="FP8 is not supported on this GPU type.") +@pytest.mark.skipif( + not current_platform.has_device_capability(89), + reason="FP8 is not supported on this GPU type.", +) @pytest.mark.parametrize("use_bias", [True, False]) def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int, use_bias: bool): - # Create tensors b_comp, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) - scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) - scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) + scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) out_dtype = torch.bfloat16 - bias = torch.rand( - (n, ), device="cuda", dtype=out_dtype) * 10 if use_bias else None + bias = torch.rand((n,), device="cuda", dtype=out_dtype) * 10 if use_bias else None - out = ops.cutlass_scaled_sparse_mm(a, - b_comp, - e, - scale_a, - scale_b, - out_dtype=out_dtype, - bias=bias) + out = ops.cutlass_scaled_sparse_mm( + a, b_comp, e, scale_a, scale_b, out_dtype=out_dtype, bias=bias + ) - baseline = baseline_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=out_dtype, - bias=bias) + baseline = baseline_scaled_mm( + a, b, scale_a, scale_b, out_dtype=out_dtype, bias=bias + ) torch.testing.assert_close(out, baseline, rtol=1e-2, atol=3e-1) -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.", +) @pytest.mark.parametrize("m,k,n", MNK_FACTORS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) -def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool, - per_out_ch: bool, use_bias: bool): - +def test_cutlass_sparse_int8_gemm( + m: int, n: int, k: int, per_act_token: bool, per_out_ch: bool, use_bias: bool +): # Create tensors b_comp, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) - scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) - scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) + scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) out_dtype = torch.bfloat16 - bias = torch.rand( - (n, ), device="cuda", dtype=out_dtype) * 10 if use_bias else None - - out = ops.cutlass_scaled_sparse_mm(a, - b_comp, - e, - scale_a, - scale_b, - out_dtype=out_dtype, - bias=bias) - - baseline = baseline_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=out_dtype, - bias=bias) + bias = torch.rand((n,), device="cuda", dtype=out_dtype) * 10 if use_bias else None + + out = ops.cutlass_scaled_sparse_mm( + a, b_comp, e, scale_a, scale_b, out_dtype=out_dtype, bias=bias + ) + + 
baseline = baseline_scaled_mm( + a, b, scale_a, scale_b, out_dtype=out_dtype, bias=bias + ) torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0) diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index c4d349f1a5a0..dd1b02083c8d 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -4,12 +4,13 @@ Run `pytest tests/kernels/test_cutlass.py`. """ + import random import pytest import torch - from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8 + from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import cdiv @@ -36,9 +37,7 @@ (512, 24576, 128), ] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] # -1 means full extent in that dimension TENSORWISE_GROUP_SHAPE = (-1, -1) @@ -60,18 +59,19 @@ def group_scale_helper(shape, group_shape): def scale_shape(shape, group_shape): assert len(shape) == len(group_shape) group_shape = group_scale_helper(shape, group_shape) - return tuple( - cdiv(shape[i], group_shape[i]) for i in range(len(group_shape))) - - -def cutlass_fp8_gemm_helper(m: int, - n: int, - k: int, - a_scale_group_shape: tuple, - b_scale_group_shape: tuple, - use_bias: bool, - out_dtype: type[torch.dtype] = torch.bfloat16, - device: str = "cuda"): + return tuple(cdiv(shape[i], group_shape[i]) for i in range(len(group_shape))) + + +def cutlass_fp8_gemm_helper( + m: int, + n: int, + k: int, + a_scale_group_shape: tuple, + b_scale_group_shape: tuple, + use_bias: bool, + out_dtype: type[torch.dtype] = torch.bfloat16, + device: str = "cuda", +): # Test for a cutlass kernel with per-token activation quantization # and per-output channel weight quantization. 
a = to_fp8(torch.randn((m, k), device=device)) @@ -80,8 +80,8 @@ def cutlass_fp8_gemm_helper(m: int, a_scales_shape = scale_shape(a.shape, a_scale_group_shape) b_scales_shape = scale_shape(b.shape, b_scale_group_shape) - scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32)) - scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32)) + scale_a = torch.randn(a_scales_shape, device=device, dtype=torch.float32) + scale_b = torch.randn(b_scales_shape, device=device, dtype=torch.float32) # make scales M-major for blockwise quant, doesn't affect 1D scales scale_a = scale_a.t().contiguous().t() @@ -89,7 +89,7 @@ def cutlass_fp8_gemm_helper(m: int, scale_b = scale_b.t().contiguous().t() if use_bias: - bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10 + bias = torch.rand((n,), device=device, dtype=out_dtype) * 10 else: bias = None @@ -98,18 +98,19 @@ def cutlass_fp8_gemm_helper(m: int, torch.testing.assert_close(out, baseline, rtol=1e-2, atol=1.5e-1) - opcheck(torch.ops._C.cutlass_scaled_mm, - (out, a, b, scale_a, scale_b, bias)) + opcheck(torch.ops._C.cutlass_scaled_mm, (out, a, b, scale_a, scale_b, bias)) -def cutlass_int8_gemm_helper(m: int, - n: int, - k: int, - a_scale_group_shape: tuple, - b_scale_group_shape: tuple, - use_bias: bool, - out_dtype: type[torch.dtype] = torch.bfloat16, - device: str = "cuda"): +def cutlass_int8_gemm_helper( + m: int, + n: int, + k: int, + a_scale_group_shape: tuple, + b_scale_group_shape: tuple, + use_bias: bool, + out_dtype: type[torch.dtype] = torch.bfloat16, + device: str = "cuda", +): # Test for a cutlass kernel with per-token activation quantization # and per-output channel weight quantization. a = to_int8(torch.randn((m, k), device=device) * 5) @@ -118,11 +119,11 @@ def cutlass_int8_gemm_helper(m: int, a_scales_shape = scale_shape(a.shape, a_scale_group_shape) b_scales_shape = scale_shape(b.shape, b_scale_group_shape) - scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32)) - scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32)) + scale_a = torch.randn(a_scales_shape, device=device, dtype=torch.float32) + scale_b = torch.randn(b_scales_shape, device=device, dtype=torch.float32) if use_bias: - bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10 + bias = torch.rand((n,), device=device, dtype=out_dtype) * 10 else: bias = None @@ -131,145 +132,192 @@ def cutlass_int8_gemm_helper(m: int, torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) - opcheck(torch.ops._C.cutlass_scaled_mm, - (out, a, b, scale_a, scale_b, bias)) + opcheck(torch.ops._C.cutlass_scaled_mm, (out, a, b, scale_a, scale_b, bias)) @pytest.mark.parametrize("m,n,k", MNK_FACTORS) -@pytest.mark.parametrize("a_scale_group_shape", - [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) -@pytest.mark.parametrize("b_scale_group_shape", - [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) +@pytest.mark.parametrize( + "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) +@pytest.mark.parametrize( + "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) @pytest.mark.parametrize("use_bias", [True, False]) -@pytest.mark.skipif(not current_platform.has_device_capability(89), - reason="FP8 is not supported on this GPU type.") -def test_cutlass_fp8_gemm(m: int, n: int, k: int, a_scale_group_shape, - b_scale_group_shape, use_bias: bool): - cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape, - use_bias) +@pytest.mark.skipif( + not 
current_platform.has_device_capability(89), + reason="FP8 is not supported on this GPU type.", +) +def test_cutlass_fp8_gemm( + m: int, n: int, k: int, a_scale_group_shape, b_scale_group_shape, use_bias: bool +): + cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape, use_bias) @pytest.mark.parametrize("m,n,k", MNK_FACTORS) -@pytest.mark.parametrize("a_scale_group_shape,b_scale_group_shape", - [((1, 128), (128, 128))]) +@pytest.mark.parametrize( + "a_scale_group_shape,b_scale_group_shape", [((1, 128), (128, 128))] +) @pytest.mark.parametrize("use_bias", [False]) -@pytest.mark.skipif(not current_platform.has_device_capability(90), - reason="FP8 blockwise is not supported on this GPU type.") -def test_cutlass_fp8_blockwise_scale_gemm(m: int, n: int, k: int, - a_scale_group_shape, - b_scale_group_shape, use_bias: bool): +@pytest.mark.skipif( + not current_platform.has_device_capability(90), + reason="FP8 blockwise is not supported on this GPU type.", +) +def test_cutlass_fp8_blockwise_scale_gemm( + m: int, n: int, k: int, a_scale_group_shape, b_scale_group_shape, use_bias: bool +): if k % b_scale_group_shape[0] != 0 or n % b_scale_group_shape[1] != 0: return if m % a_scale_group_shape[0] != 0 or k % a_scale_group_shape[1] != 0: return if m % 4 != 0 and current_platform.has_device_capability(100): return - cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape, - use_bias) + cutlass_fp8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape, use_bias) @pytest.mark.parametrize("m,n,k", MNK_FACTORS) -@pytest.mark.parametrize("a_scale_group_shape", - [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) -@pytest.mark.parametrize("b_scale_group_shape", - [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) +@pytest.mark.parametrize( + "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) +@pytest.mark.parametrize( + "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) @pytest.mark.parametrize("use_bias", [True, False]) -def test_cutlass_int8_gemm(m: int, n: int, k: int, a_scale_group_shape, - b_scale_group_shape, use_bias: bool): - cutlass_int8_gemm_helper(m, n, k, a_scale_group_shape, b_scale_group_shape, - use_bias) - - -@pytest.mark.parametrize("a_scale_group_shape", - [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) -@pytest.mark.parametrize("b_scale_group_shape", - [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) +def test_cutlass_int8_gemm( + m: int, n: int, k: int, a_scale_group_shape, b_scale_group_shape, use_bias: bool +): + cutlass_int8_gemm_helper( + m, n, k, a_scale_group_shape, b_scale_group_shape, use_bias + ) + + +@pytest.mark.parametrize( + "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) +@pytest.mark.parametrize( + "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("use_bias", [True, False]) -def test_cutlass_int8_gemm_output_dtype(a_scale_group_shape, - b_scale_group_shape, - out_dtype: type[torch.dtype], - use_bias: bool): - cutlass_int8_gemm_helper(512, - 512, - 512, - a_scale_group_shape, - b_scale_group_shape, - use_bias, - out_dtype=out_dtype) - - -@pytest.mark.parametrize("a_scale_group_shape", - [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) -@pytest.mark.parametrize("b_scale_group_shape", - [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) +def test_cutlass_int8_gemm_output_dtype( + a_scale_group_shape, + b_scale_group_shape, + out_dtype: 
type[torch.dtype], + use_bias: bool, +): + cutlass_int8_gemm_helper( + 512, + 512, + 512, + a_scale_group_shape, + b_scale_group_shape, + use_bias, + out_dtype=out_dtype, + ) + + +@pytest.mark.parametrize( + "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) +@pytest.mark.parametrize( + "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("use_bias", [True, False]) -@pytest.mark.skipif(not current_platform.has_device_capability(89), - reason="FP8 is not supported on this GPU type.") -def test_cutlass_fp8_gemm_output_dtype(a_scale_group_shape, - b_scale_group_shape, - out_dtype: type[torch.dtype], - use_bias: bool): - cutlass_fp8_gemm_helper(512, - 512, - 512, - a_scale_group_shape, - b_scale_group_shape, - use_bias, - out_dtype=out_dtype) - - -@pytest.mark.parametrize("a_scale_group_shape,b_scale_group_shape", - [((1, 128), (128, 128))]) +@pytest.mark.skipif( + not current_platform.has_device_capability(89), + reason="FP8 is not supported on this GPU type.", +) +def test_cutlass_fp8_gemm_output_dtype( + a_scale_group_shape, + b_scale_group_shape, + out_dtype: type[torch.dtype], + use_bias: bool, +): + cutlass_fp8_gemm_helper( + 512, + 512, + 512, + a_scale_group_shape, + b_scale_group_shape, + use_bias, + out_dtype=out_dtype, + ) + + +@pytest.mark.parametrize( + "a_scale_group_shape,b_scale_group_shape", [((1, 128), (128, 128))] +) @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("use_bias", [False]) -@pytest.mark.skipif(not current_platform.has_device_capability(90), - reason="FP8 blockwise is not supported on this GPU type.") -def test_cutlass_fp8_blockwise_scale_gemm_dtype(a_scale_group_shape, - b_scale_group_shape, - out_dtype: type[torch.dtype], - use_bias: bool): - cutlass_fp8_gemm_helper(512, - 512, - 512, - a_scale_group_shape, - b_scale_group_shape, - use_bias, - out_dtype=out_dtype) - - -@pytest.mark.parametrize("a_scale_group_shape", - [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) -@pytest.mark.parametrize("b_scale_group_shape", - [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) +@pytest.mark.skipif( + not current_platform.has_device_capability(90), + reason="FP8 blockwise is not supported on this GPU type.", +) +def test_cutlass_fp8_blockwise_scale_gemm_dtype( + a_scale_group_shape, + b_scale_group_shape, + out_dtype: type[torch.dtype], + use_bias: bool, +): + cutlass_fp8_gemm_helper( + 512, + 512, + 512, + a_scale_group_shape, + b_scale_group_shape, + use_bias, + out_dtype=out_dtype, + ) + + +@pytest.mark.parametrize( + "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) +@pytest.mark.parametrize( + "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) @pytest.mark.parametrize("use_bias", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif(not current_platform.has_device_capability(89), - reason="FP8 is not supported on this GPU type.") -def test_cutlass_fp8_gemm_devices(a_scale_group_shape, b_scale_group_shape, - use_bias: bool, device: str): - cutlass_fp8_gemm_helper(512, 512, 512, a_scale_group_shape, - b_scale_group_shape, use_bias, torch.bfloat16, - device) - - -@pytest.mark.parametrize("a_scale_group_shape", - [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) -@pytest.mark.parametrize("b_scale_group_shape", - [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) +@pytest.mark.skipif( + not 
current_platform.has_device_capability(89), + reason="FP8 is not supported on this GPU type.", +) +def test_cutlass_fp8_gemm_devices( + a_scale_group_shape, b_scale_group_shape, use_bias: bool, device: str +): + cutlass_fp8_gemm_helper( + 512, + 512, + 512, + a_scale_group_shape, + b_scale_group_shape, + use_bias, + torch.bfloat16, + device, + ) + + +@pytest.mark.parametrize( + "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) +@pytest.mark.parametrize( + "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) @pytest.mark.parametrize("use_bias", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) -def test_cutlass_int8_gemm_devices(a_scale_group_shape, b_scale_group_shape, - use_bias: bool, device: str): - cutlass_int8_gemm_helper(512, - 512, - 512, - a_scale_group_shape, - b_scale_group_shape, - use_bias, - out_dtype=torch.bfloat16, - device=device) +def test_cutlass_int8_gemm_devices( + a_scale_group_shape, b_scale_group_shape, use_bias: bool, device: str +): + cutlass_int8_gemm_helper( + 512, + 512, + 512, + a_scale_group_shape, + b_scale_group_shape, + use_bias, + out_dtype=torch.bfloat16, + device=device, + ) # For the following two tests: @@ -277,32 +325,42 @@ def test_cutlass_int8_gemm_devices(a_scale_group_shape, b_scale_group_shape, # of a large power of two. In any case, the kernel will have a naive fallback # when N and K are not divisible by 16. But M is the number of tokens and the # kernel must handle any M thrown at it. -@pytest.mark.parametrize("a_scale_group_shape", - [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) -@pytest.mark.parametrize("b_scale_group_shape", - [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) +@pytest.mark.parametrize( + "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) +@pytest.mark.parametrize( + "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) @pytest.mark.parametrize("use_bias", [True, False]) -@pytest.mark.skipif(not current_platform.has_device_capability(89), - reason="FP8 is not supported on this GPU type.") -def test_cutlass_fp8_gemm_m_sweep(a_scale_group_shape, b_scale_group_shape, - use_bias: bool): +@pytest.mark.skipif( + not current_platform.has_device_capability(89), + reason="FP8 is not supported on this GPU type.", +) +def test_cutlass_fp8_gemm_m_sweep( + a_scale_group_shape, b_scale_group_shape, use_bias: bool +): for nk in range(32, 128, 32): for m in range(1, 128): - cutlass_fp8_gemm_helper(m, nk, nk, a_scale_group_shape, - b_scale_group_shape, use_bias) + cutlass_fp8_gemm_helper( + m, nk, nk, a_scale_group_shape, b_scale_group_shape, use_bias + ) -@pytest.mark.parametrize("a_scale_group_shape", - [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) -@pytest.mark.parametrize("b_scale_group_shape", - [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE]) +@pytest.mark.parametrize( + "a_scale_group_shape", [PER_TOKEN_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) +@pytest.mark.parametrize( + "b_scale_group_shape", [PER_OUT_CH_GROUP_SHAPE, TENSORWISE_GROUP_SHAPE] +) @pytest.mark.parametrize("use_bias", [True, False]) -def test_cutlass_int8_gemm_m_sweep(a_scale_group_shape, b_scale_group_shape, - use_bias: bool): +def test_cutlass_int8_gemm_m_sweep( + a_scale_group_shape, b_scale_group_shape, use_bias: bool +): for nk in range(32, 128, 32): for m in range(1, 128): - cutlass_int8_gemm_helper(m, nk, nk, a_scale_group_shape, - b_scale_group_shape, use_bias) + cutlass_int8_gemm_helper( + m, nk, nk, a_scale_group_shape, b_scale_group_shape, use_bias + ) 
@pytest.mark.parametrize("m", [32, 64, 128]) @@ -310,8 +368,7 @@ def test_cutlass_int8_gemm_m_sweep(a_scale_group_shape, b_scale_group_shape, @pytest.mark.parametrize("k", [64, 128, 256]) @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16]) @pytest.mark.skip -def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int, - out_dtype: torch.dtype): +def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int, out_dtype: torch.dtype): # Currently, the test is failing because folding azp into # 16-bit bias loses too much precision scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 @@ -328,7 +385,7 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int, b_dq = scale_b * bq_f32 - azp_a = torch.rand((1, ), device="cuda", dtype=torch.float32) * 10 + 1.5 + azp_a = torch.rand((1,), device="cuda", dtype=torch.float32) * 10 + 1.5 azp_aq_i8 = (azp_a / scale_a).to(dtype=torch.int8) azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a # correct for rounding @@ -340,18 +397,17 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int, J = torch.ones((1, k), device="cuda", dtype=torch.float32) azp_bias = (azp_a * scale_b * (J @ bq_f32)).to(out_dtype) assert azp_bias.shape == (1, n) - assert azp_bias[0, :].shape == (n, ) - - baseline_q = (scale_a.to(device='cpu') * scale_b.to(device='cpu') * ( - (aq_i32 + azp_aq_i8).to(device='cpu') @ bq_i32.to(device='cpu'))).to( - dtype=out_dtype, device='cuda') - - out = ops.cutlass_scaled_mm(aq_i8, - bq_i8, - scale_a, - scale_b, - out_dtype=out_dtype, - bias=azp_bias[0, :]) + assert azp_bias[0, :].shape == (n,) + + baseline_q = ( + scale_a.to(device="cpu") + * scale_b.to(device="cpu") + * ((aq_i32 + azp_aq_i8).to(device="cpu") @ bq_i32.to(device="cpu")) + ).to(dtype=out_dtype, device="cuda") + + out = ops.cutlass_scaled_mm( + aq_i8, bq_i8, scale_a, scale_b, out_dtype=out_dtype, bias=azp_bias[0, :] + ) torch.testing.assert_close(out, baseline_dq, rtol=1e-2, atol=1e0) torch.testing.assert_close(out, baseline_q, rtol=1e-2, atol=1e0) @@ -362,8 +418,9 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int, @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("use_bias", [True, False]) @pytest.mark.parametrize("azp_per_token", [True, False]) -def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype, - use_bias: bool, azp_per_token: bool): +def test_cutlass_int8_azp( + m: int, n: int, k: int, out_dtype: torch.dtype, use_bias: bool, azp_per_token: bool +): m_azp = m if azp_per_token else 1 scale_a = torch.randn((m_azp, 1), device="cuda", dtype=torch.float32) / 10 scale_b = torch.randn((1, n), device="cuda", dtype=torch.float32) / 10 @@ -377,16 +434,12 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype, bq_f32 = bq_i8.to(dtype=torch.float32) b_dq = scale_b * bq_f32 - azp_a = torch.rand( - (m_azp, 1), device="cuda", dtype=torch.float32) * 10 + 1.5 + azp_a = torch.rand((m_azp, 1), device="cuda", dtype=torch.float32) * 10 + 1.5 azp_aq_i8 = (azp_a / scale_a).to(dtype=torch.int8) azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a # correct for rounding a_dq = scale_a * (aq_i32 - azp_aq_i8).to(dtype=torch.float32) - torch.testing.assert_close(a_dq, - scale_a * aq_f32 - azp_a, - rtol=1e-4, - atol=1e-3) + torch.testing.assert_close(a_dq, scale_a * aq_f32 - azp_a, rtol=1e-4, atol=1e-3) if use_bias: bias = torch.rand((1, n), device="cuda", dtype=out_dtype) * 10 + 2.5 @@ -396,8 +449,8 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: 
torch.dtype, baseline_dq = (torch.mm(a_dq, b_dq) + bias).to(out_dtype) # int32 mm not supported on CUDA - a_noazp_i32_cpu = (aq_i32 - azp_aq_i8).to(device='cpu') - cq = (a_noazp_i32_cpu @ bq_i32.to(device='cpu')).to(device='cuda') + a_noazp_i32_cpu = (aq_i32 - azp_aq_i8).to(device="cpu") + cq = (a_noazp_i32_cpu @ bq_i32.to(device="cpu")).to(device="cuda") baseline_q = (scale_a * scale_b * cq + bias).to(dtype=out_dtype) # Hadamard is just the sum of the cols @@ -406,14 +459,14 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype, func_bias = bias if use_bias else None if azp_per_token: - out = ops.cutlass_scaled_mm_azp(aq_i8, bq_i8, scale_a, scale_b, - out_dtype, azp_adj_i32, azp_i32, - func_bias) + out = ops.cutlass_scaled_mm_azp( + aq_i8, bq_i8, scale_a, scale_b, out_dtype, azp_adj_i32, azp_i32, func_bias + ) else: azp_with_adj_i32 = azp_i32 * azp_adj_i32 - out = ops.cutlass_scaled_mm_azp(aq_i8, bq_i8, scale_a, scale_b, - out_dtype, azp_with_adj_i32, None, - func_bias) + out = ops.cutlass_scaled_mm_azp( + aq_i8, bq_i8, scale_a, scale_b, out_dtype, azp_with_adj_i32, None, func_bias + ) # bfloat16 precision is 7-bit mantissa -> 2^-8 ~ 0.4% # float16 precision is 10-bit mantissa -> 2^-11 ~ 0.05% @@ -423,13 +476,15 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype, torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol) if azp_per_token: - opcheck(torch.ops._C.cutlass_scaled_mm_azp, - (out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32, - func_bias)) + opcheck( + torch.ops._C.cutlass_scaled_mm_azp, + (out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32, func_bias), + ) else: - opcheck(torch.ops._C.cutlass_scaled_mm_azp, - (out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None, - func_bias)) + opcheck( + torch.ops._C.cutlass_scaled_mm_azp, + (out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None, func_bias), + ) # Test working with a subset of A and B @@ -445,23 +500,14 @@ def test_cutlass_subset(): scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - out = ops.cutlass_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - baseline = baseline_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=torch.bfloat16) + out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16) + baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16) torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) # Test to make sure cuda graphs work class CutlassLayer(torch.nn.Module): - def __init__(self, b, scale_a, scale_b, out_dtype): super().__init__() self.b = b @@ -470,8 +516,9 @@ def __init__(self, b, scale_a, scale_b, out_dtype): self.out_dtype = out_dtype def forward(self, a): - return ops.cutlass_scaled_mm(a, self.b, self.scale_a, self.scale_b, - self.out_dtype) + return ops.cutlass_scaled_mm( + a, self.b, self.scale_a, self.scale_b, self.out_dtype + ) @pytest.mark.parametrize("per_act_token", [True, False]) @@ -485,10 +532,8 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool): m_a_scales = m if per_act_token else 1 n_b_scales = n if per_out_ch else 1 - scale_a = (torch.randn( - (m_a_scales, 1), device="cuda", dtype=torch.float32) / 10) - scale_b = (torch.randn( - (1, n_b_scales), device="cuda", dtype=torch.float32) / 10) + scale_a = torch.randn((m_a_scales, 1), device="cuda", dtype=torch.float32) / 10 + scale_b = torch.randn((1, n_b_scales), device="cuda", 
dtype=torch.float32) / 10 # Construct a trivial model with a single layer that calls a CUTLASS kernel model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16) @@ -502,13 +547,14 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool): out.zero_() g.replay() - baseline = torch.mm(scale_a * a.to(dtype=torch.float32), - scale_b * b.to(dtype=torch.float32)).to(torch.bfloat16) + baseline = torch.mm( + scale_a * a.to(dtype=torch.float32), scale_b * b.to(dtype=torch.float32) + ).to(torch.bfloat16) torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) def test_cutlass_support_opcheck(): - opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability, )) + opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability,)) @pytest.mark.parametrize("num_experts", [8, 64]) @@ -517,11 +563,13 @@ def test_cutlass_support_opcheck(): @pytest.mark.parametrize("use_bias", [False]) @pytest.mark.skipif( (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( - current_platform.get_device_capability()), - reason="Grouped gemm is not supported on this GPU type.") -def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, - per_out_ch: bool, use_bias: bool): - + current_platform.get_device_capability() + ), + reason="Grouped gemm is not supported on this GPU type.", +) +def test_cutlass_fp8_group_gemm( + num_experts: int, per_act_token: bool, per_out_ch: bool, use_bias: bool +): # Device and dtype setup device = "cuda" out_dtype = torch.half @@ -533,13 +581,9 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, b_scales_tensors = [] baseline_tensors = [] - expert_offsets = torch.zeros((num_experts + 1), - device=device, - dtype=torch.int32) + expert_offsets = torch.zeros((num_experts + 1), device=device, dtype=torch.int32) - problem_sizes = torch.zeros((num_experts, 3), - device=device, - dtype=torch.int32) + problem_sizes = torch.zeros((num_experts, 3), device=device, dtype=torch.int32) if not per_act_token: one_scale_a = torch.randn((1, 1), device=device, dtype=torch.float32) @@ -568,77 +612,78 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, b_tensors.append(b_g) # Set up A/B scales - scale_b = torch.randn((1, n_b_scales), - device=device, - dtype=torch.float32) + scale_b = torch.randn((1, n_b_scales), device=device, dtype=torch.float32) b_scales_tensors.append(scale_b) if per_act_token: - scale_a = torch.randn((m_a_scales, 1), - device=device, - dtype=torch.float32) + scale_a = torch.randn((m_a_scales, 1), device=device, dtype=torch.float32) a_scales_tensors.append(scale_a) else: scale_a = one_scale_a # Compute baseline result for this group - baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype, - None) + baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype, None) baseline_tensors.append(baseline_g) - a_tensors_stacked = torch.empty((expert_offsets[num_experts], k_g), - device=device, - dtype=torch.float8_e4m3fn) - b_tensors_stacked = torch.empty((num_experts, n_g, k_g), - device=device, - dtype=torch.float8_e4m3fn) + a_tensors_stacked = torch.empty( + (expert_offsets[num_experts], k_g), device=device, dtype=torch.float8_e4m3fn + ) + b_tensors_stacked = torch.empty( + (num_experts, n_g, k_g), device=device, dtype=torch.float8_e4m3fn + ) for g in range(num_experts): - a_tensors_stacked[expert_offsets[g]:expert_offsets[g + - 1]] = a_tensors[g] + a_tensors_stacked[expert_offsets[g] : expert_offsets[g + 1]] = a_tensors[g] b_tensors_stacked[g] = b_tensors[g].t() 
b_tensors_stacked = b_tensors_stacked.transpose(1, 2) if per_act_token: a_scales_tensors_stacked = torch.empty( - (expert_offsets[num_experts], 1), - device=device, - dtype=torch.float32) + (expert_offsets[num_experts], 1), device=device, dtype=torch.float32 + ) for g in range(num_experts): - a_scales_tensors_stacked[ - expert_offsets[g]:expert_offsets[g + 1]] = a_scales_tensors[g] + a_scales_tensors_stacked[expert_offsets[g] : expert_offsets[g + 1]] = ( + a_scales_tensors[g] + ) else: a_scales_tensors_stacked = one_scale_a - b_scales_tensors_stacked = torch.empty((num_experts, n_b_scales), - device=device, - dtype=torch.float32) + b_scales_tensors_stacked = torch.empty( + (num_experts, n_b_scales), device=device, dtype=torch.float32 + ) for g in range(num_experts): b_scales_tensors_stacked[g] = b_scales_tensors[g] - out_tensors_stacked = torch.zeros((expert_offsets[num_experts], n_g), - device=device, - dtype=out_dtype) - - ab_strides = torch.full((num_experts, ), - a_tensors_stacked.stride(0), - device="cuda", - dtype=torch.int64) - c_strides = torch.full((num_experts, ), - out_tensors_stacked.stride(0), - device="cuda", - dtype=torch.int64) - - ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked, - b_tensors_stacked, a_scales_tensors_stacked, - b_scales_tensors_stacked, expert_offsets[:-1], - problem_sizes, ab_strides, ab_strides, c_strides, - per_act_token, per_out_ch) + out_tensors_stacked = torch.zeros( + (expert_offsets[num_experts], n_g), device=device, dtype=out_dtype + ) + + ab_strides = torch.full( + (num_experts,), a_tensors_stacked.stride(0), device="cuda", dtype=torch.int64 + ) + c_strides = torch.full( + (num_experts,), out_tensors_stacked.stride(0), device="cuda", dtype=torch.int64 + ) + + ops.cutlass_moe_mm( + out_tensors_stacked, + a_tensors_stacked, + b_tensors_stacked, + a_scales_tensors_stacked, + b_scales_tensors_stacked, + expert_offsets[:-1], + problem_sizes, + ab_strides, + ab_strides, + c_strides, + per_act_token, + per_out_ch, + ) # Validate each group's result against the baseline for g in range(num_experts): baseline = baseline_tensors[g] - c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]] + c = out_tensors_stacked[expert_offsets[g] : expert_offsets[g + 1]] print(baseline) print(c) print("*") diff --git a/tests/kernels/quantization/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py index 0a3edd4ddc16..0372fe285cf2 100644 --- a/tests/kernels/quantization/test_fp8_quant.py +++ b/tests/kernels/quantization/test_fp8_quant.py @@ -3,40 +3,56 @@ import pytest import torch +from tests.kernels.quant_utils import ( + FP8_DTYPE, + ref_dynamic_per_tensor_fp8_quant, + ref_dynamic_per_token_quant, +) +from tests.kernels.utils import opcheck import vllm._custom_ops as ops -from tests.kernels.quant_utils import (FP8_DTYPE, - ref_dynamic_per_tensor_fp8_quant, - ref_dynamic_per_token_quant) -from tests.kernels.utils import opcheck from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] -HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192, - 8193] # Arbitrary values for testing +HIDDEN_SIZES = [ + 1, + 2, + 3, + 4, + 16, + 67, + 768, + 2048, + 5120, + 5137, + 8192, + 8193, +] # Arbitrary values for testing HIDDEN_SIZES += list(range(1024, 1033)) # vectorized conversion edge cases NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing SCALE_UBS = [True, False] SEEDS = [0] -def opcheck_fp8_quant(output, - input, - scale=None, - scale_ub=None, - use_per_token_if_dynamic=False): +def 
opcheck_fp8_quant( + output, input, scale=None, scale_ub=None, use_per_token_if_dynamic=False +): if scale is not None: opcheck(torch.ops._C.static_scaled_fp8_quant, (output, input, scale)) elif use_per_token_if_dynamic: - scale = torch.empty((input.shape[0], 1), - device=input.device, - dtype=torch.float32) - opcheck(torch.ops._C.dynamic_per_token_scaled_fp8_quant, - (output, input, scale, scale_ub)) + scale = torch.empty( + (input.shape[0], 1), device=input.device, dtype=torch.float32 + ) + opcheck( + torch.ops._C.dynamic_per_token_scaled_fp8_quant, + (output, input, scale, scale_ub), + ) else: - scale = torch.empty((input.numel() // input.shape[-1], 1), - device=input.device, - dtype=torch.float32) + scale = torch.empty( + (input.numel() // input.shape[-1], 1), + device=input.device, + dtype=torch.float32, + ) opcheck(torch.ops._C.dynamic_scaled_fp8_quant, (output, input, scale)) @@ -46,30 +62,29 @@ def opcheck_fp8_quant(output, @pytest.mark.parametrize("scale_ub", SCALE_UBS) @pytest.mark.parametrize("seed", SEEDS) @torch.inference_mode() -def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, - dtype: torch.dtype, scale_ub: bool, - seed: int) -> None: +def test_dynamic_per_token_fp8_quant( + num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int +) -> None: current_platform.seed_everything(seed) - x = torch.rand(num_tokens, hidden_size, dtype=dtype, - device="cuda") + 1e-6 # avoid nans + x = ( + torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6 + ) # avoid nans - scale_ub = torch.mean(x).to(dtype=torch.float32, device='cuda') \ - if scale_ub else None + scale_ub = ( + torch.mean(x).to(dtype=torch.float32, device="cuda") if scale_ub else None + ) ref_out, ref_scales = ref_dynamic_per_token_quant(x, FP8_DTYPE, scale_ub) - ops_out, ops_scales = ops.scaled_fp8_quant(x, - scale_ub=scale_ub, - use_per_token_if_dynamic=True) + ops_out, ops_scales = ops.scaled_fp8_quant( + x, scale_ub=scale_ub, use_per_token_if_dynamic=True + ) torch.testing.assert_close(ref_scales, ops_scales) - torch.testing.assert_close(ref_out.to(dtype=torch.float32), - ops_out.to(dtype=torch.float32)) + torch.testing.assert_close( + ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32) + ) - opcheck_fp8_quant(ops_out, - x, - None, - scale_ub, - use_per_token_if_dynamic=True) + opcheck_fp8_quant(ops_out, x, None, scale_ub, use_per_token_if_dynamic=True) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -77,8 +92,9 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @torch.inference_mode() -def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, - dtype: torch.dtype, seed: int) -> None: +def test_dynamic_per_tensor_fp8_quant( + num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int +) -> None: current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") @@ -87,8 +103,9 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, ops_out, ops_scale = ops.scaled_fp8_quant(x) torch.testing.assert_close(ref_scale, ops_scale) - torch.testing.assert_close(ref_out.to(dtype=torch.float32), - ops_out.to(dtype=torch.float32)) + torch.testing.assert_close( + ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32) + ) opcheck_fp8_quant(ops_out, x) diff --git a/tests/kernels/quantization/test_ggml.py b/tests/kernels/quantization/test_ggml.py index 
07651fef39bf..eac7f4d4f250 100644 --- a/tests/kernels/quantization/test_ggml.py +++ b/tests/kernels/quantization/test_ggml.py @@ -4,8 +4,8 @@ import gguf import pytest import torch - from tests.kernels.utils import opcheck + from vllm import _custom_ops as ops # noqa: F401 @@ -13,33 +13,42 @@ def test_ggml_opcheck(quant_type): block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type] shape = [256, 1152] - qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8) + qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8) m = qweight.shape[0] n = qweight.shape[1] // type_size * block_size - opcheck(torch.ops._C.ggml_dequantize, - (qweight, quant_type, m, n, torch.float16)) + opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n, torch.float16)) - x = torch.rand((m, 512), device='cuda', dtype=torch.float16) - opcheck(torch.ops._C.ggml_mul_mat_a8, - (qweight, x, quant_type, qweight.shape[0])) - opcheck(torch.ops._C.ggml_mul_mat_vec_a8, - (qweight, x, quant_type, qweight.shape[0])) + x = torch.rand((m, 512), device="cuda", dtype=torch.float16) + opcheck(torch.ops._C.ggml_mul_mat_a8, (qweight, x, quant_type, qweight.shape[0])) + opcheck( + torch.ops._C.ggml_mul_mat_vec_a8, (qweight, x, quant_type, qweight.shape[0]) + ) shape = [256, 1024, 336] - qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8) - x = torch.rand((1, 1024), device='cuda', dtype=torch.float16) - sorted_token_ids = torch.arange(776, device='cuda') - expert_ids = torch.randint(0, 256, (194, ), device='cuda') - num_tokens_post_padded = torch.tensor([1], - dtype=torch.int64, - device='cuda') + qweight = torch.randint(0, 100, shape, device="cuda", dtype=torch.uint8) + x = torch.rand((1, 1024), device="cuda", dtype=torch.float16) + sorted_token_ids = torch.arange(776, device="cuda") + expert_ids = torch.randint(0, 256, (194,), device="cuda") + num_tokens_post_padded = torch.tensor([1], dtype=torch.int64, device="cuda") - opcheck(torch.ops._C.ggml_moe_a8, - (x, qweight, sorted_token_ids, expert_ids, num_tokens_post_padded, - quant_type, qweight.shape[0], 1, x.shape[0])) - - topk_ids = torch.zeros((1, 1), device='cuda', dtype=torch.int32) + opcheck( + torch.ops._C.ggml_moe_a8, + ( + x, + qweight, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + quant_type, + qweight.shape[0], + 1, + x.shape[0], + ), + ) + + topk_ids = torch.zeros((1, 1), device="cuda", dtype=torch.int32) opcheck( torch.ops._C.ggml_moe_a8_vec, - (x, qweight, topk_ids, 1, quant_type, qweight.shape[0], x.shape[0])) + (x, qweight, topk_ids, 1, quant_type, qweight.shape[0], x.shape[0]), + ) diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py index 436d5cb64021..0988ba01759f 100644 --- a/tests/kernels/quantization/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -18,8 +18,8 @@ def get_gguf_sample_tensors( - hidden_size: int, - quant_type: GGMLQuantizationType) -> list[ReaderTensor]: + hidden_size: int, quant_type: GGMLQuantizationType +) -> list[ReaderTensor]: sample_dir = GGUF_SAMPLE filename = f"Quant_{quant_type.name}_{hidden_size}.gguf" sample_file = Path(sample_dir) / filename @@ -27,8 +27,8 @@ def get_gguf_sample_tensors( def get_gguf_MoE_tensors( - hidden_size: int, - quant_type: GGMLQuantizationType) -> list[ReaderTensor]: + hidden_size: int, quant_type: GGMLQuantizationType +) -> list[ReaderTensor]: sample_dir = GGUF_SAMPLE_MOE filename = f"Quant_{quant_type.name}_{hidden_size}.gguf" sample_file = Path(sample_dir) / filename @@ -68,17 
+68,20 @@ def get_gguf_MoE_tensors( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("quant_type", QUANT_TYPES) @torch.inference_mode() -def test_dequantize(hidden_size: int, dtype: torch.dtype, - quant_type: GGMLQuantizationType): +def test_dequantize( + hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType +): tensors = get_gguf_sample_tensors(hidden_size, quant_type) for tensor in tensors: shape_str = tensor.name.split("_")[-1] shape = map(int, shape_str.split("x")) - ref_output = torch.tensor(dequantize(tensor.data, quant_type), - device="cuda").to(dtype) - output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"), - quant_type, *list(shape), dtype) + ref_output = torch.tensor( + dequantize(tensor.data, quant_type), device="cuda" + ).to(dtype) + output = ops.ggml_dequantize( + torch.tensor(tensor.data, device="cuda"), quant_type, *list(shape), dtype + ) torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2) @@ -87,20 +90,21 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype, @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("quant_type", QUANT_TYPES) @torch.inference_mode() -def test_mmvq(hidden_size: int, dtype: torch.dtype, - quant_type: GGMLQuantizationType): +def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") for tensor in tensors: - weight = torch.tensor(dequantize(tensor.data, quant_type), - device="cuda").to(dtype) + weight = torch.tensor(dequantize(tensor.data, quant_type), device="cuda").to( + dtype + ) ref_output = x @ weight.T qweight = torch.tensor(tensor.data, device="cuda") - output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type, - qweight.shape[0]).to(dtype) + output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type, qweight.shape[0]).to( + dtype + ) torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1) @@ -121,17 +125,23 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype, GGMLQuantizationType.Q4_0, GGMLQuantizationType.Q5_0, GGMLQuantizationType.Q8_0, - ]) + ], +) @torch.inference_mode() -def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype, - quant_type: GGMLQuantizationType): +def test_mmq( + num_tokens: int, + hidden_size: int, + dtype: torch.dtype, + quant_type: GGMLQuantizationType, +): current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") for tensor in tensors: - weight = torch.tensor(dequantize(tensor.data, quant_type), - device="cuda").to(dtype) + weight = torch.tensor(dequantize(tensor.data, quant_type), device="cuda").to( + dtype + ) ref_output = x @ weight.T qweight = torch.tensor(tensor.data, device="cuda") @@ -141,10 +151,9 @@ def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype, # bfloat16 tends to accumulate and can greatly inflate rtol # since outputs are also very close to 0 rtols = {torch.half: 1e-1, torch.bfloat16: 1e4, torch.float: 2e1} - torch.testing.assert_close(output, - ref_output, - atol=atols[dtype], - rtol=rtols[dtype]) + torch.testing.assert_close( + output, ref_output, atol=atols[dtype], rtol=rtols[dtype] + ) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -153,35 +162,46 @@ def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype, @pytest.mark.parametrize("dtype", DTYPES) 
@pytest.mark.parametrize("quant_type", QUANT_TYPES) @torch.inference_mode() -def test_moe(num_tokens: int, hidden_size: int, dtype: torch.dtype, - quant_type: GGMLQuantizationType, top_k: int): +def test_moe( + num_tokens: int, + hidden_size: int, + dtype: torch.dtype, + quant_type: GGMLQuantizationType, + top_k: int, +): current_platform.seed_everything(0) H, E = 1024, 256 x = torch.rand((num_tokens, H), dtype=dtype, device="cuda") topk_weights = torch.rand(num_tokens, top_k, device="cuda", dtype=dtype) - topk_ids = torch.randint(0, - E, (num_tokens, top_k), - device="cuda", - dtype=torch.int32) + topk_ids = torch.randint( + 0, E, (num_tokens, top_k), device="cuda", dtype=torch.int32 + ) tensors = get_gguf_MoE_tensors(hidden_size, quant_type) w13 = tensors[0] w2 = tensors[1] - w13_dequant = torch.tensor(dequantize(w13.data, quant_type), - device="cuda").to(dtype) - - w2_dequant = torch.tensor(dequantize(w2.data, quant_type), - device="cuda").to(dtype) - - output = _fused_moe_gguf(x, torch.tensor(w13.data, device="cuda"), - torch.tensor(w2.data, - device="cuda"), topk_weights, - topk_ids, quant_type, quant_type, "silu") - - ref_output = fused_experts(x, w13_dequant, w2_dequant, topk_weights, - topk_ids).reshape(output.shape) + w13_dequant = torch.tensor(dequantize(w13.data, quant_type), device="cuda").to( + dtype + ) + + w2_dequant = torch.tensor(dequantize(w2.data, quant_type), device="cuda").to(dtype) + + output = _fused_moe_gguf( + x, + torch.tensor(w13.data, device="cuda"), + torch.tensor(w2.data, device="cuda"), + topk_weights, + topk_ids, + quant_type, + quant_type, + "silu", + ) + + ref_output = fused_experts( + x, w13_dequant, w2_dequant, topk_weights, topk_ids + ).reshape(output.shape) torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1) diff --git a/tests/kernels/quantization/test_gptq.py b/tests/kernels/quantization/test_gptq.py index 7fb57a1576bd..32782be3cee4 100644 --- a/tests/kernels/quantization/test_gptq.py +++ b/tests/kernels/quantization/test_gptq.py @@ -2,31 +2,28 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch - from tests.kernels.utils import opcheck + from vllm import _custom_ops as ops # noqa: F401 def test_gptq_shuffle_opcheck(): - weight = torch.randint(-2000000, - 2000000, (1792, 4096), - device='cuda', - dtype=torch.int32) - perm = torch.empty((0, ), device='cuda', dtype=torch.int32) + weight = torch.randint( + -2000000, 2000000, (1792, 4096), device="cuda", dtype=torch.int32 + ) + perm = torch.empty((0,), device="cuda", dtype=torch.int32) bit = 4 opcheck(torch.ops._C.gptq_shuffle, (weight, perm, bit)) def test_gptq_gemm_opcheck(): - a = torch.rand((240, 4096), device='cuda', dtype=torch.float16) - weight = torch.randint(-2000000, - 2000000, (512, 6144), - device='cuda', - dtype=torch.int32) - zeros = torch.zeros((32, 768), device='cuda', dtype=torch.int32) - scales = torch.rand((32, 6144), device='cuda', dtype=torch.float16) - idx = torch.empty((0, ), device='cuda', dtype=torch.int32) + a = torch.rand((240, 4096), device="cuda", dtype=torch.float16) + weight = torch.randint( + -2000000, 2000000, (512, 6144), device="cuda", dtype=torch.int32 + ) + zeros = torch.zeros((32, 768), device="cuda", dtype=torch.int32) + scales = torch.rand((32, 6144), device="cuda", dtype=torch.float16) + idx = torch.empty((0,), device="cuda", dtype=torch.int32) use_exllama = True bit = 4 - opcheck(torch.ops._C.gptq_gemm, - (a, weight, zeros, scales, idx, use_exllama, bit)) + opcheck(torch.ops._C.gptq_gemm, (a, weight, zeros, scales, 
idx, use_exllama, bit)) diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py index dc5fecbf4ccc..3af4da6d7e05 100644 --- a/tests/kernels/quantization/test_int8_kernel.py +++ b/tests/kernels/quantization/test_int8_kernel.py @@ -10,12 +10,12 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.quantization.utils.int8_utils import ( - per_token_quant_int8) + per_token_quant_int8, +) from vllm.platforms import current_platform if current_platform.get_device_capability() < (7, 0): - pytest.skip("INT8 Triton requires CUDA 7.0 or higher", - allow_module_level=True) + pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): @@ -25,14 +25,13 @@ def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): B = B.to(torch.float32) assert A.shape[-1] == B.shape[-1], "Dimension mismatch" - assert B.ndim == 2 and B.is_contiguous( - ), "B must be a 2D contiguous tensor" + assert B.ndim == 2 and B.is_contiguous(), "B must be a 2D contiguous tensor" # Reshape input M = A.numel() // A.shape[-1] B = B.t() # Transpose weight matrix N, K = B.shape - origin_C_shape = A.shape[:-1] + (K, ) + origin_C_shape = A.shape[:-1] + (K,) A = A.reshape(M, N) # As is per-token [M, 1], Bs is per-column [1, K] @@ -66,25 +65,22 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk): mask = topk_ids == i if mask.sum(): # First MLP layer: note that a_s is now per-token - inter_out = native_w8a8_per_token_matmul(a_q[mask], - w1[i], - a_s[mask], - w1_s[i], - output_dtype=a.dtype) + inter_out = native_w8a8_per_token_matmul( + a_q[mask], w1[i], a_s[mask], w1_s[i], output_dtype=a.dtype + ) # Activation function act_out = SiluAndMul().forward_native(inter_out) # Quantize activation output with per-token act_out_q, act_out_s = per_token_quant_int8(act_out) # Second MLP layer - out[mask] = native_w8a8_per_token_matmul(act_out_q, - w2[i], - act_out_s, - w2_s[i], - output_dtype=a.dtype) + out[mask] = native_w8a8_per_token_matmul( + act_out_q, w2[i], act_out_s, w2_s[i], output_dtype=a.dtype + ) # Apply routing weights and sum - return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) @pytest.fixture(autouse=True, scope="module") @@ -102,8 +98,10 @@ def setup_cuda(): SEEDS = [0] -@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed", - itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS)) +@pytest.mark.parametrize( + "M, N, K, E, topk, dtype, seed", + itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS), +) @torch.inference_mode() def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): torch.manual_seed(seed) @@ -144,7 +142,7 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): ) # Check results - rel_diff = (torch.mean( - torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / - torch.mean(torch.abs(ref_out.to(torch.float32)))) + rel_diff = torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)) + ) / torch.mean(torch.abs(ref_out.to(torch.float32))) assert rel_diff < 0.05 diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py index 5a37b976db9e..7b66077ca13c 100644 --- a/tests/kernels/quantization/test_int8_quant.py +++ 
b/tests/kernels/quantization/test_int8_quant.py @@ -3,9 +3,9 @@ import pytest import torch - from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.utils import opcheck + from vllm._custom_ops import scaled_int8_quant from vllm.platforms import current_platform @@ -19,26 +19,24 @@ def opcheck_int8_quant_static(output, input, scale, azp=None): if azp is None: - opcheck(torch.ops._C.static_scaled_int8_quant, - (output, input, scale, None)) + opcheck(torch.ops._C.static_scaled_int8_quant, (output, input, scale, None)) else: - opcheck(torch.ops._C.static_scaled_int8_quant, - (output, input, scale, azp)) + opcheck(torch.ops._C.static_scaled_int8_quant, (output, input, scale, azp)) def opcheck_int8_quant_dynamic(output, input, symmetric=True): - scale = torch.empty((input.numel() // input.shape[-1], 1), - device=input.device, - dtype=torch.float32) + scale = torch.empty( + (input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32 + ) if symmetric: - opcheck(torch.ops._C.dynamic_scaled_int8_quant, - (output, input, scale, None)) + opcheck(torch.ops._C.dynamic_scaled_int8_quant, (output, input, scale, None)) else: - azp = torch.empty((input.numel() // input.shape[-1], 1), - device=input.device, - dtype=torch.int32) - opcheck(torch.ops._C.dynamic_scaled_int8_quant, - (output, input, scale, azp)) + azp = torch.empty( + (input.numel() // input.shape[-1], 1), + device=input.device, + dtype=torch.int32, + ) + opcheck(torch.ops._C.dynamic_scaled_int8_quant, (output, input, scale, azp)) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -46,8 +44,9 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @torch.inference_mode() -def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, - dtype: torch.dtype, seed: int) -> None: +def test_dynamic_scaled_int8_quant( + num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int +) -> None: current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -69,30 +68,31 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @torch.inference_mode() -def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, - dtype: torch.dtype, seed: int) -> None: +def test_dynamic_scaled_int8_azp_quant( + num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int +) -> None: current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) - x = torch.rand(num_tokens, hidden_size, dtype=dtype, - device="cuda") * 1000 - 300 + x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 x_token_max, _ = x.to(dtype=torch.float32).max(dim=1, keepdim=True) x_token_min, _ = x.to(dtype=torch.float32).min(dim=1, keepdim=True) # calculate scale and azp, and adjust the range scales = (x_token_max - x_token_min) / torch.tensor(255.0) - azps = torch.round(torch.tensor(-128.0) - x_token_min / scales).to( - torch.int32) + azps = torch.round(torch.tensor(-128.0) - x_token_min / scales).to(torch.int32) - torch_out = ((x / scales).round() + azps).clamp( - int8_traits.min, int8_traits.max).to(torch.int8) - assert torch_out.min() >= int8_traits.min and torch_out.max( - ) <= int8_traits.max + torch_out = ( + ((x / scales).round() + azps) + .clamp(int8_traits.min, int8_traits.max) + .to(torch.int8) + ) + assert torch_out.min() >= 
int8_traits.min and torch_out.max() <= int8_traits.max ops_out, scales_out, azp_out = scaled_int8_quant(x, symmetric=False) - if (not torch.allclose(scales_out, scales)): + if not torch.allclose(scales_out, scales): print(torch.argmax(torch.abs(scales_out - scales))) torch.testing.assert_close(scales_out, scales) # big atol to account for rounding errors @@ -109,17 +109,18 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("scale", SCALE) @torch.inference_mode() -def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, - dtype: torch.dtype, seed: int, - scale: float) -> None: +def test_static_scaled_int8_quant( + num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float +) -> None: current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 scale_arg = torch.tensor([scale], dtype=torch.float32, device="cuda") - out1 = (x / scale_arg).round().clamp(int8_traits.min, - int8_traits.max).to(torch.int8) + out1 = ( + (x / scale_arg).round().clamp(int8_traits.min, int8_traits.max).to(torch.int8) + ) out2, scale2, _ = scaled_int8_quant(x, scale_arg) assert scale2 is scale_arg @@ -136,24 +137,28 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, @pytest.mark.parametrize("scale", SCALE) @pytest.mark.parametrize("azp", [-255, 54]) @torch.inference_mode() -def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, - dtype: torch.dtype, seed: int, - scale: float, azp: int) -> None: +def test_static_scaled_int8_azp_quant( + num_tokens: int, + hidden_size: int, + dtype: torch.dtype, + seed: int, + scale: float, + azp: int, +) -> None: current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) - x = torch.rand(num_tokens, hidden_size, dtype=dtype, - device="cuda") * 1000 - 300 + x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 - out1 = ((x / scale).round() + azp).clamp(int8_traits.min, - int8_traits.max).to(torch.int8) + out1 = ( + ((x / scale).round() + azp) + .clamp(int8_traits.min, int8_traits.max) + .to(torch.int8) + ) scale_arg = torch.tensor([scale], dtype=torch.float32, device="cuda") azp_arg = torch.tensor([azp], dtype=torch.int32, device="cuda") - out2, scale2, azp2 = scaled_int8_quant(x, - scale_arg, - azp_arg, - symmetric=False) + out2, scale2, azp2 = scaled_int8_quant(x, scale_arg, azp_arg, symmetric=False) assert scale2 is scale_arg assert azp2 is azp_arg @@ -173,10 +178,7 @@ def test_static_scaled_int8_azp_quant_saturating_cast(is_max: bool) -> None: int32_traits = torch.iinfo(torch.int32) val = float(int32_traits.max if is_max else int32_traits.min) - x_vals = [[ - nextafter(val, inf), val + 1, val, val - 1, - nextafter(val, -inf) - ]] + x_vals = [[nextafter(val, inf), val + 1, val, val - 1, nextafter(val, -inf)]] x = torch.tensor(x_vals, dtype=torch.float32, device="cuda") # The calculation in the kernel is: cast(cast(x / scale) + azp) diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index a7cb2a4e7f21..72d9d3e0e404 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -11,19 +11,20 @@ import pytest import torch - from tests.kernels.utils import opcheck + from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.machete_utils import ( - 
query_machete_supported_group_sizes) + query_machete_supported_group_sizes, +) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - pack_rows, quantize_weights) + pack_rows, + quantize_weights, +) from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel # unit tests to a common utility function. Currently the use of @@ -76,46 +77,63 @@ class Tensors: # Ch Scales Type, Tok Scales Type) # NOTE: None "Scale Type" means the act type is floating point # None "Output Type" means the output type is the same as the act type -TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype], - Optional[torch.dtype], bool] +TestTypeTuple = tuple[ + list[torch.dtype], ScalarType, Optional[torch.dtype], Optional[torch.dtype], bool +] TEST_TYPES = [ # GPTQ style - *(TypeConfig(act_type=a_type, - weight_type=w_type, - output_type=None, - group_scale_type=a_type, - group_zero_type=None, - channel_scale_type=None, - token_scale_type=None) - for w_type in [scalar_types.uint4b8, scalar_types.uint8b128] - for a_type in [torch.float16, torch.bfloat16]), + *( + TypeConfig( + act_type=a_type, + weight_type=w_type, + output_type=None, + group_scale_type=a_type, + group_zero_type=None, + channel_scale_type=None, + token_scale_type=None, + ) + for w_type in [scalar_types.uint4b8, scalar_types.uint8b128] + for a_type in [torch.float16, torch.bfloat16] + ), # AWQ style - *(TypeConfig(act_type=a_type, - weight_type=w_type, - output_type=None, - group_scale_type=a_type, - group_zero_type=a_type, - channel_scale_type=None, - token_scale_type=None) - for w_type in [scalar_types.uint4, scalar_types.uint8] - for a_type in [torch.float16, torch.bfloat16]), + *( + TypeConfig( + act_type=a_type, + weight_type=w_type, + output_type=None, + group_scale_type=a_type, + group_zero_type=a_type, + channel_scale_type=None, + token_scale_type=None, + ) + for w_type in [scalar_types.uint4, scalar_types.uint8] + for a_type in [torch.float16, torch.bfloat16] + ), # QQQ style - *(TypeConfig(act_type=torch.int8, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), - *(TypeConfig(act_type=torch.float8_e4m3fn, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), + *( + TypeConfig( + act_type=torch.int8, + weight_type=scalar_types.uint4b8, + output_type=torch.float16, + group_scale_type=group_scale_type, + group_zero_type=None, + channel_scale_type=torch.float, + token_scale_type=torch.float, + ) + for group_scale_type in [None, torch.float16] + ), + *( + TypeConfig( + act_type=torch.float8_e4m3fn, + weight_type=scalar_types.uint4b8, + output_type=torch.float16, + group_scale_type=group_scale_type, + group_zero_type=None, + channel_scale_type=torch.float, + token_scale_type=torch.float, + ) + for group_scale_type in [None, torch.float16] + ), ] # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel @@ -137,17 +155,18 
@@ def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): return zps if zps is None else -1 * s * (zps.to(s.dtype)) -def group_size_valid(shape: tuple[int, int, int], - group_size: Optional[int]) -> bool: +def group_size_valid(shape: tuple[int, int, int], group_size: Optional[int]) -> bool: return group_size is None or group_size == -1 or shape[2] % group_size == 0 -def machete_quantize_and_pack(atype: torch.dtype, - w: torch.Tensor, - wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], - zero_points: bool = False): +def machete_quantize_and_pack( + atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: Optional[torch.dtype], + group_size: Optional[int], + zero_points: bool = False, +): assert wtype.is_integer(), "TODO: support floating point weights" w_ref, w_q, w_s, w_zp = quantize_weights( @@ -156,7 +175,8 @@ def machete_quantize_and_pack(atype: torch.dtype, group_size=group_size, zero_points=zero_points, # to match how the kernel applies zps - ref_zero_points_after_scales=True) + ref_zero_points_after_scales=True, + ) w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) w_q = w_q.t().contiguous().t() # convert to col major @@ -167,15 +187,18 @@ def machete_quantize_and_pack(atype: torch.dtype, return w_ref, w_q_machete, w_s, w_zp -def create_test_tensors(shape: tuple[int, int, int], - types: TypeConfig, - group_size: Optional[int], - subset_stride_factor: Optional[int] = None) -> Tensors: +def create_test_tensors( + shape: tuple[int, int, int], + types: TypeConfig, + group_size: Optional[int], + subset_stride_factor: Optional[int] = None, +) -> Tensors: m, n, k = shape factor = subset_stride_factor or 1 - print("create_test_tensors, shape:", shape, "types:", types, "group_size:", - group_size) + print( + "create_test_tensors, shape:", shape, "types:", types, "group_size:", group_size + ) a = rand_data((m * factor, k * factor), types.act_type, scale=3, offset=2) w = rand_data((k * factor, n * factor), types.act_type, scale=3, offset=1) @@ -190,8 +213,13 @@ def create_test_tensors(shape: tuple[int, int, int], w = w.to(torch.float16) w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( - a.dtype, w, types.weight_type, types.group_scale_type, group_size, - types.group_zero_type is not None) + a.dtype, + w, + types.weight_type, + types.group_scale_type, + group_size, + types.group_zero_type is not None, + ) if not a.dtype.is_floating_point: aiinfo = torch.iinfo(a.dtype) @@ -200,35 +228,47 @@ def create_test_tensors(shape: tuple[int, int, int], a_ref = a.to(torch.float32) w_ref = w_ref.to(torch.float32) - w_ch_s = None if types.channel_scale_type is None else\ - rand_data((n,), types.channel_scale_type) - w_tok_s = None if types.token_scale_type is None else\ - rand_data((m,), types.token_scale_type) + w_ch_s = ( + None + if types.channel_scale_type is None + else rand_data((n,), types.channel_scale_type) + ) + w_tok_s = ( + None + if types.token_scale_type is None + else rand_data((m,), types.token_scale_type) + ) - return Tensors(w_ref=w_ref, - a_ref=a_ref, - a=a, - w_q=w_q_packed, - w_g_s=w_s, - w_g_zp=maybe_convert_zeropoints(w_zp, w_s), - w_ch_s=w_ch_s, - w_tok_s=w_tok_s) + return Tensors( + w_ref=w_ref, + a_ref=a_ref, + a=a, + w_q=w_q_packed, + w_g_s=w_s, + w_g_zp=maybe_convert_zeropoints(w_zp, w_s), + w_ch_s=w_ch_s, + w_tok_s=w_tok_s, + ) # None stype means scales use the same dtype as a -def machete_mm_test_helper(types: TypeConfig, - tensors: Tensors, - group_size: Optional[int] = None, - schedule: Optional[str] = 
None): +def machete_mm_test_helper( + types: TypeConfig, + tensors: Tensors, + group_size: Optional[int] = None, + schedule: Optional[str] = None, +): output_ref = torch.matmul(tensors.a_ref, tensors.w_ref) output_ref_type = output_ref.dtype if tensors.w_ch_s is not None: - output_ref = (output_ref.to(tensors.w_ch_s.dtype) * - tensors.w_ch_s.unsqueeze(0)).to(output_ref_type) + output_ref = ( + output_ref.to(tensors.w_ch_s.dtype) * tensors.w_ch_s.unsqueeze(0) + ).to(output_ref_type) if tensors.w_tok_s is not None: - output_ref = (output_ref.to(tensors.w_tok_s.dtype) * - tensors.w_tok_s.unsqueeze(1)).to(output_ref_type) + output_ref = ( + output_ref.to(tensors.w_tok_s.dtype) * tensors.w_tok_s.unsqueeze(1) + ).to(output_ref_type) output = ops.machete_mm( a=tensors.a, @@ -249,23 +289,23 @@ def machete_mm_test_helper(types: TypeConfig, # Relax atol as our reduction dim becomes larger (more rounding error) # Relax atol when we have zeropoints since the way machete applies # zeropoints (after scales) causes noise around 0 - atol = 1 if tensors.w_g_zp is not None\ + atol = ( + 1 + if tensors.w_g_zp is not None else min(5e-2 * math.sqrt(tensors.a.shape[1]), 1) + ) rtol = 1e-1 if tensors.a.element_size() >= 2 else 2e-1 - torch.testing.assert_close(output, - output_ref.to(output.dtype), - rtol=rtol, - atol=atol) + torch.testing.assert_close( + output, output_ref.to(output.dtype), rtol=rtol, atol=atol + ) -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -@pytest.mark.parametrize("shape", - MNK_SHAPES, - ids=lambda x: "x".join(str(v) for v in x)) +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type." +) +@pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x)) @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_all_schedules(shape, types: TypeConfig): - group_sizes: list[Optional[int]] = [] if types.group_scale_type is None: group_sizes = [None] @@ -279,20 +319,20 @@ def test_machete_all_schedules(shape, types: TypeConfig): tensors = create_test_tensors(shape, types, group_size) print(f"MNK = {shape}") for schedule in ops.machete_supported_schedules( - types.act_type, - types.weight_type, - group_scales_type=types.group_scale_type, - group_zeros_type=types.group_scale_type, - out_type=types.output_type): + types.act_type, + types.weight_type, + group_scales_type=types.group_scale_type, + group_zeros_type=types.group_scale_type, + out_type=types.output_type, + ): print(f"Testing schedule {schedule}") machete_mm_test_helper(types, tensors, group_size, schedule) -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -@pytest.mark.parametrize("shape", - MNK_SHAPES, - ids=lambda x: "x".join(str(v) for v in x)) +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type." +) +@pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x)) @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_heuristic(shape, types: TypeConfig): group_sizes: list[Optional[int]] = [] @@ -310,19 +350,22 @@ def test_machete_heuristic(shape, types: TypeConfig): # Test working on other devices -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type." 
+) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_machete_devices(device: str): group_size = 128 - type_config = TypeConfig(act_type=torch.float16, - weight_type=scalar_types.uint4b8, - output_type=None, - group_scale_type=torch.float16, - group_zero_type=None, - channel_scale_type=None, - token_scale_type=None) + type_config = TypeConfig( + act_type=torch.float16, + weight_type=scalar_types.uint4b8, + output_type=None, + group_scale_type=torch.float16, + group_zero_type=None, + channel_scale_type=None, + token_scale_type=None, + ) tensors = create_test_tensors((512, 4096, 4096), type_config, group_size) @@ -335,29 +378,30 @@ def test_machete_devices(device: str): # Test working with a subset of A and B -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type." +) def test_machete_subset(): group_size = 128 - type_config = TypeConfig(act_type=torch.float16, - weight_type=scalar_types.uint4b8, - output_type=None, - group_scale_type=torch.float16, - group_zero_type=None, - channel_scale_type=None, - token_scale_type=None) - - tensors = create_test_tensors((512, 4096, 4096), - type_config, - group_size, - subset_stride_factor=2) + type_config = TypeConfig( + act_type=torch.float16, + weight_type=scalar_types.uint4b8, + output_type=None, + group_scale_type=torch.float16, + group_zero_type=None, + channel_scale_type=None, + token_scale_type=None, + ) + + tensors = create_test_tensors( + (512, 4096, 4096), type_config, group_size, subset_stride_factor=2 + ) machete_mm_test_helper(type_config, tensors, group_size) # Test to make sure cuda graphs work class MacheteLayer(torch.nn.Module): - def __init__(self, **kwargs): super().__init__() self.kwargs = kwargs @@ -366,8 +410,9 @@ def forward(self, a): return ops.machete_mm(a=a, **self.kwargs) -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, reason="Machete is not supported on this GPU type." +) def test_machete_cuda_graph(): m, n, k = 512, 4096, 4096 @@ -379,7 +424,8 @@ def test_machete_cuda_graph(): zero_points = False w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( - a.dtype, b, wtype, stype, group_size, zero_points) + a.dtype, b, wtype, stype, group_size, zero_points + ) # Construct a trivial model with a single layer that calls a machete kernel model = MacheteLayer( diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index 92914bd5cbba..9850332d385c 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -4,36 +4,61 @@ Run `pytest tests/kernels/marlin/test_marlin_gemm.py`. 
""" + import pytest import torch - from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from tests.quantization.utils import is_quant_method_supported + from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( - GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, - GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) + GPTQ_MARLIN_24_MAX_PARALLEL, + GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, + GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES, +) from vllm.model_executor.layers.quantization.qqq import ( - MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS) + MARLIN_QQQ_MAX_PARALLEL, + MARLIN_QQQ_MIN_THREAD_N, + MARLIN_QQQ_SUPPORTED_GROUP_SIZES, + MARLIN_QQQ_SUPPORTED_NUM_BITS, +) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, - MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx, - marlin_make_workspace_new, marlin_permute_scales, - query_marlin_supported_quant_types) + GPTQ_MARLIN_MAX_PARALLEL, + GPTQ_MARLIN_MIN_THREAD_N, + MARLIN_SUPPORTED_GROUP_SIZES, + marlin_make_empty_g_idx, + marlin_make_workspace_new, + marlin_permute_scales, + query_marlin_supported_quant_types, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( - FP4_MARLIN_SUPPORTED_GROUP_SIZES, rand_marlin_weight_fp4_like) + FP4_MARLIN_SUPPORTED_GROUP_SIZES, + rand_marlin_weight_fp4_like, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( - marlin_quant_fp8_torch) + marlin_quant_fp8_torch, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - MarlinWorkspace, awq_marlin_quantize, get_weight_perm, marlin_quantize, - marlin_weights) + MarlinWorkspace, + awq_marlin_quantize, + get_weight_perm, + marlin_quantize, + marlin_weights, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( - marlin_24_quantize) + marlin_24_quantize, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import ( # noqa: E501 - marlin_qqq_quantize) + marlin_qqq_quantize, +) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights) + awq_pack, + gptq_pack, + gptq_quantize_weights, + quantize_weights, + sort_weights, +) from vllm.scalar_type import scalar_types ACT_ORDER_OPTS = [False, True] @@ -65,24 +90,27 @@ def compute_max_diff(output, output_ref): return torch.mean(torch.abs(output - output_ref)) / torch.mean( - torch.abs(output_ref)) + torch.abs(output_ref) + ) def rand_data(shape, dtype=torch.float16): return torch.randn(shape, dtype=dtype, device="cuda") -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="Marlin is not supported on this GPU type.") +@pytest.mark.skipif( + not is_quant_method_supported("gptq_marlin"), + reason="Marlin is not supported on this GPU type.", +) @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) -@pytest.mark.parametrize("quant_type", - query_marlin_supported_quant_types(False, False)) +@pytest.mark.parametrize("quant_type", query_marlin_supported_quant_types(False, False)) @pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS) @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) -def 
test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, - act_order, mnk_factors): +def test_gptq_marlin_repack( + k_chunk, n_chunk, quant_type, group_size, act_order, mnk_factors +): m_factor, n_factor, k_factor = mnk_factors size_k = k_chunk * k_factor @@ -105,7 +133,8 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, # Quantize (and apply act_order if provided) w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights( - b_weight, quant_type, group_size, act_order) + b_weight, quant_type, group_size, act_order + ) # Pack to GPTQ format q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) @@ -118,11 +147,14 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, # Pack to Marlin format weight_perm = get_weight_perm(quant_type.size_bits) - marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits, - weight_perm) + marlin_q_w_1 = marlin_weights( + q_w, size_k, size_n, quant_type.size_bits, weight_perm + ) - opcheck(torch.ops._C.gptq_marlin_repack, - (q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits)) + opcheck( + torch.ops._C.gptq_marlin_repack, + (q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits), + ) # Run Marlin repack GPU kernel marlin_q_w_2 = ops.gptq_marlin_repack( @@ -137,16 +169,16 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="Marlin is not supported on this GPU type.") +@pytest.mark.skipif( + not is_quant_method_supported("gptq_marlin"), + reason="Marlin is not supported on this GPU type.", +) @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) -@pytest.mark.parametrize("quant_type", - query_marlin_supported_quant_types(True)) +@pytest.mark.parametrize("quant_type", query_marlin_supported_quant_types(True)) @pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) -def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, - mnk_factors): +def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, mnk_factors): m_factor, n_factor, k_factor = mnk_factors size_k = k_chunk * k_factor @@ -161,21 +193,22 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, b_weight = rand_data((size_k, size_n)) # Quantize - w_ref, q_w, s, zp = quantize_weights(b_weight, - quant_type, - group_size, - zero_points=True) + w_ref, q_w, s, zp = quantize_weights( + b_weight, quant_type, group_size, zero_points=True + ) # Pack to AWQ format q_w_awq = awq_pack(q_w, quant_type.size_bits, size_k, size_n) # Pack to Marlin format weight_perm = get_weight_perm(quant_type.size_bits) - marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits, - weight_perm) + marlin_q_w_1 = marlin_weights( + q_w, size_k, size_n, quant_type.size_bits, weight_perm + ) - opcheck(torch.ops._C.awq_marlin_repack, - (q_w_awq, size_k, size_n, quant_type.size_bits)) + opcheck( + torch.ops._C.awq_marlin_repack, (q_w_awq, size_k, size_n, quant_type.size_bits) + ) # Run Marlin repack GPU kernel marlin_q_w_2 = ops.awq_marlin_repack( @@ -189,14 +222,16 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size, torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="Marlin is not supported on this GPU type.") +@pytest.mark.skipif( + not 
is_quant_method_supported("gptq_marlin"), + reason="Marlin is not supported on this GPU type.", +) @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) @pytest.mark.parametrize("quant_type", query_marlin_supported_quant_types()) @pytest.mark.parametrize( - "group_size", - set(MARLIN_SUPPORTED_GROUP_SIZES + FP4_MARLIN_SUPPORTED_GROUP_SIZES)) + "group_size", set(MARLIN_SUPPORTED_GROUP_SIZES + FP4_MARLIN_SUPPORTED_GROUP_SIZES) +) @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS) @pytest.mark.parametrize("is_k_full", K_FULL_OPTS) @@ -238,7 +273,8 @@ def test_gptq_marlin_gemm( if group_size != 16 or act_order: return w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like( - b_weight.T, group_size) + b_weight.T, group_size + ) g_idx = None sort_indices = None marlin_zp = None @@ -247,8 +283,7 @@ def test_gptq_marlin_gemm( return if act_order: return - w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch( - b_weight.T, group_size) + w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b_weight.T, group_size) g_idx = None sort_indices = None marlin_zp = None @@ -257,7 +292,8 @@ def test_gptq_marlin_gemm( if group_size == 16: return w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize( - b_weight, quant_type, group_size) + b_weight, quant_type, group_size + ) g_idx = None sort_indices = None marlin_s2 = None @@ -265,18 +301,36 @@ def test_gptq_marlin_gemm( if group_size == 16: return w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize( - b_weight, quant_type, group_size, act_order) + b_weight, quant_type, group_size, act_order + ) marlin_zp = None marlin_s2 = None workspace = marlin_make_workspace_new(w_ref.device) - opcheck(torch.ops._C.gptq_marlin_gemm, - (a_input, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, g_idx, - sort_indices, workspace, quant_type.id, a_input.shape[0], - b_weight.shape[1], a_input.shape[1], is_k_full, use_atomic_add, - use_fp32_reduce, False), - test_utils=DEFAULT_OPCHECK_TEST_UTILS) + opcheck( + torch.ops._C.gptq_marlin_gemm, + ( + a_input, + None, + marlin_q_w, + marlin_s, + marlin_s2, + marlin_zp, + g_idx, + sort_indices, + workspace, + quant_type.id, + a_input.shape[0], + b_weight.shape[1], + a_input.shape[1], + is_k_full, + use_atomic_add, + use_fp32_reduce, + False, + ), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) output = ops.gptq_marlin_gemm( a_input, @@ -308,23 +362,40 @@ def test_gptq_marlin_gemm( # TODO: find better way to test this? 
@torch.compile(fullgraph=True) -def marlin_24_gemm_tester(a_input, marlin_24_q_w_comp, marlin_24_meta, - marlin_24_s, scratch, quant_type, size_m, size_n, - size_k): - return ops.gptq_marlin_24_gemm(a_input, marlin_24_q_w_comp, marlin_24_meta, - marlin_24_s, scratch, quant_type, size_m, - size_n, size_k) +def marlin_24_gemm_tester( + a_input, + marlin_24_q_w_comp, + marlin_24_meta, + marlin_24_s, + scratch, + quant_type, + size_m, + size_n, + size_k, +): + return ops.gptq_marlin_24_gemm( + a_input, + marlin_24_q_w_comp, + marlin_24_meta, + marlin_24_s, + scratch, + quant_type, + size_m, + size_n, + size_k, + ) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="Marlin is not supported on this GPU type.") +@pytest.mark.skipif( + not is_quant_method_supported("gptq_marlin"), + reason="Marlin is not supported on this GPU type.", +) @pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS) @pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS) @pytest.mark.parametrize("quant_type", GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) @pytest.mark.parametrize("group_size", GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) -def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size, - mnk_factors): +def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size, mnk_factors): m_factor, n_factor, k_factor = mnk_factors size_m = m_factor @@ -334,19 +405,31 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size, a_input = rand_data((size_m, size_k)) b_weight = rand_data((size_k, size_n)) - (w_24_ref, marlin_24_q_w_comp, marlin_24_meta, - marlin_24_s) = marlin_24_quantize(b_weight, quant_type, group_size) + (w_24_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = marlin_24_quantize( + b_weight, quant_type, group_size + ) - workspace_24 = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N, - GPTQ_MARLIN_24_MAX_PARALLEL) + workspace_24 = MarlinWorkspace( + size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL + ) output_ref = torch.matmul(a_input, w_24_ref) - opcheck(torch.ops._C.gptq_marlin_24_gemm, - (a_input, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, - workspace_24.scratch, quant_type.id, a_input.shape[0], - b_weight.shape[1], a_input.shape[1]), - test_utils=DEFAULT_OPCHECK_TEST_UTILS) + opcheck( + torch.ops._C.gptq_marlin_24_gemm, + ( + a_input, + marlin_24_q_w_comp, + marlin_24_meta, + marlin_24_s, + workspace_24.scratch, + quant_type.id, + a_input.shape[0], + b_weight.shape[1], + a_input.shape[1], + ), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) output = marlin_24_gemm_tester( a_input, @@ -367,8 +450,10 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size, assert max_diff < 0.04 -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="Marlin is not supported on this GPU type.") +@pytest.mark.skipif( + not is_quant_method_supported("gptq_marlin"), + reason="Marlin is not supported on this GPU type.", +) @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) @pytest.mark.parametrize("group_size", HQQ_SUPPORTED_GROUP_SIZES) @@ -392,22 +477,22 @@ def test_hqq_marlin_gemm( a_input = rand_data((size_m, size_k)) dev = a_input.device - b_weight = torch.randint(0, - 10, (size_n, size_k), - dtype=torch.uint8, - device=dev) + b_weight = torch.randint(0, 10, (size_n, size_k), dtype=torch.uint8, device=dev) scale = rand_data((size_n, size_k // group_size)) zero = rand_data((size_n, size_k // group_size)) 
gptq_w_q = gptq_pack(b_weight.transpose(1, 0), 4, size_k, size_n) sort_indices = torch.empty(0, dtype=torch.int, device=dev) - marlin_w_q = ops.gptq_marlin_repack(gptq_w_q, sort_indices, size_k, size_n, - 4).to(dev) - marlin_s = marlin_permute_scales(scale.transpose(1, 0), size_k, size_n, - group_size).to(dev) - marlin_zp = marlin_permute_scales(zero.transpose(1, 0), size_k, size_n, - group_size).to(dev) + marlin_w_q = ops.gptq_marlin_repack(gptq_w_q, sort_indices, size_k, size_n, 4).to( + dev + ) + marlin_s = marlin_permute_scales( + scale.transpose(1, 0), size_k, size_n, group_size + ).to(dev) + marlin_zp = marlin_permute_scales( + zero.transpose(1, 0), size_k, size_n, group_size + ).to(dev) g_idx = marlin_make_empty_g_idx(dev) g_idx_sort_indices = marlin_make_empty_g_idx(dev) @@ -438,8 +523,7 @@ def test_hqq_marlin_gemm( s_flat = scale.reshape(-1, 1) dequant = (b_flat - zp_flat) * s_flat - output_ref = torch.matmul(a_input, - dequant.reshape(b_weight.shape).transpose(1, 0)) + output_ref = torch.matmul(a_input, dequant.reshape(b_weight.shape).transpose(1, 0)) torch.cuda.synchronize() @@ -448,8 +532,10 @@ def test_hqq_marlin_gemm( assert max_diff < 0.04 -@pytest.mark.skipif(not is_quant_method_supported("qqq"), - reason="Marlin is not supported on this GPU type.") +@pytest.mark.skipif( + not is_quant_method_supported("qqq"), + reason="Marlin is not supported on this GPU type.", +) @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) @pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS) @@ -473,22 +559,34 @@ def test_marlin_qqq_gemm( b_weight = rand_data((size_k, size_n)) # Quantize activations - s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to( - torch.float) - q_a = (a_input / s_a).round().clamp(int8_traits.min, - int8_traits.max).to(torch.int8) + s_a = ( + a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to(torch.float) + ) + q_a = (a_input / s_a).round().clamp(int8_traits.min, int8_traits.max).to(torch.int8) # Quantize weights - w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \ - marlin_qqq_quantize(b_weight, num_bits, group_size) + w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = ( + marlin_qqq_quantize(b_weight, num_bits, group_size) + ) - workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_MAX_PARALLEL) + workspace = MarlinWorkspace( + size_n, MARLIN_QQQ_MIN_THREAD_N, MARLIN_QQQ_MAX_PARALLEL + ) - opcheck(torch.ops._C.marlin_qqq_gemm, - (q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel, - marlin_qqq_s_group, workspace.scratch, a_input.shape[0], - b_weight.shape[1], a_input.shape[1])) + opcheck( + torch.ops._C.marlin_qqq_gemm, + ( + q_a, + marlin_qqq_q_w, + s_a, + marlin_qqq_s_channel, + marlin_qqq_s_group, + workspace.scratch, + a_input.shape[0], + b_weight.shape[1], + a_input.shape[1], + ), + ) output = ops.marlin_qqq_gemm( q_a, @@ -518,11 +616,12 @@ def test_marlin_gemm_subset_input(): big_m = size_m * 2 big_k = size_k * 2 - a_input = rand_data((big_m, big_k))[8:size_m + 8, 8:size_k + 8] + a_input = rand_data((big_m, big_k))[8 : size_m + 8, 8 : size_k + 8] b_weight = rand_data((size_k, size_n)) w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize( - b_weight, quant_type, group_size, False) + b_weight, quant_type, group_size, False + ) marlin_zp = marlin_make_empty_g_idx(marlin_s.device) workspace = marlin_make_workspace_new(a_input.device) @@ -559,11 +658,12 @@ def test_marlin_gemm_opcheck(): size_m = 2048 
size_n = 4096 size_k = 4096 - a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16) - w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32) - s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16) - wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_MAX_PARALLEL).scratch + a = torch.rand((size_m, size_n), device="cuda", dtype=torch.float16) + w = torch.randint(-5, 5, (256, 8192), device="cuda", dtype=torch.int32) + s = torch.full((32, size_k), 0.125, device="cuda", dtype=torch.float16) + wk = MarlinWorkspace( + size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL + ).scratch x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) torch.testing.assert_close(x, y) diff --git a/tests/kernels/quantization/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py index 3a8f4c17598c..e9b091d06697 100644 --- a/tests/kernels/quantization/test_nvfp4_quant.py +++ b/tests/kernels/quantization/test_nvfp4_quant.py @@ -8,15 +8,27 @@ from vllm.scalar_type import scalar_types if not current_platform.has_device_capability(100): - pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.", - allow_module_level=True) + pytest.skip( + reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True, + ) DTYPES = [torch.float16, torch.bfloat16] SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)] -PAD_SHAPES = [(90, 64), (150, 64), (128, 48), (128, 80), (150, 80), (90, 48), - (90, 128), (150, 128), (150, 48), (90, 80)] +PAD_SHAPES = [ + (90, 64), + (150, 64), + (128, 48), + (128, 80), + (150, 80), + (90, 48), + (90, 128), + (150, 128), + (150, 48), + (90, 80), +] SEEDS = [42] -CUDA_DEVICES = ['cuda:0'] +CUDA_DEVICES = ["cuda:0"] FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max @@ -31,7 +43,22 @@ # 0001 -> 0.5 # 0000 -> 0 E2M1_TO_FLOAT32 = [ - 0., 0.5, 1., 1.5, 2., 3., 4., 6., 0., -0.5, -1., -1.5, -2., -3., -4., -6. 
+ 0.0, + 0.5, + 1.0, + 1.5, + 2.0, + 3.0, + 4.0, + 6.0, + 0.0, + -0.5, + -1.0, + -1.5, + -2.0, + -3.0, + -4.0, + -6.0, ] BLOCK_SIZE = 16 @@ -74,8 +101,7 @@ def ref_nvfp4_quant(x, global_scale): assert x.ndim == 2 m, n = x.shape x = torch.reshape(x, (m, n // BLOCK_SIZE, BLOCK_SIZE)) - vec_max = torch.max(torch.abs(x), dim=-1, - keepdim=True)[0].to(torch.float32) + vec_max = torch.max(torch.abs(x), dim=-1, keepdim=True)[0].to(torch.float32) scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX)) scale = scale.to(torch.float8_e4m3fn).to(torch.float32) output_scale = get_reciprocal(scale * get_reciprocal(global_scale)) @@ -131,7 +157,7 @@ def test_quantize_to_fp4( def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None: dtype = torch.float16 current_platform.seed_everything(42) - torch.set_default_device('cuda:0') + torch.set_default_device("cuda:0") m, n = pad_shape diff --git a/tests/kernels/quantization/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py index 0b45c2298175..d2a352ce8445 100644 --- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py +++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py @@ -2,15 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch -from nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, - dequantize_nvfp4_to_dtype) +from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dtype from vllm import _custom_ops as ops from vllm.platforms import current_platform if not current_platform.has_device_capability(100): - pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.", - allow_module_level=True) + pytest.skip( + reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True, + ) DTYPES = [torch.float16, torch.bfloat16] # m, n, k @@ -19,26 +20,31 @@ SHAPES.extend(PAD_SHAPES) SEEDS = [42] -CUDA_DEVICES = ['cuda:0'] +CUDA_DEVICES = ["cuda:0"] -def get_ref_results(a_fp4, b_fp4, a_sf, b_sf, a_global_scale, b_global_scale, - m, n, dtype, block_size, device): +def get_ref_results( + a_fp4, + b_fp4, + a_sf, + b_sf, + a_global_scale, + b_global_scale, + m, + n, + dtype, + block_size, + device, +): _, m_k = a_fp4.shape _, n_k = b_fp4.shape - assert (m_k == n_k) - a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4, - a_sf, - a_global_scale, - dtype=dtype, - device=device, - block_size=block_size) - b_in_dtype = dequantize_nvfp4_to_dtype(b_fp4, - b_sf, - b_global_scale, - dtype=dtype, - device=device, - block_size=block_size) + assert m_k == n_k + a_in_dtype = dequantize_nvfp4_to_dtype( + a_fp4, a_sf, a_global_scale, dtype=dtype, device=device, block_size=block_size + ) + b_in_dtype = dequantize_nvfp4_to_dtype( + b_fp4, b_sf, b_global_scale, dtype=dtype, device=device, block_size=block_size + ) return torch.matmul(a_in_dtype, b_in_dtype.t()) @@ -60,22 +66,31 @@ def test_nvfp4_gemm( a_dtype = torch.randn((m, k), dtype=dtype, device=device) b_dtype = torch.randn((n, k), dtype=dtype, device=device) - a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / - torch.amax(a_dtype.flatten(), dim=-1)).to(torch.float32) - b_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / - torch.amax(b_dtype.flatten(), dim=-1)).to(torch.float32) - alpha = 1. 
/ (a_global_scale * b_global_scale) + a_global_scale = ( + (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a_dtype.flatten(), dim=-1) + ).to(torch.float32) + b_global_scale = ( + (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(b_dtype.flatten(), dim=-1) + ).to(torch.float32) + alpha = 1.0 / (a_global_scale * b_global_scale) a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a_dtype, a_global_scale) b_fp4, b_scale_interleaved = ops.scaled_fp4_quant(b_dtype, b_global_scale) - expected_out = get_ref_results(a_fp4, b_fp4, a_scale_interleaved, - b_scale_interleaved, a_global_scale, - b_global_scale, m, n, dtype, block_size, - device) - out = ops.cutlass_scaled_fp4_mm(a_fp4, b_fp4, a_scale_interleaved, - b_scale_interleaved, alpha, dtype) + expected_out = get_ref_results( + a_fp4, + b_fp4, + a_scale_interleaved, + b_scale_interleaved, + a_global_scale, + b_global_scale, + m, + n, + dtype, + block_size, + device, + ) + out = ops.cutlass_scaled_fp4_mm( + a_fp4, b_fp4, a_scale_interleaved, b_scale_interleaved, alpha, dtype + ) - torch.testing.assert_close(out, - expected_out.to(dtype=dtype), - atol=1e-1, - rtol=1e-1) + torch.testing.assert_close(out, expected_out.to(dtype=dtype), atol=1e-1, rtol=1e-1) diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py index 533a4fe59677..539689989fff 100644 --- a/tests/kernels/quantization/test_rocm_skinny_gemms.py +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -2,9 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch +from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant import vllm._custom_ops as ops -from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant from vllm.platforms import current_platform DTYPES = [torch.bfloat16, torch.float16] @@ -20,8 +20,7 @@ @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("rows_per_block", [2, 4, 8, 16]) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.skipif(not current_platform.is_rocm(), - reason="only test for rocm") +@pytest.mark.skipif(not current_platform.is_rocm(), reason="only test for rocm") @torch.inference_mode() def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed): torch.manual_seed(seed) @@ -39,8 +38,7 @@ def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed): @pytest.mark.parametrize("m", [8] + M) # m >= 8 @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.skipif(not current_platform.is_rocm(), - reason="only test for rocm") +@pytest.mark.skipif(not current_platform.is_rocm(), reason="only test for rocm") def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed): torch.manual_seed(seed) cu_count = current_platform.get_cu_count() @@ -61,7 +59,8 @@ def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed): @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.skipif( not (current_platform.is_rocm() and current_platform.supports_fp8()), - reason="only test for rocm fp8") + reason="only test for rocm fp8", +) def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed): torch.manual_seed(seed) @@ -71,12 +70,9 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed): A, scale_a = ref_dynamic_per_tensor_fp8_quant(A) B, scale_b = ref_dynamic_per_tensor_fp8_quant(B) - ref_out = torch._scaled_mm(A, - B.t(), - out_dtype=dtype, - scale_a=scale_a, - scale_b=scale_b) - out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b, - current_platform.get_cu_count()) + ref_out = torch._scaled_mm( + A, 
B.t(), out_dtype=dtype, scale_a=scale_a, scale_b=scale_b + ) + out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b, current_platform.get_cu_count()) assert torch.allclose(out, ref_out, rtol=0.01) diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 8a2cc3baced2..26d49dad7396 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -4,6 +4,7 @@ Run `pytest tests/kernels/test_triton_scaled_mm.py`. """ + import importlib from typing import Optional @@ -15,17 +16,19 @@ device = "cuda" triton_scaled_mm_module = importlib.import_module( - "vllm.model_executor.layers.quantization.compressed_tensors." - "triton_scaled_mm") + "vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm" +) triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm -def torch_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: +def torch_scaled_mm( + a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: type[torch.dtype], + bias: Optional[torch.Tensor] = None, +) -> torch.Tensor: out = torch.mm(a.to(torch.float32), b.to(torch.float32)) out = scale_a * out out = scale_b.T * out @@ -44,20 +47,22 @@ def get_8bit_types(): # This test is to check regressions for int8 support on ROCm. -@pytest.mark.parametrize("model_path", [ - "neuralmagic/Llama-3.2-1B-quantized.w8a8", -]) +@pytest.mark.parametrize( + "model_path", + [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8", + ], +) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [10]) -@pytest.mark.skipif(not current_platform.is_rocm(), - reason="Should only run on ROCm") -def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path, - max_tokens, num_logprobs): +@pytest.mark.skipif(not current_platform.is_rocm(), reason="Should only run on ROCm") +def test_rocm_compressed_tensors_w8a8( + vllm_runner, example_prompts, model_path, max_tokens, num_logprobs +): dtype = "bfloat16" with vllm_runner(model_path, dtype=dtype) as vllm_model: - vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, - num_logprobs) + vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, num_logprobs) @pytest.mark.parametrize("M", [1, 33, 64, 512]) @@ -68,10 +73,10 @@ def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path, @pytest.mark.parametrize("use_scalar_scale_a", [True, False]) @pytest.mark.parametrize("use_scalar_scale_b", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) -def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a, - use_scalar_scale_b, use_bias): - is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t - ).is_floating_point() +def test_scaled_mm( + M, N, K, in_dtype, out_dtype, use_scalar_scale_a, use_scalar_scale_b, use_bias +): + is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point() current_platform.seed_everything(0) @@ -85,10 +90,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a, # # So, the values here are kept small enough to avoid this situation. 
if is_floating_point_type(in_dtype): - a = (0.25 * torch.rand( - (M, K), dtype=torch.float32, device=device)).to(in_dtype) - b = (0.25 * torch.rand( - (K, N), dtype=torch.float32, device=device)).to(in_dtype) + a = (0.25 * torch.rand((M, K), dtype=torch.float32, device=device)).to(in_dtype) + b = (0.25 * torch.rand((K, N), dtype=torch.float32, device=device)).to(in_dtype) else: a = torch.randint(-32, 32, (M, K), dtype=in_dtype, device=device) b = torch.randint(-32, 32, (K, N), dtype=in_dtype, device=device) @@ -105,7 +108,7 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a, bias = None if use_bias: - bias = torch.rand((N, ), device=device, dtype=out_dtype) + bias = torch.rand((N,), device=device, dtype=out_dtype) c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py index 90380b872d6c..a4619f5846b1 100644 --- a/tests/kernels/test_apply_repetition_penalties.py +++ b/tests/kernels/test_apply_repetition_penalties.py @@ -4,8 +4,10 @@ import torch from tests.kernels.utils import opcheck -from vllm._custom_ops import (apply_repetition_penalties_cuda, - apply_repetition_penalties_torch) +from vllm._custom_ops import ( + apply_repetition_penalties_cuda, + apply_repetition_penalties_torch, +) from vllm.platforms import current_platform NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025] @@ -21,8 +23,9 @@ @pytest.mark.parametrize("repetition_penalty", REPETITION_PENALTY_VALUES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.skipif(not current_platform.is_cuda(), - reason="This test for checking CUDA kernel") +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="This test for checking CUDA kernel" +) @torch.inference_mode() def test_apply_repetition_penalties( num_seqs: int, @@ -32,7 +35,7 @@ def test_apply_repetition_penalties( seed: int, ) -> None: """ - Test the apply_repetition_penalties custom op + Test the apply_repetition_penalties custom op against a reference implementation. 
""" current_platform.seed_everything(seed) @@ -46,39 +49,40 @@ def test_apply_repetition_penalties( output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool) # Mark some tokens as repeated in prompt and output - prompt_indices = torch.randint(0, vocab_size, - (num_seqs, max(1, vocab_size // 200))) - output_indices = torch.randint(0, vocab_size, - (num_seqs, max(1, vocab_size // 200))) + prompt_indices = torch.randint(0, vocab_size, (num_seqs, max(1, vocab_size // 200))) + output_indices = torch.randint(0, vocab_size, (num_seqs, max(1, vocab_size // 200))) for i in range(num_seqs): prompt_mask[i, prompt_indices[i]] = True output_mask[i, output_indices[i]] = True # Create repetition penalties tensor - repetition_penalties = torch.full((num_seqs, ), - repetition_penalty, - dtype=dtype) + repetition_penalties = torch.full((num_seqs,), repetition_penalty, dtype=dtype) # Run all three implementations logits_torch = logits.clone() logits_cuda = logits.clone() - apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask, - repetition_penalties) - apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask, - repetition_penalties) + apply_repetition_penalties_torch( + logits_torch, prompt_mask, output_mask, repetition_penalties + ) + apply_repetition_penalties_cuda( + logits_cuda, prompt_mask, output_mask, repetition_penalties + ) # Compare all outputs to reference torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3) # Test the operator by applying the opcheck utility - opcheck(torch.ops._C.apply_repetition_penalties_, - (logits.clone(), prompt_mask, output_mask, repetition_penalties)) + opcheck( + torch.ops._C.apply_repetition_penalties_, + (logits.clone(), prompt_mask, output_mask, repetition_penalties), + ) -@pytest.mark.skipif(not current_platform.is_cuda(), - reason="This test for checking CUDA kernel") +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="This test for checking CUDA kernel" +) @torch.inference_mode() def test_apply_repetition_penalties_zero_seqs() -> None: """ @@ -104,22 +108,24 @@ def test_apply_repetition_penalties_zero_seqs() -> None: # No tokens to mark as repeated since num_seqs=0 # Create repetition penalties tensor - repetition_penalties = torch.full((num_seqs, ), - repetition_penalty, - dtype=dtype) + repetition_penalties = torch.full((num_seqs,), repetition_penalty, dtype=dtype) # Run all three implementations logits_torch = logits.clone() logits_cuda = logits.clone() - apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask, - repetition_penalties) - apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask, - repetition_penalties) + apply_repetition_penalties_torch( + logits_torch, prompt_mask, output_mask, repetition_penalties + ) + apply_repetition_penalties_cuda( + logits_cuda, prompt_mask, output_mask, repetition_penalties + ) # Compare all outputs to reference torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3) # Test the operator by applying the opcheck utility - opcheck(torch.ops._C.apply_repetition_penalties_, - (logits.clone(), prompt_mask, output_mask, repetition_penalties)) + opcheck( + torch.ops._C.apply_repetition_penalties_, + (logits.clone(), prompt_mask, output_mask, repetition_penalties), + ) diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py index 2b745b84dae6..f305dffb6cd8 100644 --- a/tests/kernels/test_cutlass_mla_decode.py +++ b/tests/kernels/test_cutlass_mla_decode.py @@ -11,34 
+11,29 @@ if not current_platform.has_device_capability(100): pytest.skip( reason="Cutlass MLA Requires compute capability of 10 or above.", - allow_module_level=True) + allow_module_level=True, + ) def ref_mla( - out: Tensor, # (bs, num_heads, v_head_dim) - query: Tensor, # (bs, num_heads, head_dim) - kv_cache: Tensor, # (num_blocks, block_size, head_dim) - scale: float, - block_tables: Tensor, # (bs, max_num_blocks) - seq_lens: Tensor, # (bs,) + out: Tensor, # (bs, num_heads, v_head_dim) + query: Tensor, # (bs, num_heads, head_dim) + kv_cache: Tensor, # (num_blocks, block_size, head_dim) + scale: float, + block_tables: Tensor, # (bs, max_num_blocks) + seq_lens: Tensor, # (bs,) ): bs, num_heads, v_head_dim = out.shape head_dim = query.shape[2] for i in range(bs): # gather and flatten KV-cache - kv = kv_cache[ - block_tables[i]] # (max_num_blocks, block_size, head_dim) - kv = kv.view(1, -1, - head_dim)[:, :seq_lens[i]] # (1, seq_len, head_dim) + kv = kv_cache[block_tables[i]] # (max_num_blocks, block_size, head_dim) + kv = kv.view(1, -1, head_dim)[:, : seq_lens[i]] # (1, seq_len, head_dim) v = kv[:, :, :v_head_dim] q = query[i].view(num_heads, 1, head_dim) - o = F.scaled_dot_product_attention(q, - kv, - v, - scale=scale, - enable_gqa=True) + o = F.scaled_dot_product_attention(q, kv, v, scale=scale, enable_gqa=True) out[i] = o.view(num_heads, v_head_dim) return out @@ -49,10 +44,11 @@ def ref_mla( @pytest.mark.parametrize("bs", [1, 2, 4]) @pytest.mark.parametrize("varlen", [False, True]) @pytest.mark.parametrize("block_size", [16, 64, 128]) -def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int, - varlen: bool, block_size: int): +def test_cutlass_mla_decode( + dtype: torch.dtype, mean_seq_len: int, bs: int, varlen: bool, block_size: int +): torch.set_default_dtype(dtype) - torch.set_default_device('cuda') + torch.set_default_device("cuda") torch.manual_seed(42) d = 576 @@ -61,12 +57,12 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int, q_nope_dim = 128 q_pe_dim = 64 - scale = (q_nope_dim + q_pe_dim)**(-0.5) + scale = (q_nope_dim + q_pe_dim) ** (-0.5) if varlen: seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2) seq_lens = seq_lens.clip(2).to(torch.int32) else: - seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32) + seq_lens = torch.full((bs,), mean_seq_len, dtype=torch.int32) max_seq_len = seq_lens.max().item() block_num = (max_seq_len + block_size - 1) // block_size @@ -79,9 +75,7 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int, # Amplify input values to ensure test coverage of edge cases where CUTLASS # kernel errors occur with split_k settings. 
q = torch.randn(bs, h_q, d) * 100 - block_table = torch.randint(0, - bs * block_num, (bs, block_num), - dtype=torch.int32) + block_table = torch.randint(0, bs * block_num, (bs, block_num), dtype=torch.int32) kv_cache = torch.randn(block_table.numel(), block_size, d) @@ -90,7 +84,8 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int, out_ans = torch.zeros_like(out_ref) q_nope = q[:, :, :dv].clone() q_pe = q[:, :, dv:].clone() - ops.cutlass_mla_decode(out_ans, q_nope, q_pe, kv_cache, seq_lens, - block_table, scale) + ops.cutlass_mla_decode( + out_ans, q_nope, q_pe, kv_cache, seq_lens, block_table, scale + ) torch.testing.assert_close(out_ans, out_ref, atol=1e-2, rtol=1e-2) diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index e25556c89fb9..e67f04c07b3b 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -43,10 +43,9 @@ def test_flex_attention_vs_default_backend(monkeypatch): "The capital of France is", ] - sampling_params = SamplingParams(temperature=0.0, - top_p=1.0, - seed=seed, - max_tokens=max_tokens) + sampling_params = SamplingParams( + temperature=0.0, top_p=1.0, seed=seed, max_tokens=max_tokens + ) # Run with flex attention with monkeypatch.context() as m: @@ -76,8 +75,7 @@ def test_flex_attention_vs_default_backend(monkeypatch): output_default = llm_default.generate(prompts, sampling_params) # Compare outputs from both backends - for i, (flex_result, - default_result) in enumerate(zip(output_flex, output_default)): + for i, (flex_result, default_result) in enumerate(zip(output_flex, output_default)): prompt = prompts[i] flex_text = flex_result.outputs[0].text default_text = default_result.outputs[0].text @@ -85,7 +83,8 @@ def test_flex_attention_vs_default_backend(monkeypatch): assert flex_text == default_text, ( f"FlexAttention output doesn't match default for: {prompt!r}\n" f"FlexAttention: {flex_text!r}\n" - f"Default: {default_text!r}") + f"Default: {default_text!r}" + ) if __name__ == "__main__": diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py index 803453a20d81..c79e6105e69f 100644 --- a/tests/kernels/test_fused_quant_activation.py +++ b/tests/kernels/test_fused_quant_activation.py @@ -13,13 +13,12 @@ NUM_TOKENS = [1, 17, 86, 1234, 3045] # Arbitrary values for testing HIDDEN_SIZES = [16, 48, 128, 1562, 4096] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] -def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor, - scale: torch.Tensor) -> torch.Tensor: +def ref_impl( + silu_and_mul: SiluAndMul, x: torch.Tensor, scale: torch.Tensor +) -> torch.Tensor: silu_and_mul_out = silu_and_mul.forward_native(x) out, scales = ops.scaled_fp8_quant(silu_and_mul_out, scale) return out @@ -27,9 +26,7 @@ def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor, def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: out_shape = (x.shape[0], x.shape[1] // 2) - out = torch.empty(out_shape, - dtype=current_platform.fp8_dtype(), - device=x.device) + out = torch.empty(out_shape, dtype=current_platform.fp8_dtype(), device=x.device) torch.ops._C.silu_and_mul_quant(out, x, scale) return out @@ -57,7 +54,7 @@ def test_silu_and_mul( layer = SiluAndMul() # Make inputs - scale = (torch.randn((1), device=device, dtype=torch.float32)) + scale = torch.randn((1), 
device=device, dtype=torch.float32) x = torch.randn(num_tokens, hidden_size, dtype=dtype) ref_out = ref_impl(layer, x, scale) @@ -66,6 +63,7 @@ def test_silu_and_mul( assert ref_out.dtype == quant_dtype assert ops_out.dtype == quant_dtype assert ref_out.shape == ops_out.shape - assert torch.allclose(ref_out.to(dtype=torch.float32), - ops_out.to(dtype=torch.float32)) + assert torch.allclose( + ref_out.to(dtype=torch.float32), ops_out.to(dtype=torch.float32) + ) opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale)) diff --git a/tests/kernels/test_triton_flash_attention.py b/tests/kernels/test_triton_flash_attention.py index 1c31cfb25e5a..4b0bbb992d2e 100644 --- a/tests/kernels/test_triton_flash_attention.py +++ b/tests/kernels/test_triton_flash_attention.py @@ -4,21 +4,24 @@ Run `pytest tests/kernels/test_triton_flash_attention.py`. """ + import pytest import torch -from vllm.attention.ops.triton_flash_attention import (SUPPORTED_LAYOUTS, - MetaData, - compute_alibi_tensor, - scale_fp8, - triton_attention_rocm) +from vllm.attention.ops.triton_flash_attention import ( + SUPPORTED_LAYOUTS, + MetaData, + compute_alibi_tensor, + scale_fp8, + triton_attention_rocm, +) from vllm.platforms import current_platform class ReferenceAttention: - - def __init__(self, Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype, - input_metadata): + def __init__( + self, Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype, input_metadata + ): self.Z = Z self.HQ = HQ self.HK = HK @@ -30,21 +33,23 @@ def __init__(self, Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype, self.input_metadata = input_metadata def fwd(self, q, k, v): - scores = torch.einsum('bhqd,bhkd->bhqk', q, - k).float() * self.input_metadata.sm_scale + scores = ( + torch.einsum("bhqd,bhkd->bhqk", q, k).float() * self.input_metadata.sm_scale + ) if self.input_metadata.causal: - mask = torch.tril(torch.ones(self.N_CTX_Q, - self.N_CTX_K, - device="cuda"), - diagonal=self.N_CTX_K - self.N_CTX_Q) + mask = torch.tril( + torch.ones(self.N_CTX_Q, self.N_CTX_K, device="cuda"), + diagonal=self.N_CTX_K - self.N_CTX_Q, + ) scores[:, :, mask == 0] = float("-inf") if self.input_metadata.bias is not None: scores += self.input_metadata.bias if self.use_alibi: - scores += compute_alibi_tensor(self.input_metadata.alibi_slopes, - self.N_CTX_Q, self.N_CTX_K) + scores += compute_alibi_tensor( + self.input_metadata.alibi_slopes, self.N_CTX_Q, self.N_CTX_K + ) p = torch.softmax(scores, dim=-1) if self.input_metadata.causal: @@ -54,31 +59,38 @@ def fwd(self, q, k, v): # should be out of the softmax. 
nan_mask = torch.isnan(p) p[nan_mask == 1] = 0 - ref_out = torch.einsum('bhqk,bhkd->bhqd', p.to(self.dtype), v) + ref_out = torch.einsum("bhqk,bhkd->bhqd", p.to(self.dtype), v) # compare - if self.input_metadata.layout == 'bshd': + if self.input_metadata.layout == "bshd": ref_out = ref_out.transpose(1, 2).clone() return ref_out def fwd_fp8(self, q_quantized, k_quantized, v_quantized): q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to( - self.dtype) + self.dtype + ) k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to( - self.dtype) + self.dtype + ) v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to( - self.dtype) + self.dtype + ) result = self.fwd(q, k, v) if self.input_metadata.o_scale is not None: result, _ = scale_fp8(result, self.input_metadata.o_scale) return result def fwd_fp8_kv(self, q, k_quantized, v_quantized): - k_descale, v_descale = (self.input_metadata.k_descale, - self.input_metadata.v_descale) - k_dequantized = (k_quantized.to(torch.float32) * - k_descale.to(torch.float32)).to(self.dtype) - v_dequantized = (v_quantized.to(torch.float32) * - v_descale.to(torch.float32)).to(self.dtype) + k_descale, v_descale = ( + self.input_metadata.k_descale, + self.input_metadata.v_descale, + ) + k_dequantized = ( + k_quantized.to(torch.float32) * k_descale.to(torch.float32) + ).to(self.dtype) + v_dequantized = ( + v_quantized.to(torch.float32) * v_descale.to(torch.float32) + ).to(self.dtype) return self.fwd(q, k_dequantized, v_dequantized) def varlen_fwd(self, q, k, v, is_mqa=False): @@ -86,29 +98,33 @@ def varlen_fwd(self, q, k, v, is_mqa=False): if is_mqa: # Make KV look like HQ/HK "groups" of HK. Later, we will reshape so # the size aligns with Q. - k_ref = k.view(k.shape[0], k.shape[1], 1, - k.shape[2]).expand(-1, -1, self.HQ // self.HK, -1) - v_ref = v.view(v.shape[0], v.shape[1], 1, - v.shape[2]).expand(-1, -1, self.HQ // self.HK, -1) + k_ref = k.view(k.shape[0], k.shape[1], 1, k.shape[2]).expand( + -1, -1, self.HQ // self.HK, -1 + ) + v_ref = v.view(v.shape[0], v.shape[1], 1, v.shape[2]).expand( + -1, -1, self.HQ // self.HK, -1 + ) else: k_ref = k v_ref = v for i in range(0, self.input_metadata.num_contexts): - start_q, start_k = self.input_metadata.cu_seqlens_q[ - i], self.input_metadata.cu_seqlens_k[i] - end_q, end_k = self.input_metadata.cu_seqlens_q[ - i + 1], self.input_metadata.cu_seqlens_k[i + 1] + start_q, start_k = ( + self.input_metadata.cu_seqlens_q[i], + self.input_metadata.cu_seqlens_k[i], + ) + end_q, end_k = ( + self.input_metadata.cu_seqlens_q[i + 1], + self.input_metadata.cu_seqlens_k[i + 1], + ) k_curr = k_ref[start_k:end_k] v_curr = v_ref[start_k:end_k] if is_mqa: k_curr = k_curr.reshape(k_curr.shape[0], -1, k_curr.shape[3]) v_curr = v_curr.reshape(v_curr.shape[0], -1, v_curr.shape[3]) - scores = torch.einsum('qhd,khd->qhk', q[start_q:end_q], - k_curr).float() - p = torch.softmax(scores * self.input_metadata.sm_scale, - dim=-1).half() - ref_out[start_q:end_q] = torch.einsum('qhk,khd->qhd', p, v_curr) + scores = torch.einsum("qhd,khd->qhk", q[start_q:end_q], k_curr).float() + p = torch.softmax(scores * self.input_metadata.sm_scale, dim=-1).half() + ref_out[start_q:end_q] = torch.einsum("qhk,khd->qhd", p, v_curr) return ref_out @@ -123,8 +139,7 @@ def quantize_input(q, k, v, fp8_kv=False, use_o_scale=False): # model. 
p_scale = None - o_scale = torch.rand(1, device="cuda", - requires_grad=False) if use_o_scale else None + o_scale = torch.rand(1, device="cuda", requires_grad=False) if use_o_scale else None return q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale @@ -150,10 +165,10 @@ def input_helper( current_platform.seed_everything(0) # Initialize q, k, v - if layout == 'bhsd': + if layout == "bhsd": q_tensor_shape = (Z, HQ, N_CTX_Q, D_HEAD) k_tensor_shape = (Z, HK, N_CTX_K, D_HEAD) - elif layout == 'bshd': + elif layout == "bshd": q_tensor_shape = (Z, N_CTX_Q, HQ, D_HEAD) k_tensor_shape = (Z, N_CTX_K, HK, D_HEAD) @@ -161,69 +176,54 @@ def input_helper( # for n heads the set of slopes is the geometric sequence that starts # 2^(-8/n) alibi_slopes = torch.tensor( - [2**(-8 / HQ * i) for i in range(1, HQ + 1)], + [2 ** (-8 / HQ * i) for i in range(1, HQ + 1)], dtype=torch.float32, - device="cuda").repeat(Z, 1) + device="cuda", + ).repeat(Z, 1) else: alibi_slopes = None if use_bias: - bias = torch.randn((1, HQ, N_CTX_Q, N_CTX_K), - dtype=dtype, - device="cuda", - requires_grad=False) + bias = torch.randn( + (1, HQ, N_CTX_Q, N_CTX_K), dtype=dtype, device="cuda", requires_grad=False + ) else: bias = None - q = torch.randn(q_tensor_shape, - dtype=dtype, - device="cuda", - requires_grad=False) - k = torch.randn(k_tensor_shape, - dtype=dtype, - device="cuda", - requires_grad=False) - v = torch.randn(k_tensor_shape, - dtype=dtype, - device="cuda", - requires_grad=False) + q = torch.randn(q_tensor_shape, dtype=dtype, device="cuda", requires_grad=False) + k = torch.randn(k_tensor_shape, dtype=dtype, device="cuda", requires_grad=False) + v = torch.randn(k_tensor_shape, dtype=dtype, device="cuda", requires_grad=False) if is_fp8: - (q, k, v, q_descale, k_descale, v_descale, p_scale, - o_scale) = quantize_input(q, - k, - v, - use_o_scale=use_o_scale, - fp8_kv=fp8_kv) + (q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale) = quantize_input( + q, k, v, use_o_scale=use_o_scale, fp8_kv=fp8_kv + ) else: q_descale = k_descale = v_descale = p_scale = o_scale = None - input_metadata = MetaData(sm_scale=D_HEAD**-0.5, - max_seqlens_q=N_CTX_Q, - max_seqlens_k=N_CTX_K, - layout=layout, - alibi_slopes=alibi_slopes, - alibi_batch=Z, - alibi_nheads=HQ, - q_descale=q_descale, - k_descale=k_descale, - v_descale=v_descale, - p_scale=p_scale, - o_scale=o_scale, - bias=bias, - seqlen_q=N_CTX_Q, - seqlen_k=N_CTX_K) + input_metadata = MetaData( + sm_scale=D_HEAD**-0.5, + max_seqlens_q=N_CTX_Q, + max_seqlens_k=N_CTX_K, + layout=layout, + alibi_slopes=alibi_slopes, + alibi_batch=Z, + alibi_nheads=HQ, + q_descale=q_descale, + k_descale=k_descale, + v_descale=v_descale, + p_scale=p_scale, + o_scale=o_scale, + bias=bias, + seqlen_q=N_CTX_Q, + seqlen_k=N_CTX_K, + ) return q, k, v, input_metadata -def varlen_input_helper(Z, - HQ, - HK, - N_CTX_Q, - N_CTX_K, - D_HEAD, - dtype, - equal_seqlens=False): +def varlen_input_helper( + Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, dtype, equal_seqlens=False +): current_platform.seed_everything(0) # Random sequence lengths. 
Using N_CTX as kind of max of sum of individual @@ -231,66 +231,72 @@ def varlen_input_helper(Z, if not equal_seqlens: max_seqlens_q = N_CTX_Q // Z max_seqlens_k = N_CTX_K // Z - seqlens_q = torch.randint(1, - max_seqlens_q + 1, (Z, ), - dtype=torch.int32) - seqlens_k = torch.randint(1, - max_seqlens_k + 1, (Z, ), - dtype=torch.int32) + seqlens_q = torch.randint(1, max_seqlens_q + 1, (Z,), dtype=torch.int32) + seqlens_k = torch.randint(1, max_seqlens_k + 1, (Z,), dtype=torch.int32) else: - seqlens_q = torch.full((Z, ), N_CTX_Q // Z) - seqlens_k = torch.full((Z, ), N_CTX_K // Z) + seqlens_q = torch.full((Z,), N_CTX_Q // Z) + seqlens_k = torch.full((Z,), N_CTX_K // Z) # Calculate cumulative sequence lengths - cu_seqlens_q = torch.cat([ - torch.tensor([0], dtype=torch.int32), - seqlens_q.cumsum(dim=0, dtype=torch.int32) - ]) - cu_seqlens_k = torch.cat([ - torch.tensor([0], dtype=torch.int32), - seqlens_k.cumsum(dim=0, dtype=torch.int32) - ]) + cu_seqlens_q = torch.cat( + [ + torch.tensor([0], dtype=torch.int32), + seqlens_q.cumsum(dim=0, dtype=torch.int32), + ] + ) + cu_seqlens_k = torch.cat( + [ + torch.tensor([0], dtype=torch.int32), + seqlens_k.cumsum(dim=0, dtype=torch.int32), + ] + ) cu_seqlens_q = cu_seqlens_q.to(device="cuda") cu_seqlens_k = cu_seqlens_k.to(device="cuda") # Initialize q, k, v with variable lengths total_q = cu_seqlens_q[-1].item() total_k = cu_seqlens_k[-1].item() - q = torch.randn((total_q, HQ, D_HEAD), dtype=dtype, - device="cuda").normal_(mean=0., std=0.5).requires_grad_() - k = torch.randn((total_k, HK, D_HEAD), dtype=dtype, - device="cuda").normal_(mean=0., std=0.5).requires_grad_() - v = torch.randn((total_k, HK, D_HEAD), dtype=dtype, - device="cuda").normal_(mean=0., std=0.5).requires_grad_() + q = ( + torch.randn((total_q, HQ, D_HEAD), dtype=dtype, device="cuda") + .normal_(mean=0.0, std=0.5) + .requires_grad_() + ) + k = ( + torch.randn((total_k, HK, D_HEAD), dtype=dtype, device="cuda") + .normal_(mean=0.0, std=0.5) + .requires_grad_() + ) + v = ( + torch.randn((total_k, HK, D_HEAD), dtype=dtype, device="cuda") + .normal_(mean=0.0, std=0.5) + .requires_grad_() + ) sm_scale = D_HEAD**-0.5 input_metadata = MetaData(sm_scale=sm_scale) input_metadata.set_varlen_params(cu_seqlens_q, cu_seqlens_k) return q, k, v, input_metadata -@pytest.mark.parametrize('Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD', [ - (1, 48, 12, 1, 1, 64), - (4, 4, 4, 128, 128, 65), - (16, 48, 48, 1, 1, 128), - (64, 48, 24, 3, 3, 128), - (4, 4, 4, 113, 123, 1), -]) -@pytest.mark.parametrize('causal', [True, False]) -@pytest.mark.parametrize('use_alibi', [True, False]) -@pytest.mark.parametrize('layout', ['bshd']) -def test_op_fwd(Z, - HQ, - HK, - N_CTX_Q, - N_CTX_K, - D_HEAD, - causal, - use_alibi, - layout, - dtype=torch.float16): +@pytest.mark.parametrize( + "Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD", + [ + (1, 48, 12, 1, 1, 64), + (4, 4, 4, 128, 128, 65), + (16, 48, 48, 1, 1, 128), + (64, 48, 24, 3, 3, 128), + (4, 4, 4, 113, 123, 1), + ], +) +@pytest.mark.parametrize("causal", [True, False]) +@pytest.mark.parametrize("use_alibi", [True, False]) +@pytest.mark.parametrize("layout", ["bshd"]) +def test_op_fwd( + Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_alibi, layout, dtype=torch.float16 +): current_platform.seed_everything(0) - q, k, v, input_metadata = input_helper(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, - dtype, layout, use_alibi, causal) + q, k, v, input_metadata = input_helper( + Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, dtype, layout, use_alibi, causal + ) o = torch.empty_like(q) @@ -299,48 +305,50 @@ 
def test_op_fwd(Z, # Transpose here if layout is bshd so we have same reference code for all # layouts - if layout == 'bshd': + if layout == "bshd": q = q.transpose(1, 2).clone() k = k.transpose(1, 2).clone() v = v.transpose(1, 2).clone() # Replicate K and V if using MQA/GQA if HQ != HK: - k = k.view(k.shape[0], k.shape[1], -1, k.shape[2], - k.shape[3]).expand(-1, -1, HQ // HK, -1, - -1).reshape(k.shape[0], -1, k.shape[2], - k.shape[3]) - v = v.view(v.shape[0], v.shape[1], -1, v.shape[2], - v.shape[3]).expand(-1, -1, HQ // HK, -1, - -1).reshape(v.shape[0], -1, v.shape[2], - v.shape[3]) - - ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, - use_alibi, dtype, input_metadata) + k = ( + k.view(k.shape[0], k.shape[1], -1, k.shape[2], k.shape[3]) + .expand(-1, -1, HQ // HK, -1, -1) + .reshape(k.shape[0], -1, k.shape[2], k.shape[3]) + ) + v = ( + v.view(v.shape[0], v.shape[1], -1, v.shape[2], v.shape[3]) + .expand(-1, -1, HQ // HK, -1, -1) + .reshape(v.shape[0], -1, v.shape[2], v.shape[3]) + ) + + ref_impl = ReferenceAttention( + Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype, input_metadata + ) ref_out = ref_impl.fwd(q, k, v) torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2) -@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [ - (4, 48, 1, 1, 64), - (4, 48, 1, 1, 128), - (4, 48, 3, 3, 128), - (4, 4, 128, 128, 65), -]) -@pytest.mark.parametrize('causal', [True, False]) -@pytest.mark.parametrize('layout', ['bhsd']) -@pytest.mark.parametrize('use_o_scale', [True, False]) -@pytest.mark.skipif(torch.cuda.get_device_capability() < (9, 0), - reason="Triton FP8 requires CUDA 9.0 or higher") -def test_op_fwd_fp8(Z, - H, - N_CTX_Q, - N_CTX_K, - D_HEAD, - causal, - layout, - use_o_scale, - dtype=torch.float32): +@pytest.mark.parametrize( + "Z, H, N_CTX_Q, N_CTX_K, D_HEAD", + [ + (4, 48, 1, 1, 64), + (4, 48, 1, 1, 128), + (4, 48, 3, 3, 128), + (4, 4, 128, 128, 65), + ], +) +@pytest.mark.parametrize("causal", [True, False]) +@pytest.mark.parametrize("layout", ["bhsd"]) +@pytest.mark.parametrize("use_o_scale", [True, False]) +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (9, 0), + reason="Triton FP8 requires CUDA 9.0 or higher", +) +def test_op_fwd_fp8( + Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, layout, use_o_scale, dtype=torch.float32 +): current_platform.seed_everything(0) # Disable grad to save memory it won't run into OOM on CI machine. 
@@ -358,95 +366,103 @@ def test_op_fwd_fp8(Z, causal=causal, layout=layout, is_fp8=True, - use_o_scale=use_o_scale) + use_o_scale=use_o_scale, + ) o = torch.empty_like(q_quantized) if use_o_scale else None - tri_out, _ = triton_attention_rocm(q_quantized, k_quantized, v_quantized, - o, input_metadata) + tri_out, _ = triton_attention_rocm( + q_quantized, k_quantized, v_quantized, o, input_metadata + ) - ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False, - dtype, input_metadata) + ref_impl = ReferenceAttention( + Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False, dtype, input_metadata + ) ref_out = ref_impl.fwd_fp8(q_quantized, k_quantized, v_quantized) # compare - torch.testing.assert_close(ref_out.to(torch.float32), - tri_out.to(torch.float32), - atol=7e-2, - rtol=2e-1) - - -@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [ - (4, 48, 1, 1, 64), - (4, 48, 1, 1, 128), - (4, 48, 3, 3, 128), - (4, 4, 128, 128, 65), - (4, 4, 113, 123, 1), -]) -@pytest.mark.parametrize('causal', [True, False]) -@pytest.mark.parametrize('layout', ['bhsd']) -def test_op_fwd_fp8_kv(Z, - H, - N_CTX_Q, - N_CTX_K, - D_HEAD, - causal, - layout, - dtype=torch.float32): + torch.testing.assert_close( + ref_out.to(torch.float32), tri_out.to(torch.float32), atol=7e-2, rtol=2e-1 + ) + + +@pytest.mark.parametrize( + "Z, H, N_CTX_Q, N_CTX_K, D_HEAD", + [ + (4, 48, 1, 1, 64), + (4, 48, 1, 1, 128), + (4, 48, 3, 3, 128), + (4, 4, 128, 128, 65), + (4, 4, 113, 123, 1), + ], +) +@pytest.mark.parametrize("causal", [True, False]) +@pytest.mark.parametrize("layout", ["bhsd"]) +def test_op_fwd_fp8_kv( + Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, layout, dtype=torch.float32 +): current_platform.seed_everything(0) - q, k_quantized, v_quantized, input_metadata = input_helper(Z, - H, - H, - N_CTX_Q, - N_CTX_K, - D_HEAD, - dtype, - causal=causal, - layout=layout, - is_fp8=True, - fp8_kv=True) + q, k_quantized, v_quantized, input_metadata = input_helper( + Z, + H, + H, + N_CTX_Q, + N_CTX_K, + D_HEAD, + dtype, + causal=causal, + layout=layout, + is_fp8=True, + fp8_kv=True, + ) o = torch.empty_like(q) - tri_out, _ = triton_attention_rocm(q, k_quantized, v_quantized, o, - input_metadata) + tri_out, _ = triton_attention_rocm(q, k_quantized, v_quantized, o, input_metadata) - ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False, - dtype, input_metadata) + ref_impl = ReferenceAttention( + Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False, dtype, input_metadata + ) ref_out = ref_impl.fwd_fp8_kv(q, k_quantized, v_quantized) torch.testing.assert_close(ref_out, tri_out, atol=3e-2, rtol=8e-1) -@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [ - (4, 48, 1, 1, 64), - (4, 48, 1, 1, 128), - (4, 48, 3, 3, 128), - (4, 4, 128, 128, 65), -]) -@pytest.mark.parametrize('causal', [True, False]) -@pytest.mark.parametrize('use_bias', [True]) -@pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize( + "Z, H, N_CTX_Q, N_CTX_K, D_HEAD", + [ + (4, 48, 1, 1, 64), + (4, 48, 1, 1, 128), + (4, 48, 3, 3, 128), + (4, 4, 128, 128, 65), + ], +) +@pytest.mark.parametrize("causal", [True, False]) +@pytest.mark.parametrize("use_bias", [True]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype): current_platform.seed_everything(0) - q, k, v, input_metadata = input_helper(Z, - H, - H, - N_CTX_Q, - N_CTX_K, - D_HEAD, - dtype, - layout='bhsd', - causal=causal, - use_bias=use_bias) + q, k, v, input_metadata = input_helper( + Z, + H, + H, + N_CTX_Q, 
+ N_CTX_K, + D_HEAD, + dtype, + layout="bhsd", + causal=causal, + use_bias=use_bias, + ) o = torch.empty_like(q) # triton implementation tri_out, _ = triton_attention_rocm(q, k, v, o, input_metadata) - ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False, - dtype, input_metadata) + ref_impl = ReferenceAttention( + Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False, dtype, input_metadata + ) ref_out = ref_impl.fwd(q, k, v) # compare @@ -454,47 +470,47 @@ def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype): # NOTE: Uses thd layout, so also tests thd. -@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(1, 48, 256, 64), - (4, 48, 512, 64), - (16, 48, 512, 64), - (64, 48, 128, 128)]) -@pytest.mark.parametrize('causal', [True, False]) +@pytest.mark.parametrize( + "Z, H, N_CTX, D_HEAD", + [(1, 48, 256, 64), (4, 48, 512, 64), (16, 48, 512, 64), (64, 48, 128, 128)], +) +@pytest.mark.parametrize("causal", [True, False]) def test_op_varlen_fwd(Z, H, N_CTX, D_HEAD, causal, dtype=torch.float16): - - q, k, v, input_metadata = varlen_input_helper(Z, H, H, N_CTX, N_CTX, - D_HEAD, dtype) + q, k, v, input_metadata = varlen_input_helper(Z, H, H, N_CTX, N_CTX, D_HEAD, dtype) tri_out = torch.empty_like(q) triton_attention_rocm(q, k, v, tri_out, input_metadata) - ref_impl = ReferenceAttention(Z, H, H, N_CTX, N_CTX, D_HEAD, False, dtype, - input_metadata) + ref_impl = ReferenceAttention( + Z, H, H, N_CTX, N_CTX, D_HEAD, False, dtype, input_metadata + ) ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=False) torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2) # NOTE: Uses thd layout, so also tests thd. -@pytest.mark.parametrize('Z, HQ, HK, N_CTX, D_HEAD', [(2, 48, 24, 128, 64), - (4, 48, 12, 256, 64), - (4, 48, 4, 512, 64), - (4, 64, 16, 128, 128)]) -@pytest.mark.parametrize('causal', [False]) -def test_op_varlen_mqa_fwd(Z, - HQ, - HK, - N_CTX, - D_HEAD, - causal, - dtype=torch.float16): - q, k, v, input_metadata = varlen_input_helper(Z, HQ, HK, N_CTX, N_CTX, - D_HEAD, dtype) +@pytest.mark.parametrize( + "Z, HQ, HK, N_CTX, D_HEAD", + [ + (2, 48, 24, 128, 64), + (4, 48, 12, 256, 64), + (4, 48, 4, 512, 64), + (4, 64, 16, 128, 128), + ], +) +@pytest.mark.parametrize("causal", [False]) +def test_op_varlen_mqa_fwd(Z, HQ, HK, N_CTX, D_HEAD, causal, dtype=torch.float16): + q, k, v, input_metadata = varlen_input_helper( + Z, HQ, HK, N_CTX, N_CTX, D_HEAD, dtype + ) tri_out = torch.empty_like(q) triton_attention_rocm(q, k, v, tri_out, input_metadata) - ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX, N_CTX, D_HEAD, False, - dtype, input_metadata) + ref_impl = ReferenceAttention( + Z, HQ, HK, N_CTX, N_CTX, D_HEAD, False, dtype, input_metadata + ) ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=True) torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 2e8febbdcf26..1c9636d5c6a2 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -16,11 +16,14 @@ from tests.kernels.quant_utils import native_w8a8_block_matmul from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe.utils import ( - moe_kernel_quantize_input) +from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.platforms.interface import _Backend -from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, - STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) +from vllm.utils 
import ( + STR_BACKEND_ENV_VAR, + STR_FLASH_ATTN_VAL, + STR_XFORMERS_ATTN_VAL, + make_tensor_with_pad, +) # For now, disable "test_aot_dispatch_dynamic" since there are some # bugs related to this test in PyTorch 2.4. @@ -39,7 +42,7 @@ class QKVInputs(NamedTuple): - ''' + """ Data structure for representing unpacked attention inputs, query/key/values and their sequence lengths. @@ -49,7 +52,7 @@ class QKVInputs(NamedTuple): num_heads x head_size) attention inputs * q_seq_lens: query sequence lengths list * kv_seq_lens: shared key/value sequence lengths list - ''' + """ query: torch.Tensor key: torch.Tensor @@ -59,7 +62,7 @@ class QKVInputs(NamedTuple): class QKVO(NamedTuple): - ''' + """ Data structure for representing unpacked attention inputs, alongside unpacked known-correct attention output @@ -69,14 +72,14 @@ class QKVO(NamedTuple): num_heads x head_size) attention inputs * ideal_output: unpacked (batch_size x padded_seq_len x num_heads x head_size) known-correct attention output - ''' + """ qkv: QKVInputs ideal_output: torch.Tensor class PackedQKVInputs(NamedTuple): - ''' + """ Data structure for representing packed attention inputs Attributes: @@ -88,7 +91,7 @@ class PackedQKVInputs(NamedTuple): packed tensor * q_seq_lens: query sequence lengths list * kv_seq_lens: shared key/value sequence lengths list - ''' + """ query: torch.Tensor key: torch.Tensor @@ -100,7 +103,7 @@ class PackedQKVInputs(NamedTuple): class PackedQKVO(NamedTuple): - ''' + """ Data structure for representing packed attention inputs, alongside packed known-correct attention output @@ -110,28 +113,28 @@ class PackedQKVO(NamedTuple): x head_size) attention inputs * ideal_output: packed (number_of_tokens x num_heads x head_size) known-correct attention output - ''' + """ packed_qkv: Optional[PackedQKVInputs] ideal_output: torch.Tensor class KVMemoryMap(NamedTuple): - ''' + """ Data structure for encapsulating KV cache memory mapping. 
Attributes: * block_tables: KV cache block tables * slot_mapping: mapping of sequence offset to physical address - ''' + """ block_tables: torch.Tensor slot_mapping: torch.Tensor class PhaseTestParameters(NamedTuple): - ''' + """ Data structure for encapsulating the test parameters for a given test "phase" (prefill or decode phase) and attention scenario (encoder, decoder-self, encoder/decoder-cross) @@ -143,7 +146,7 @@ class PhaseTestParameters(NamedTuple): output * kv_mmap: KV cache memory mapping, specific to this test phase & attention scenario - ''' + """ packed_qkvo: PackedQKVO kv_mmap: Optional[KVMemoryMap] @@ -153,41 +156,43 @@ def maybe_make_int_tensor( _list: Optional[list[int]], device: Union[torch.device, str], ) -> torch.Tensor: - ''' + """ Convert Python int list to a 1D int torch.Tensor on `device` Returns: * If _list is not None: 1D int torch.Tensor on `device` * None otherwise - ''' - return None if _list is None else torch.tensor( - _list, dtype=torch.int, device=device) + """ + return ( + None if _list is None else torch.tensor(_list, dtype=torch.int, device=device) + ) def maybe_make_long_tensor( _list: Optional[list[int]], device: Union[torch.device, str], ) -> torch.Tensor: - ''' + """ Convert Python int list to a 1D long torch.Tensor on `device` Returns: * If _list is not None: 1D long torch.Tensor on `device` * None otherwise - ''' - return None if _list is None else torch.tensor( - _list, dtype=torch.long, device=device) + """ + return ( + None if _list is None else torch.tensor(_list, dtype=torch.long, device=device) + ) def maybe_max(_list: Optional[list]) -> Optional[Number]: - ''' + """ Returns: * If _list is not None: max(_list) * None otherwise - ''' + """ return None if _list is None else max(_list) @@ -195,7 +200,7 @@ def make_causal_mask( q_max_seq_len: int, kv_max_seq_len: int, ) -> torch.Tensor: - ''' + """ Create a q_max_seq_len x kv_max_seq_len causal mask Arguments: @@ -206,19 +211,19 @@ def make_causal_mask( Returns: * 2D tensor, q_max_seq_len x kv_max_seq_len - ''' + """ # Create a matrix where entry (i, j) is True if i >= j mask = torch.triu(torch.ones(q_max_seq_len, kv_max_seq_len), diagonal=1) # Replace True with float('-inf') and False with 0 - mask = mask.masked_fill(mask == 1, - float('-inf')).masked_fill(mask == 0, 0.0) + mask = mask.masked_fill(mask == 1, float("-inf")).masked_fill(mask == 0, 0.0) return mask -def override_backend_env_variable(mpatch: pytest.MonkeyPatch, - backend_name: str) -> None: - ''' +def override_backend_env_variable( + mpatch: pytest.MonkeyPatch, backend_name: str +) -> None: + """ Override the environment variable indicating the vLLM backend temporarily, using pytest monkeypatch to ensure that the env vars get reset once the test context exits. @@ -227,18 +232,20 @@ def override_backend_env_variable(mpatch: pytest.MonkeyPatch, * mpatch: pytest monkeypatch instance * backend_name: attention backend name to force - ''' + """ mpatch.setenv(STR_BACKEND_ENV_VAR, backend_name) -def ref_masked_attention(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, - custom_mask: Optional[torch.Tensor] = None, - q_seq_lens: Optional[list] = None, - kv_seq_lens: Optional[list] = None) -> torch.Tensor: - ''' +def ref_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, + custom_mask: Optional[torch.Tensor] = None, + q_seq_lens: Optional[list] = None, + kv_seq_lens: Optional[list] = None, +) -> torch.Tensor: + """ "Golden" masked attention reference. 
Supports two types of masking: * Basic attention mask, utilizing {q,kv}_seq_lens args to mask out @@ -260,14 +267,14 @@ def ref_masked_attention(query: torch.Tensor, Returns: * Attention result, batch_size x q_padded_seq_len x num_heads x head_size - ''' + """ assert q_seq_lens is not None assert kv_seq_lens is not None batch_size = query.shape[0] - assert (len(q_seq_lens) == batch_size) - assert (len(kv_seq_lens) == batch_size) + assert len(q_seq_lens) == batch_size + assert len(kv_seq_lens) == batch_size attn_weights = scale * torch.einsum("bqhd,bkhd->bhqk", query, key).float() @@ -303,7 +310,7 @@ def make_qkv( attn_type: AttentionType = AttentionType.ENCODER_DECODER, force_max_len: bool = False, ) -> tuple[QKVInputs, QKVInputs, QKVInputs]: - ''' + """ Construct QKV test tensors for self- and cross-attention. Generates three query/key/value triplets: @@ -340,14 +347,12 @@ def make_qkv( * Overall QKVInputs structure (containing full unpacked Q/K/V tensors) * Prefill QKVInputs structure (containing all but the last sequence offset) * Decode QKVInputs structure (containing all only the last sequence offset) - ''' + """ if force_max_len: q_seq_lens = [max_q_seq_len for _ in range(batch_size)] else: - q_seq_lens = [ - random.randint(2, max_q_seq_len) for _ in range(batch_size) - ] + q_seq_lens = [random.randint(2, max_q_seq_len) for _ in range(batch_size)] kv_seq_lens = None if force_kv_seq_lens is not None: kv_seq_lens = force_kv_seq_lens @@ -360,50 +365,44 @@ def make_qkv( if force_max_len: kv_seq_lens = [max_kv_seq_len] * batch_size else: - kv_seq_lens = [ - random.randint(2, max_kv_seq_len) for _ in range(batch_size) - ] - - query = torch.rand( - (batch_size, max_q_seq_len, num_heads, head_size)).to(device) - key = torch.rand( - (batch_size, max_kv_seq_len, num_heads, head_size)).to(device) - value = torch.rand( - (batch_size, max_kv_seq_len, num_heads, head_size)).to(device) - - prefill_query = torch.zeros( - (batch_size, max_q_seq_len, num_heads, head_size)).to(device) - prefill_key = torch.zeros( - (batch_size, max_kv_seq_len, num_heads, head_size)).to(device) - prefill_value = torch.zeros( - (batch_size, max_kv_seq_len, num_heads, head_size)).to(device) - - decode_query = torch.zeros( - (batch_size, 1, num_heads, head_size)).to(device) + kv_seq_lens = [random.randint(2, max_kv_seq_len) for _ in range(batch_size)] + + query = torch.rand((batch_size, max_q_seq_len, num_heads, head_size)).to(device) + key = torch.rand((batch_size, max_kv_seq_len, num_heads, head_size)).to(device) + value = torch.rand((batch_size, max_kv_seq_len, num_heads, head_size)).to(device) + + prefill_query = torch.zeros((batch_size, max_q_seq_len, num_heads, head_size)).to( + device + ) + prefill_key = torch.zeros((batch_size, max_kv_seq_len, num_heads, head_size)).to( + device + ) + prefill_value = torch.zeros((batch_size, max_kv_seq_len, num_heads, head_size)).to( + device + ) + + decode_query = torch.zeros((batch_size, 1, num_heads, head_size)).to(device) decode_key = torch.zeros((batch_size, 1, num_heads, head_size)).to(device) - decode_value = torch.zeros( - (batch_size, 1, num_heads, head_size)).to(device) + decode_value = torch.zeros((batch_size, 1, num_heads, head_size)).to(device) - for bdx, (q_seq_len, kv_seq_len) in enumerate(zip(q_seq_lens, - kv_seq_lens)): + for bdx, (q_seq_len, kv_seq_len) in enumerate(zip(q_seq_lens, kv_seq_lens)): query[bdx, q_seq_len:, :, :] = 0 key[bdx, kv_seq_len:, :, :] = 0 value[bdx, kv_seq_len:, :, :] = 0 - prefill_query[bdx, - 0:(q_seq_len - 1), :, :] = query[bdx, - 
0:(q_seq_len - 1), :, :] - prefill_key[bdx, - 0:(kv_seq_len - 1), :, :] = key[bdx, - 0:(kv_seq_len - 1), :, :] - prefill_value[bdx, 0:(kv_seq_len - - 1), :, :] = value[bdx, 0:(kv_seq_len - 1), :, :] - - decode_query[bdx, :, :, :] = query[bdx, - (q_seq_len - 1):q_seq_len, :, :] - decode_key[bdx, :, :, :] = key[bdx, (kv_seq_len - 1):kv_seq_len, :, :] - decode_value[bdx, :, :, :] = value[bdx, - (kv_seq_len - 1):kv_seq_len, :, :] + prefill_query[bdx, 0 : (q_seq_len - 1), :, :] = query[ + bdx, 0 : (q_seq_len - 1), :, : + ] + prefill_key[bdx, 0 : (kv_seq_len - 1), :, :] = key[ + bdx, 0 : (kv_seq_len - 1), :, : + ] + prefill_value[bdx, 0 : (kv_seq_len - 1), :, :] = value[ + bdx, 0 : (kv_seq_len - 1), :, : + ] + + decode_query[bdx, :, :, :] = query[bdx, (q_seq_len - 1) : q_seq_len, :, :] + decode_key[bdx, :, :, :] = key[bdx, (kv_seq_len - 1) : kv_seq_len, :, :] + decode_value[bdx, :, :, :] = value[bdx, (kv_seq_len - 1) : kv_seq_len, :, :] prefill_q_seq_lens = [plen - 1 for plen in q_seq_lens] prefill_kv_seq_lens = [plen - 1 for plen in kv_seq_lens] @@ -417,25 +416,29 @@ def make_qkv( key, value, q_seq_lens, - kv_seq_lens), + kv_seq_lens, + ), QKVInputs( prefill_query, # Prefill subset of QKV sequences prefill_key, prefill_value, prefill_q_seq_lens, - prefill_kv_seq_lens), + prefill_kv_seq_lens, + ), QKVInputs( decode_query, # Decode subset of KV sequences decode_key, decode_value, decode_q_seq_lens, - decode_kv_seq_lens)) + decode_kv_seq_lens, + ), + ) def pack_tensor( - unpacked_tensor: torch.Tensor, seq_lens: list[int], - device: Union[torch.device, str]) -> tuple[torch.Tensor, list[int]]: - ''' + unpacked_tensor: torch.Tensor, seq_lens: list[int], device: Union[torch.device, str] +) -> tuple[torch.Tensor, list[int]]: + """ Pack a batch_size x padded_seq_len x num_heads x head_size tensor into an unpadded number_of_tokens x num_heads x head_size tensor, where number_of_tokens = sum(seq_lens) @@ -451,7 +454,7 @@ def pack_tensor( * packed_tensor: number_of_tokens x num_heads x head_size * start_loc_list: start idx of each batch elt in packed_tensor; [0] + list(itertools.accumulate(seq_lens)) - ''' + """ num_tok = sum(seq_lens) num_heads = unpacked_tensor.shape[-2] @@ -460,16 +463,15 @@ def pack_tensor( packed_tensor = torch.zeros((num_tok, num_heads, head_size), device=device) for bdx, (seq_len, start_loc) in enumerate(zip(seq_lens, start_loc_list)): - - packed_tensor[start_loc:( - start_loc + seq_len), :, :] = unpacked_tensor[bdx, :seq_len, :, :] + packed_tensor[start_loc : (start_loc + seq_len), :, :] = unpacked_tensor[ + bdx, :seq_len, :, : + ] return packed_tensor, start_loc_list -def pack_qkv(qkv: QKVInputs, device: Union[torch.device, - str]) -> PackedQKVInputs: - ''' +def pack_qkv(qkv: QKVInputs, device: Union[torch.device, str]) -> PackedQKVInputs: + """ Individually pack each of Q, K and V, each with dimensions batch_size x padded_seq_len x num_heads x head_size, into respective number_of_tokens x num_heads x head_size tensors. 
@@ -488,28 +490,30 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device, * Packed (number_of_tokens x num_heads x head_size) QKV inputs derived from unpacked inputs - ''' + """ if qkv.query is None: packed_query = None q_start_loc_list = None else: - packed_query, q_start_loc_list = pack_tensor(qkv.query, - qkv.q_seq_lens, - device=device) - packed_key, kv_start_loc_list = pack_tensor(qkv.key, - qkv.kv_seq_lens, - device=device) + packed_query, q_start_loc_list = pack_tensor( + qkv.query, qkv.q_seq_lens, device=device + ) + packed_key, kv_start_loc_list = pack_tensor(qkv.key, qkv.kv_seq_lens, device=device) packed_value, _ = pack_tensor(qkv.value, qkv.kv_seq_lens, device=device) return PackedQKVInputs( - packed_query, packed_key, packed_value, q_start_loc_list, + packed_query, + packed_key, + packed_value, + q_start_loc_list, kv_start_loc_list, (None if q_start_loc_list is None else qkv.q_seq_lens), - qkv.kv_seq_lens) + qkv.kv_seq_lens, + ) def make_backend(backend_name: str) -> AttentionBackend: - ''' + """ Construct the backend instance determined by the backend_name string argument. @@ -527,17 +531,18 @@ def make_backend(backend_name: str) -> AttentionBackend: Returns: * Backend instance - ''' + """ if backend_name == STR_XFORMERS_ATTN_VAL: # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs. from vllm.attention.backends.xformers import XFormersBackend + return XFormersBackend() elif backend_name == STR_FLASH_ATTN_VAL: from vllm.attention.backends.flash_attn import FlashAttentionBackend + return FlashAttentionBackend() - raise AssertionError( - f"Unrecognized backend_name {backend_name} for unit test") + raise AssertionError(f"Unrecognized backend_name {backend_name} for unit test") def _make_metadata_tensors( @@ -545,9 +550,17 @@ def _make_metadata_tensors( context_lens: Optional[list[int]], encoder_seq_lens: Optional[list[int]], device: Union[torch.device, str], -) -> tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor], - torch.Tensor, torch.Tensor, Optional[int]]: - ''' +) -> tuple[ + torch.Tensor, + torch.Tensor, + Any, + Any, + Optional[torch.Tensor], + torch.Tensor, + torch.Tensor, + Optional[int], +]: + """ Build scalar & tensor values required to build attention metadata structure. 
Arguments: @@ -567,48 +580,61 @@ def _make_metadata_tensors( * encoder_seq_lens_tensor: encoder seq_lens list, as tensor * encoder_seq_start_loc: start idx of each encoder sequence * max_encoder_seq_len: encoder seq_lens list, as tensor - ''' + """ seq_lens_tensor = maybe_make_int_tensor(seq_lens, device) context_lens_tensor = maybe_make_int_tensor(context_lens, device) max_context_len = maybe_max(context_lens) max_seq_len = maybe_max(seq_lens) encoder_seq_lens_tensor = maybe_make_int_tensor(encoder_seq_lens, device) - max_encoder_seq_len = (None if encoder_seq_lens is None else - max(encoder_seq_lens)) + max_encoder_seq_len = None if encoder_seq_lens is None else max(encoder_seq_lens) seq_start_loc = None if seq_lens_tensor is not None: - seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=seq_lens_tensor.device) - torch.cumsum(seq_lens_tensor, - dim=0, - dtype=seq_start_loc.dtype, - out=seq_start_loc[1:]) - - encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=encoder_seq_lens_tensor.device) - torch.cumsum(encoder_seq_lens_tensor, - dim=0, - dtype=encoder_seq_start_loc.dtype, - out=encoder_seq_start_loc[1:]) - - return (seq_lens_tensor, context_lens_tensor, max_context_len, max_seq_len, - seq_start_loc, encoder_seq_lens_tensor, encoder_seq_start_loc, - max_encoder_seq_len) - - -def make_kv_cache(num_blocks: int, - num_heads: int, - head_size: int, - block_size: int, - device: Union[torch.device, str], - backend: str, - default_val: float = 0.0) -> torch.Tensor: - ''' + seq_start_loc = torch.zeros( + seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=seq_lens_tensor.device, + ) + torch.cumsum( + seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, out=seq_start_loc[1:] + ) + + encoder_seq_start_loc = torch.zeros( + encoder_seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=encoder_seq_lens_tensor.device, + ) + torch.cumsum( + encoder_seq_lens_tensor, + dim=0, + dtype=encoder_seq_start_loc.dtype, + out=encoder_seq_start_loc[1:], + ) + + return ( + seq_lens_tensor, + context_lens_tensor, + max_context_len, + max_seq_len, + seq_start_loc, + encoder_seq_lens_tensor, + encoder_seq_start_loc, + max_encoder_seq_len, + ) + + +def make_kv_cache( + num_blocks: int, + num_heads: int, + head_size: int, + block_size: int, + device: Union[torch.device, str], + backend: str, + default_val: float = 0.0, +) -> torch.Tensor: + """ Create a fake KV cache. Arguments: @@ -626,27 +652,29 @@ def make_kv_cache(num_blocks: int, * for backend 'XFORMERS' * kv_cache: 2 x num_blocks x block_size x num_heads x head_size * for backend 'FLASH_ATTN' - ''' - if backend == 'XFORMERS': - kv_cache = torch.rand( - (2, num_blocks, block_size * num_heads * head_size)).to(device) - elif backend == 'FLASH_ATTN': - kv_cache = torch.rand( - (2, num_blocks, block_size, num_heads, head_size)).to(device) + """ + if backend == "XFORMERS": + kv_cache = torch.rand((2, num_blocks, block_size * num_heads * head_size)).to( + device + ) + elif backend == "FLASH_ATTN": + kv_cache = torch.rand((2, num_blocks, block_size, num_heads, head_size)).to( + device + ) else: raise ValueError( - f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or " - f"'FLASH_ATTN'.") + f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'." 
+ ) if default_val is not None: kv_cache[:, :, :] = default_val return kv_cache def _num_tokens_to_min_blocks(num_tokens: int, block_size: int) -> int: - ''' + """ Compute the minimum number of blocks required to hold num_tokens tokens, given block_size - ''' + """ return (num_tokens + block_size) // block_size @@ -658,9 +686,12 @@ def make_empty_block_tables_tensor(device: Union[torch.device, str]): return torch.tensor([], device=device) -def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], - device: Union[torch.device, str]): - ''' +def split_slot_mapping( + slot_mapping_list: torch.Tensor, + seq_lens: list[int], + device: Union[torch.device, str], +): + """ Split a slot mapping into valid prefill- and decode-phase slot mappings. Context: @@ -698,28 +729,32 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], reflecting all N prefill prompts * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting all N decoded tokens - ''' + """ prefill_slot_mapping = [] decode_slot_mapping = [] base_idx = 0 for seq_len in seq_lens: - prefill_slot_mapping.extend(slot_mapping_list[base_idx:(base_idx + - seq_len - 1)]) + prefill_slot_mapping.extend( + slot_mapping_list[base_idx : (base_idx + seq_len - 1)] + ) decode_slot_mapping.append(slot_mapping_list[base_idx + seq_len - 1]) base_idx += seq_len - return (maybe_make_long_tensor(prefill_slot_mapping, device), - maybe_make_long_tensor(decode_slot_mapping, device)) + return ( + maybe_make_long_tensor(prefill_slot_mapping, device), + maybe_make_long_tensor(decode_slot_mapping, device), + ) def make_block_tables_slot_mapping( - block_size: int, - seq_lens: list[int], - device: Union[torch.device, str], - block_base_addr: int = 0) -> tuple[torch.Tensor, list[int], int]: - ''' + block_size: int, + seq_lens: list[int], + device: Union[torch.device, str], + block_base_addr: int = 0, +) -> tuple[torch.Tensor, list[int], int]: + """ Construct fake block tables & slot mappings. 
For a sequence with num_tokens tokens the minimum number @@ -756,12 +791,11 @@ def make_block_tables_slot_mapping( * block_tables_tensor: block table for sequence * slot_mapping_list: slot mapping for sequence * max_block_idx: the highest block address within this block table - ''' + """ # Provision minimum number of KV cache blocks num_blocks_list = [ - _num_tokens_to_min_blocks(num_tokens, block_size) - for num_tokens in seq_lens + _num_tokens_to_min_blocks(num_tokens, block_size) for num_tokens in seq_lens ] max_block_table_len = max(num_blocks_list) block_table_pad_tokens = 10 @@ -774,11 +808,11 @@ def make_block_tables_slot_mapping( max_block_idx = block_base_idx for sdx, num_tokens in enumerate(seq_lens): num_blocks = num_blocks_list[sdx] - block_table = list( - range(block_base_idx, block_base_idx - num_blocks, -1)) + block_table = list(range(block_base_idx, block_base_idx - num_blocks, -1)) for idx in range(num_tokens): - mapping_value = ( - idx % block_size) + block_table[idx // block_size] * block_size + mapping_value = (idx % block_size) + block_table[ + idx // block_size + ] * block_size slot_mapping_list.append(mapping_value) block_base_idx -= num_blocks @@ -802,9 +836,9 @@ def make_test_metadata( decoder_test_params: Optional[PhaseTestParameters], device: Union[torch.device, str], encoder_test_params: Optional[PhaseTestParameters] = None, - cross_test_params: Optional[PhaseTestParameters] = None + cross_test_params: Optional[PhaseTestParameters] = None, ) -> AttentionMetadata: - ''' + """ Construct fake attention metadata for a given test phase (prefill-phase or decode-phase). @@ -841,13 +875,12 @@ def make_test_metadata( Return: * AttentionMetadata structure - ''' + """ # Decoder self-attention memory mapping # decoder_test_params is None signals encoder-only # scenario, so kv_mmap is None - kv_mmap = (None - if decoder_test_params is None else decoder_test_params.kv_mmap) + kv_mmap = None if decoder_test_params is None else decoder_test_params.kv_mmap # This function constructs metadata assuming no chunked prefill, # i.e. 
100% prefill tokens or 100% decode tokens @@ -860,10 +893,11 @@ def make_test_metadata( # seq_lens is None signals encoder-only # scenario, in which case num_prefills_or_decodes and # num_prefill_or_decode_tokens are unused - num_prefills_or_decodes = (None if seq_lens is None else len(seq_lens)) + num_prefills_or_decodes = None if seq_lens is None else len(seq_lens) - num_prefill_or_decode_tokens = (None if seq_lens is None else ( - sum(seq_lens) if is_prompt else len(seq_lens))) + num_prefill_or_decode_tokens = ( + None if seq_lens is None else (sum(seq_lens) if is_prompt else len(seq_lens)) + ) # Seems for non-prefix-caching scenarios context_lens # is never needed @@ -877,8 +911,9 @@ def make_test_metadata( # * Extract encoder input sequence lengths assert encoder_test_params.packed_qkvo.packed_qkv is not None encoder_seq_lens = encoder_test_params.packed_qkvo.packed_qkv.q_seq_lens - num_encoder_tokens = (None if encoder_seq_lens is None else - (sum(encoder_seq_lens))) + num_encoder_tokens = ( + None if encoder_seq_lens is None else (sum(encoder_seq_lens)) + ) if cross_test_params is None: cross_kv_mmap = None @@ -906,10 +941,9 @@ def make_test_metadata( encoder_seq_lens_tensor, encoder_seq_start_loc, max_encoder_seq_len, - ) = _make_metadata_tensors(seq_lens, - context_lens, - encoder_seq_lens, - device=device) + ) = _make_metadata_tensors( + seq_lens, context_lens, encoder_seq_lens, device=device + ) return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), @@ -930,10 +964,13 @@ def make_test_metadata( encoder_seq_lens_tensor=encoder_seq_lens_tensor, encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, - cross_slot_mapping=(None if cross_kv_mmap is None else - cross_kv_mmap.slot_mapping), - cross_block_tables=(None if cross_kv_mmap is None else - cross_kv_mmap.block_tables)) + cross_slot_mapping=( + None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping + ), + cross_block_tables=( + None if cross_kv_mmap is None else cross_kv_mmap.block_tables + ), + ) else: # not is_prompt # Decode-phase scenario @@ -955,10 +992,9 @@ def make_test_metadata( encoder_seq_lens_tensor, encoder_seq_start_loc, max_encoder_seq_len, - ) = _make_metadata_tensors(seq_lens, - context_lens, - encoder_seq_lens, - device=device) + ) = _make_metadata_tensors( + seq_lens, context_lens, encoder_seq_lens, device=device + ) return attn_backend_obj.make_metadata( num_prefills=num_prefills, @@ -981,16 +1017,19 @@ def make_test_metadata( encoder_seq_lens_tensor=encoder_seq_lens_tensor, encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, - cross_slot_mapping=(None if cross_kv_mmap is None else - cross_kv_mmap.slot_mapping), - cross_block_tables=(None if cross_kv_mmap is None else - cross_kv_mmap.block_tables)) - - -def assert_actual_matches_ideal(test_params: PhaseTestParameters, - output_under_test: torch.Tensor, - backend: str) -> None: - ''' + cross_slot_mapping=( + None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping + ), + cross_block_tables=( + None if cross_kv_mmap is None else cross_kv_mmap.block_tables + ), + ) + + +def assert_actual_matches_ideal( + test_params: PhaseTestParameters, output_under_test: torch.Tensor, backend: str +) -> None: + """ Assert that observed output matches the ideal output contained in the test parameters data structure. 
@@ -998,24 +1037,24 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters, * test_params: Test parameters including packed ideal output * output_under_test: actually observed output value - ''' + """ ideal_output = test_params.packed_qkvo.ideal_output - if backend == 'XFORMERS': - torch.testing.assert_close(ideal_output, - output_under_test.view_as(ideal_output)) + if backend == "XFORMERS": + torch.testing.assert_close( + ideal_output, output_under_test.view_as(ideal_output) + ) - elif backend == 'FLASH_ATTN': + elif backend == "FLASH_ATTN": # For FlashAttention override the accuracy thresholds to non default # values since we notice a higher difference between the ideal and # actual output. - torch.testing.assert_close(ideal_output, - output_under_test.view_as(ideal_output), - atol=0.01, - rtol=0.016) + torch.testing.assert_close( + ideal_output, output_under_test.view_as(ideal_output), atol=0.01, rtol=0.016 + ) else: raise ValueError( - f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or " - f"'FLASH_ATTN'.") + f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'." + ) # Copied/modified from torch._refs.__init__.py @@ -1029,19 +1068,15 @@ def fp8_allclose( """ Reference implementation of torch.allclose """ - torch._refs._check_close_args(name="torch.allclose", - a=a, - b=b, - rtol=rtol, - atol=atol) + torch._refs._check_close_args(name="torch.allclose", a=a, b=b, rtol=rtol, atol=atol) return bool( torch.all( - torch.isclose(a.double(), - b.double(), - rtol=rtol, - atol=atol, - equal_nan=equal_nan)).item()) + torch.isclose( + a.double(), b.double(), rtol=rtol, atol=atol, equal_nan=equal_nan + ) + ).item() + ) # Marlin MoE test utils @@ -1054,7 +1089,8 @@ def stack_and_dev(tensors: list[torch.Tensor]): def compute_max_diff(output, output_ref): return torch.mean(torch.abs(output - output_ref)) / torch.mean( - torch.abs(output_ref)) + torch.abs(output_ref) + ) def torch_experts( @@ -1074,10 +1110,11 @@ def torch_experts( block_shape: Optional[list[int]] = None, apply_router_weights_on_input: bool = False, ) -> torch.Tensor: - assert (global_num_experts == -1 - or (global_num_experts == w1.shape[0] and expert_map is None) - or (expert_map is not None - and global_num_experts == expert_map.shape[0])) + assert ( + global_num_experts == -1 + or (global_num_experts == w1.shape[0] and expert_map is None) + or (expert_map is not None and global_num_experts == expert_map.shape[0]) + ) M, K = a.shape topk = topk_ids.shape[1] @@ -1092,8 +1129,9 @@ def torch_experts( if a1_scale: assert not per_act_token_quant and block_shape is None - a, a_scale = moe_kernel_quantize_input(a, a1_scale, quant_dtype, - per_act_token_quant, block_shape) + a, a_scale = moe_kernel_quantize_input( + a, a1_scale, quant_dtype, per_act_token_quant, block_shape + ) num_experts = w1.shape[0] @@ -1112,22 +1150,28 @@ def torch_experts( out[mask] = tmp2 @ w2[i].transpose(0, 1) elif block_shape is not None: # block quantized - assert (a_scale is not None and w1_scale is not None - and w2_scale is not None) - tmp1 = native_w8a8_block_matmul(a[mask], w1[i], a_scale[mask], - w1_scale[i], block_shape, - out.dtype) + assert ( + a_scale is not None + and w1_scale is not None + and w2_scale is not None + ) + tmp1 = native_w8a8_block_matmul( + a[mask], w1[i], a_scale[mask], w1_scale[i], block_shape, out.dtype + ) tmp2 = SiluAndMul()(tmp1) tmp2, b_scale = moe_kernel_quantize_input( - tmp2, a2_scale, quant_dtype, per_act_token_quant, - block_shape) + tmp2, a2_scale, quant_dtype, per_act_token_quant, 
block_shape + ) - out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale, - w2_scale[i], block_shape, - out.dtype) + out[mask] = native_w8a8_block_matmul( + tmp2, w2[i], b_scale, w2_scale[i], block_shape, out.dtype + ) else: - assert (a_scale is not None and w1_scale is not None - and w2_scale is not None) + assert ( + a_scale is not None + and w1_scale is not None + and w2_scale is not None + ) scales = a_scale if a_scale.numel() == 1 else a_scale[mask] tmp1 = a[mask].to(f32) * scales @@ -1137,8 +1181,8 @@ def torch_experts( tmp2 = SiluAndMul()(tmp1).to(out.dtype) tmp2, b_scale = moe_kernel_quantize_input( - tmp2, a2_scale, quant_dtype, per_act_token_quant, - block_shape) + tmp2, a2_scale, quant_dtype, per_act_token_quant, block_shape + ) assert b_scale is not None tmp2 = tmp2.to(f32) * b_scale @@ -1148,21 +1192,27 @@ def torch_experts( if apply_router_weights_on_input: return out else: - return (out.view(M, -1, w2.shape[1]).to(f32) * - topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype) + return ( + (out.view(M, -1, w2.shape[1]).to(f32) * topk_weight.view(M, -1, 1)) + .sum(dim=1) + .to(out.dtype) + ) -def torch_moe(a: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - score: torch.Tensor, - topk: int, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None) -> torch.Tensor: +def torch_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + score: torch.Tensor, + topk: int, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, +) -> torch.Tensor: score = torch.softmax(score, dim=-1, dtype=torch.float32) topk_weight, topk_ids = torch.topk(score, topk) - return torch_experts(a, w1, w2, topk_weight, topk_ids, global_num_experts, - expert_map) + return torch_experts( + a, w1, w2, topk_weight, topk_ids, global_num_experts, expert_map + ) def torch_moe_single(a, w, score, topk): @@ -1181,41 +1231,49 @@ def torch_moe_single(a, w, score, topk): # A special version of op check that has a restricted default set of test_utils # and a patched version of allclose that supports fp8 types. 
-def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket, - torch._library.custom_ops.CustomOpDef], - args: tuple[Any, ...], - kwargs: Optional[dict[str, Any]] = None, - *, - test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS, - raise_exception: bool = True, - cond: bool = True) -> dict[str, str]: - with unittest.mock.patch('torch.allclose', new=fp8_allclose): - return torch.library.opcheck( - op, - args, - kwargs, - test_utils=test_utils, - raise_exception=raise_exception) if cond else {} +def opcheck( + op: Union[ + torch._ops.OpOverload, + torch._ops.OpOverloadPacket, + torch._library.custom_ops.CustomOpDef, + ], + args: tuple[Any, ...], + kwargs: Optional[dict[str, Any]] = None, + *, + test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS, + raise_exception: bool = True, + cond: bool = True, +) -> dict[str, str]: + with unittest.mock.patch("torch.allclose", new=fp8_allclose): + return ( + torch.library.opcheck( + op, args, kwargs, test_utils=test_utils, raise_exception=raise_exception + ) + if cond + else {} + ) # For testing quantized linear kernels def to_fp8(tensor: torch.Tensor): finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) def to_int8(tensor: torch.Tensor): return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) -def baseline_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - +def baseline_scaled_mm( + a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: type[torch.dtype], + bias: Optional[torch.Tensor] = None, +) -> torch.Tensor: # We treat N-dimensional group scaling as extended numpy-style broadcasting # in numpy simply stretches dimensions with an extent of 1 to match the # the target shape by repeating the data along that dimension (broadcasting) @@ -1234,16 +1292,19 @@ def group_broadcast(t, shape): for i, s in enumerate(shape): if t.shape[i] != s and t.shape[i] != 1: assert s % t.shape[i] == 0 - t = t.unsqueeze(i + 1)\ - .expand(*t.shape[:i+1], s // t.shape[i], *t.shape[i+1:])\ - .flatten(i, i + 1) + t = ( + t.unsqueeze(i + 1) + .expand(*t.shape[: i + 1], s // t.shape[i], *t.shape[i + 1 :]) + .flatten(i, i + 1) + ) return t scale_a = group_broadcast(scale_a, a.shape) scale_b = group_broadcast(scale_b, b.shape) - output = torch.mm((scale_a * a.to(dtype=torch.float32)), - (scale_b * b.to(dtype=torch.float32))).to(out_dtype) + output = torch.mm( + (scale_a * a.to(dtype=torch.float32)), (scale_b * b.to(dtype=torch.float32)) + ).to(out_dtype) if bias is not None: output = output + bias diff --git a/tests/kv_transfer/test_disagg.py b/tests/kv_transfer/test_disagg.py index 9f2229cc41df..1d24851bb782 100644 --- a/tests/kv_transfer/test_disagg.py +++ b/tests/kv_transfer/test_disagg.py @@ -19,8 +19,11 @@ def setup_servers(): pytest.skip("Skipping test: fewer than 2 GPUs available") # Set up environment variables - VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", - shell=True).decode().strip() + VLLM_HOST_IP = ( + subprocess.check_output("hostname -I | awk '{print $1}'", shell=True) + .decode() + .strip() + ) os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP # Start prefill instance @@ -37,7 +40,7 @@ def setup_servers(): "--max-model-len", 
"1000", "--kv-transfer-config", - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer",'\ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer",' '"kv_rank":0,"kv_parallel_size":2}', ] prefill_env = os.environ.copy() @@ -58,7 +61,7 @@ def setup_servers(): "--max-model-len", "1000", "--kv-transfer-config", - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer",'\ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer",' '"kv_rank":1,"kv_parallel_size":2}', ] decode_env = os.environ.copy() @@ -98,23 +101,27 @@ def wait_for_server(port, timeout=240): @pytest.mark.parametrize("prompt", ["San Francisco is a", "Santa Clara is a"]) def test_disaggregated_prefilling(prompt): # Send to prefill - response = requests.post("http://localhost:8100/v1/completions", - headers={"Content-Type": "application/json"}, - json={ - "model": "meta-llama/Llama-3.2-1B-Instruct", - "prompt": prompt, - "max_tokens": 1, - "temperature": 0 - }) + response = requests.post( + "http://localhost:8100/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": "meta-llama/Llama-3.2-1B-Instruct", + "prompt": prompt, + "max_tokens": 1, + "temperature": 0, + }, + ) assert response.status_code == 200 # Send to decode - response = requests.post("http://localhost:8200/v1/completions", - headers={"Content-Type": "application/json"}, - json={ - "model": "meta-llama/Llama-3.2-1B-Instruct", - "prompt": prompt, - "max_tokens": 10, - "temperature": 0 - }) + response = requests.post( + "http://localhost:8200/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": "meta-llama/Llama-3.2-1B-Instruct", + "prompt": prompt, + "max_tokens": 10, + "temperature": 0, + }, + ) assert response.status_code == 200 diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 352ab63552de..ff96527318b8 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -8,8 +8,7 @@ from tqdm import tqdm from vllm.config import KVTransferConfig -from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import ( - SimpleBuffer) +from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import SimpleBuffer from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe # TODO: the test depends on a lot of fields in the current implementation. 
@@ -17,7 +16,6 @@ def test_run(my_rank, buffer, device): - # buffer should be empty in the beginning if my_rank == 0: assert buffer.buffer_size == 0 @@ -27,7 +25,7 @@ def test_run(my_rank, buffer, device): # insert tokens = torch.tensor([1, 2, 3]).to(device) - roi = (tokens > 0) + roi = tokens > 0 if my_rank == 0: key = 2.0 * torch.ones([5, 6]).to(device) value = 3.0 * torch.ones([5, 6]).to(device) @@ -55,7 +53,6 @@ def test_run(my_rank, buffer, device): def stress_test(my_rank, buf, device): - torch.distributed.barrier() torch.manual_seed(100) @@ -66,7 +63,8 @@ def stress_test(my_rank, buf, device): torch.rand(100).to(device), # key torch.rand(100).to(device), # value torch.rand(100).to(device), # hidden - ) for i in tqdm(range(200)) + ) + for i in tqdm(range(200)) ] random.seed(my_rank) @@ -115,12 +113,11 @@ def stress_test(my_rank, buf, device): if __name__ == "__main__": - - my_rank = int(os.environ['RANK']) + my_rank = int(os.environ["RANK"]) torch.distributed.init_process_group( - backend='gloo', - init_method='tcp://localhost:12398', + backend="gloo", + init_method="tcp://localhost:12398", world_size=2, rank=my_rank, ) @@ -128,8 +125,8 @@ def stress_test(my_rank, buf, device): print(f"initialized! My rank is {my_rank}") config = KVTransferConfig( - kv_connector='PyNcclConnector', - kv_buffer_device='cuda', + kv_connector="PyNcclConnector", + kv_buffer_device="cuda", kv_buffer_size=1e9, kv_rank=my_rank, kv_role="kv_both", # this arg doesn't matter in this test @@ -160,4 +157,4 @@ def stress_test(my_rank, buf, device): buffer.close() data_pipe.close() cpu_pipe.close() - print('Done') + print("Done") diff --git a/tests/kv_transfer/test_module.py b/tests/kv_transfer/test_module.py index 7a04174870da..b9a28e4bceb7 100644 --- a/tests/kv_transfer/test_module.py +++ b/tests/kv_transfer/test_module.py @@ -9,21 +9,19 @@ def run_python_script(script_name, timeout): - script_name = f'kv_transfer/{script_name}' + script_name = f"kv_transfer/{script_name}" try: # Start both processes asynchronously using Popen process0 = subprocess.Popen( [sys.executable, script_name], - env={"RANK": - "0"}, # Set the RANK environment variable for process 0 + env={"RANK": "0"}, # Set the RANK environment variable for process 0 stdout=sys.stdout, # Pipe stdout to current stdout stderr=sys.stderr, # Pipe stderr to current stderr ) process1 = subprocess.Popen( [sys.executable, script_name], - env={"RANK": - "1"}, # Set the RANK environment variable for process 1 + env={"RANK": "1"}, # Set the RANK environment variable for process 1 stdout=sys.stdout, # Pipe stdout to current stdout stderr=sys.stderr, # Pipe stderr to current stderr ) @@ -34,11 +32,9 @@ def run_python_script(script_name, timeout): # Check the return status of both processes if process0.returncode != 0: - pytest.fail( - f"Test {script_name} failed for RANK=0, {process0.returncode}") + pytest.fail(f"Test {script_name} failed for RANK=0, {process0.returncode}") if process1.returncode != 0: - pytest.fail( - f"Test {script_name} failed for RANK=1, {process1.returncode}") + pytest.fail(f"Test {script_name} failed for RANK=1, {process1.returncode}") except subprocess.TimeoutExpired: # If either process times out, terminate both and fail the test @@ -53,15 +49,14 @@ def run_python_script(script_name, timeout): @pytest.mark.parametrize( "script_name,timeout", [ - ("test_lookup_buffer.py", - 60), # Second test case with a 60-second timeout - ("test_send_recv.py", 120) # First test case with a 120-second timeout - ]) + ("test_lookup_buffer.py", 60), # Second 
test case with a 60-second timeout + ("test_send_recv.py", 120), # First test case with a 120-second timeout + ], +) def test_run_python_script(script_name, timeout): # Check the number of GPUs if torch.cuda.device_count() < 2: - pytest.skip( - f"Skipping test {script_name} because <2 GPUs are available") + pytest.skip(f"Skipping test {script_name} because <2 GPUs are available") # Run the test if there are at least 2 GPUs run_python_script(script_name, timeout) diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 32116608a217..16ae4ad2ee9f 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -15,7 +15,7 @@ def test_run(my_rank, pipe): print(f"rank {my_rank} test_run starts....") # test run x = torch.tensor([1]).to(pipe.device) - y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device) + y = torch.tensor([[2.0, 3.0, 4.0, 8.0]]).to(pipe.device) if my_rank == 0: pipe.send_tensor(x) print(f"rank {my_rank} sent tensor x") @@ -53,9 +53,8 @@ def stress_test(my_rank, pipe): for i in tqdm(range(500)): mean = torch.rand(1).item() * 100 std = torch.rand(1).item() * 100 - size = torch.randint(900, 1000, (2, )) - x = torch.normal(mean * 1.0, std * 1.0, - size=size.tolist()).to(pipe.device) + size = torch.randint(900, 1000, (2,)) + x = torch.normal(mean * 1.0, std * 1.0, size=size.tolist()).to(pipe.device) # 5% probability of sending a None if torch.rand(1).item() < 0.05: @@ -96,20 +95,16 @@ def latency_test(my_rank, pipe, nelement, ntensor): torch.distributed.barrier() for i in tqdm(range(500)): - tensors = [] if my_rank == 0: # create tensor - tensors = [ - torch.rand(nelement).to(pipe.device) for _ in range(ntensor) - ] + tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)] torch.distributed.barrier() if my_rank == 0: - t = torch.tensor([time.time()], - dtype=torch.float64).to(pipe.device) + t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device) for tensor in tensors: pipe.send_tensor(tensor) pipe.send_tensor(t) @@ -121,24 +116,23 @@ def latency_test(my_rank, pipe, nelement, ntensor): torch.distributed.barrier() - print('Latency test passed.') - print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms') + print("Latency test passed.") + print("Latency:", torch.tensor(latencies).mean().item() * 1000, "ms") if __name__ == "__main__": - - my_rank = int(os.environ['RANK']) + my_rank = int(os.environ["RANK"]) torch.distributed.init_process_group( - backend='gloo', - init_method='tcp://localhost:12398', + backend="gloo", + init_method="tcp://localhost:12398", world_size=2, rank=my_rank, ) config = KVTransferConfig( - kv_connector='PyNcclConnector', - kv_buffer_device='cuda', + kv_connector="PyNcclConnector", + kv_buffer_device="cuda", kv_buffer_size=1e9, kv_rank=my_rank, kv_role="kv_both", # this arg doesn't matter in this test diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 881d5efa6919..7b836f765403 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -12,12 +12,16 @@ import vllm from vllm.config import LoRAConfig -from vllm.distributed import (cleanup_dist_env_and_memory, - init_distributed_environment, - initialize_model_parallel) -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - RowParallelLinear) +from vllm.distributed import ( + cleanup_dist_env_and_memory, + init_distributed_environment, + initialize_model_parallel, +) +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + 
MergedColumnParallelLinear, + RowParallelLinear, +) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead @@ -51,11 +55,13 @@ def dist_init(): if current_platform.is_cpu() or current_platform.is_tpu(): backend = "gloo" - init_distributed_environment(world_size=1, - rank=0, - distributed_init_method=f"file://{temp_file}", - local_rank=0, - backend=backend) + init_distributed_environment( + world_size=1, + rank=0, + distributed_init_method=f"file://{temp_file}", + local_rank=0, + backend=backend, + ) initialize_model_parallel(1, 1) yield cleanup_dist_env_and_memory(shutdown_ray=True) @@ -70,10 +76,9 @@ def dist_init_torch_only(): backend = "gloo" temp_file = tempfile.mkstemp()[1] - torch.distributed.init_process_group(world_size=1, - rank=0, - init_method=f"file://{temp_file}", - backend=backend) + torch.distributed.init_process_group( + world_size=1, rank=0, init_method=f"file://{temp_file}", backend=backend + ) class DummyLoRAModel(nn.Sequential, SupportsLoRA): @@ -83,25 +88,31 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA): @pytest.fixture def dummy_model() -> nn.Module: model = DummyLoRAModel( - OrderedDict([ - ("dense1", ColumnParallelLinear(764, 100)), - ("dense2", RowParallelLinear(100, 50)), - ( - "layer1", - nn.Sequential( - OrderedDict([ - ("dense1", ColumnParallelLinear(100, 10)), - ("dense2", RowParallelLinear(10, 50)), - ])), - ), - ("act2", nn.ReLU()), - ("output", ColumnParallelLinear(50, 10)), - ("outact", nn.Sigmoid()), - # Special handling for lm_head & sampler - ("lm_head", ParallelLMHead(512, 10)), - ("logits_processor", LogitsProcessor(512)), - ("sampler", Sampler()) - ])) + OrderedDict( + [ + ("dense1", ColumnParallelLinear(764, 100)), + ("dense2", RowParallelLinear(100, 50)), + ( + "layer1", + nn.Sequential( + OrderedDict( + [ + ("dense1", ColumnParallelLinear(100, 10)), + ("dense2", RowParallelLinear(10, 50)), + ] + ) + ), + ), + ("act2", nn.ReLU()), + ("output", ColumnParallelLinear(50, 10)), + ("outact", nn.Sigmoid()), + # Special handling for lm_head & sampler + ("lm_head", ParallelLMHead(512, 10)), + ("logits_processor", LogitsProcessor(512)), + ("sampler", Sampler()), + ] + ) + ) model.config = MagicMock() model.embedding_modules = {"lm_head": "lm_head"} return model @@ -110,25 +121,31 @@ def dummy_model() -> nn.Module: @pytest.fixture def dummy_model_gate_up() -> nn.Module: model = DummyLoRAModel( - OrderedDict([ - ("dense1", ColumnParallelLinear(764, 100)), - ("dense2", RowParallelLinear(100, 50)), - ( - "layer1", - nn.Sequential( - OrderedDict([ - ("dense1", ColumnParallelLinear(100, 10)), - ("dense2", RowParallelLinear(10, 50)), - ])), - ), - ("act2", nn.ReLU()), - ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])), - ("outact", nn.Sigmoid()), - # Special handling for lm_head & sampler - ("lm_head", ParallelLMHead(512, 10)), - ("logits_processor", LogitsProcessor(512)), - ("sampler", Sampler()) - ])) + OrderedDict( + [ + ("dense1", ColumnParallelLinear(764, 100)), + ("dense2", RowParallelLinear(100, 50)), + ( + "layer1", + nn.Sequential( + OrderedDict( + [ + ("dense1", ColumnParallelLinear(100, 10)), + ("dense2", RowParallelLinear(10, 50)), + ] + ) + ), + ), + ("act2", nn.ReLU()), + ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])), + ("outact", nn.Sigmoid()), + # Special handling for lm_head & sampler + ("lm_head", ParallelLMHead(512, 10)), + ("logits_processor", LogitsProcessor(512)), + 
("sampler", Sampler()), + ] + ) + ) model.config = MagicMock() model.packed_modules_mapping = { "gate_up_proj": [ @@ -232,8 +249,7 @@ def llama_2_7b_engine_extra_embeddings(): get_model_old = get_model def get_model_patched(**kwargs): - kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4, - max_lora_rank=8) + kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4, max_lora_rank=8) return get_model_old(**kwargs) with patch("vllm.worker.model_runner.get_model", get_model_patched): @@ -245,8 +261,9 @@ def get_model_patched(**kwargs): @pytest.fixture def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings): - yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. - model_runner.model) + yield ( + llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.model_runner.model + ) @pytest.fixture diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index cc8160b2860d..a7d95ef19ccf 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -8,7 +8,8 @@ import vllm.envs as env from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) + build_async_engine_client_from_engine_args, +) from vllm.inputs import TextPrompt from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -27,14 +28,10 @@ def get_lora_requests(lora_path) -> list[LoRARequest]: return lora_requests -async def requests_processing_time(llm, - lora_requests: list[LoRARequest]) -> float: - - sampling_params = SamplingParams(n=1, - temperature=0.0, - top_p=1.0, - ignore_eos=True, - max_tokens=1) +async def requests_processing_time(llm, lora_requests: list[LoRARequest]) -> float: + sampling_params = SamplingParams( + n=1, temperature=0.0, top_p=1.0, ignore_eos=True, max_tokens=1 + ) generators = [] start = time.perf_counter() @@ -42,11 +39,11 @@ async def requests_processing_time(llm, for lora_request in lora_requests: lora_int_id = lora_request.lora_int_id generator = llm.generate( - prompt=TextPrompt(prompt=f"hello {lora_int_id}", - multi_modal_data=None), # type: ignore + prompt=TextPrompt(prompt=f"hello {lora_int_id}", multi_modal_data=None), # type: ignore sampling_params=sampling_params, lora_request=lora_request, - request_id=f"test{lora_int_id}") + request_id=f"test{lora_int_id}", + ) generators.append(generator) all_gens = merge_async_iterators(*generators) @@ -59,13 +56,13 @@ async def requests_processing_time(llm, @pytest.mark.asyncio async def test_add_lora(chatglm3_lora_files): - """ + """ The add_lora function is used to pre-load some LoRA adapters into the engine in anticipation of future requests using these adapters. To test this functionality, we use the async engine to process some requests - We do it twice, once with add_lora() pre-loading and once without. - We measure the request processing time in both cases and expect the time + We measure the request processing time in both cases and expect the time to be lesser in the case with add_lora() calls. 
""" lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files) @@ -79,18 +76,18 @@ async def test_add_lora(chatglm3_lora_files): max_loras=max_loras, max_lora_rank=LORA_RANK, max_model_len=128, - gpu_memory_utilization=0.8, #avoid OOM + gpu_memory_utilization=0.8, # avoid OOM trust_remote_code=True, - enforce_eager=True) + enforce_eager=True, + ) # split lora_requests into 3 parts part_size = len(lora_requests) // 3 dummy_run_requests = lora_requests[:part_size] - warmup_run_requests = lora_requests[part_size:part_size * 2] - cold_run_requests = lora_requests[part_size * 2:] + warmup_run_requests = lora_requests[part_size : part_size * 2] + cold_run_requests = lora_requests[part_size * 2 :] async with build_async_engine_client_from_engine_args(engine_args) as llm: - # Dummy run - So any 1-time functionality like triton kernel compilation # is complete here. await requests_processing_time(llm, dummy_run_requests) @@ -104,18 +101,16 @@ async def test_add_lora(chatglm3_lora_files): else: # No way to check V0 engine results as the calls just return None. pass - time_with_add_lora = await requests_processing_time( - llm, warmup_run_requests) + time_with_add_lora = await requests_processing_time(llm, warmup_run_requests) # Run without any warmup - time_cold_start = await requests_processing_time( - llm, cold_run_requests) + time_cold_start = await requests_processing_time(llm, cold_run_requests) - print(f"time hot-start {time_with_add_lora} vs " - f"time cold-start {time_cold_start} ") + print(f"time hot-start {time_with_add_lora} vs time cold-start {time_cold_start} ") assert time_with_add_lora < time_cold_start, ( f"time_with_add_lora={time_with_add_lora}, " f"time_cold_start={time_cold_start}" "The engine request processing time with LoRA pre-loading " - "must be less than the version that does on-demand LoRA loading.") + "must be less than the version that does on-demand LoRA loading." + ) diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 774ebb9db210..3eb8c81f2261 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -16,12 +16,10 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( - query= - "What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 + query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 ), PROMPT_TEMPLATE.format( - query= - "Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 + query="Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 ), ] print(prompts) @@ -29,8 +27,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, + ) # Print the outputs. 
generated_texts: list[str] = [] for output in outputs: @@ -42,12 +40,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def test_baichuan_lora(baichuan_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True) + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + trust_remote_code=True, + ) expected_lora_output = [ "SELECT count(*) FROM singer", @@ -64,31 +64,36 @@ def test_baichuan_lora(baichuan_lora_files): @pytest.mark.parametrize("fully_sharded", [True, False]) -def test_baichuan_tensor_parallel_equality(baichuan_lora_files, - num_gpus_available, fully_sharded): +def test_baichuan_tensor_parallel_equality( + baichuan_lora_files, num_gpus_available, fully_sharded +): if num_gpus_available < 4: pytest.skip(f"Not enough GPUs for tensor parallelism {4}") - llm_tp1 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) + llm_tp1 = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + max_lora_rank=64, + trust_remote_code=True, + fully_sharded_loras=fully_sharded, + ) output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) del llm_tp1 cleanup_dist_env_and_memory() - llm_tp2 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=2, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) + llm_tp2 = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=2, + trust_remote_code=True, + fully_sharded_loras=fully_sharded, + ) output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) del llm_tp2 @@ -96,14 +101,16 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, assert output_tp1 == output_tp2 - llm_tp4 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=4, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) + llm_tp4 = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=fully_sharded, + ) output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) del llm_tp4 diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 5481b413b8f5..8495d8e8c168 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -21,20 +21,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( - query= - "What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 + query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 ), PROMPT_TEMPLATE.format( - query= - "Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 + query="Show name, country, age for all singers ordered by age from the oldest to the youngest." 
# noqa: E501 ), ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, + ) # Print the outputs. generated_texts: list[str] = [] for output in outputs: @@ -47,13 +45,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: @create_new_process_for_each_test() def test_chatglm3_lora(chatglm3_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True, - enable_chunked_prefill=True) + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + trust_remote_code=True, + enable_chunked_prefill=True, + ) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): @@ -66,15 +66,17 @@ def test_chatglm3_lora(chatglm3_lora_files): @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_chatglm3_lora_tp4(chatglm3_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=4, - trust_remote_code=True, - fully_sharded_loras=False, - enable_chunked_prefill=True) + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=False, + enable_chunked_prefill=True, + ) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): @@ -87,15 +89,17 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=4, - trust_remote_code=True, - fully_sharded_loras=True, - enable_chunked_prefill=True) + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=True, + enable_chunked_prefill=True, + ) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/lora/test_default_mm_loras.py b/tests/lora/test_default_mm_loras.py index f615ceda76b5..1a5b9ba3641d 100644 --- a/tests/lora/test_default_mm_loras.py +++ b/tests/lora/test_default_mm_loras.py @@ -32,15 +32,12 @@ "max_lora_rank": 320, "max_model_len": 12800, "gpu_memory_utilization": 0.8, - "limit_mm_per_prompt": { - "audio": 1 - }, + "limit_mm_per_prompt": {"audio": 1}, "enforce_eager": True, } -def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, - **kwargs): +def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs): inputs = [([AUDIO_PROMPT], [audio_assets[0].audio_and_sample_rate[0]])] # Apply any additional kwargs as overrides to the base kwargs @@ -53,11 +50,11 @@ def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, max_tokens=128, audios=audios, lora_request=lora_request, - ) for prompts, audios in inputs + ) + for prompts, audios in inputs ] - assert vllm_outputs_with_default_lora[-1][-1][-1].endswith( - expected_suffix) + assert 
vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix) def test_active_default_mm_lora( diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 92db023babc2..9b7c78aeefda 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -15,29 +15,42 @@ from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA, - RowParallelLinearWithShardedLoRA) + MergedQKVParallelLinearWithShardedLoRA, + QKVParallelLinearWithShardedLoRA, + RowParallelLinearWithShardedLoRA, +) + # yapf conflicts with isort for this block # yapf: disable -from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, - LogitsProcessorWithLoRA, LoRAMapping, - MergedColumnParallelLinearWithLoRA, - MergedQKVParallelLinearWithLoRA, - QKVParallelLinearWithLoRA, - ReplicatedLinearWithLoRA, - RowParallelLinearWithLoRA, - VocabParallelEmbeddingWithLoRA) +from vllm.lora.layers import ( + BaseLayerWithLoRA, + ColumnParallelLinearWithLoRA, + LogitsProcessorWithLoRA, + LoRAMapping, + MergedColumnParallelLinearWithLoRA, + MergedQKVParallelLinearWithLoRA, + QKVParallelLinearWithLoRA, + ReplicatedLinearWithLoRA, + RowParallelLinearWithLoRA, + VocabParallelEmbeddingWithLoRA, +) + # yapf: enable from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.punica_wrapper import get_punica_wrapper -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) + ParallelLMHead, + VocabParallelEmbedding, + get_masked_input_and_mask, +) from vllm.model_executor.utils import set_random_seed from vllm.platforms import current_platform @@ -51,11 +64,14 @@ pytestmark = pytest.mark.skipif( not (current_platform.is_cuda_alike() or current_platform.is_cpu()), - reason="Backend not supported") + reason="Backend not supported", +) -DEVICES = ([ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] if current_platform.is_cuda_alike() else ["cpu"]) +DEVICES = ( + [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + if current_platform.is_cuda_alike() + else ["cpu"] +) # prefill stage(True) or decode stage(False) STAGES = [True, False] @@ -68,8 +84,8 @@ @pytest.fixture(autouse=True) def clean_cache_reset_device(reset_default_device): # Release any memory we might be holding on to. CI runs OOMs otherwise. - from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT, - _LORA_B_PTR_DICT) + from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT + _LORA_B_PTR_DICT.clear() _LORA_A_PTR_DICT.clear() @@ -79,13 +95,14 @@ def clean_cache_reset_device(reset_default_device): @pytest.fixture(autouse=True) def skip_cuda_with_stage_false(request): """ - On cuda-like platforms, we use the same kernels for prefill and decode + On cuda-like platforms, we use the same kernels for prefill and decode stage, and 'stage' is generally ignored, so we only need to test once. 
""" if current_platform.is_cuda_alike(): try: if hasattr(request.node, "callspec") and hasattr( - request.node.callspec, "params"): + request.node.callspec, "params" + ): params = request.node.callspec.params if "stage" in params and params["stage"] is False: pytest.skip("Skip test when stage=False") @@ -94,9 +111,9 @@ def skip_cuda_with_stage_false(request): yield -def get_random_id_to_index(num_loras: int, - num_slots: int, - log: bool = True) -> list[Optional[int]]: +def get_random_id_to_index( + num_loras: int, num_slots: int, log: bool = True +) -> list[Optional[int]]: """Creates a random lora_id_to_index mapping. Args: @@ -109,7 +126,8 @@ def get_random_id_to_index(num_loras: int, if num_loras > num_slots: raise ValueError( f"num_loras is higher than num_slots: {num_loras} > {num_slots}. " - "num_loras must be less than or equal to num_slots.") + "num_loras must be less than or equal to num_slots." + ) slots: list[Optional[int]] = [None] * num_slots random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist() @@ -158,19 +176,18 @@ def populate_loras( subloras: list[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): - sublora = DummyLoRAManager( - layer_weights.device).init_random_lora( - module_name=f"fake_{i}", - weight=layer_weights, - generate_embeddings_tensor=generate_embeddings_tensor, - ) - sublora.lora_b = sublora.lora_b[:, (sublora_len * - i):(sublora_len * (i + 1))] + sublora = DummyLoRAManager(layer_weights.device).init_random_lora( + module_name=f"fake_{i}", + weight=layer_weights, + generate_embeddings_tensor=generate_embeddings_tensor, + ) + sublora.lora_b = sublora.lora_b[ + :, (sublora_len * i) : (sublora_len * (i + 1)) + ] sublora.optimize() subloras.append(sublora) - lora = PackedLoRALayerWeights.pack( - subloras) if repeats > 1 else subloras[0] + lora = PackedLoRALayerWeights.pack(subloras) if repeats > 1 else subloras[0] layer.set_lora( slot_idx, @@ -191,7 +208,7 @@ def create_random_inputs( input_size: tuple[int, ...], input_range: tuple[float, float], input_type: torch.dtype = torch.int, - device: torch.device = "cuda" + device: torch.device = "cuda", ) -> tuple[list[torch.Tensor], list[int], list[int]]: """Creates random inputs. 
@@ -213,14 +230,15 @@ def create_random_inputs( for _ in range(num_inputs): if input_type == torch.int: inputs.append( - torch.randint(low=int(low), - high=int(high), - size=input_size, - device=device)) + torch.randint( + low=int(low), high=int(high), size=input_size, device=device + ) + ) else: inputs.append( - torch.rand(size=input_size, dtype=input_type, device=device) * - high + low) + torch.rand(size=input_size, dtype=input_type, device=device) * high + + low + ) lora_id = random.choice(active_lora_ids) index_mapping += [lora_id] * input_size[0] @@ -258,9 +276,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: max_loras = 8 punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16) + lora_config = LoRAConfig( + max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16 + ) def create_random_embedding_layer(): embedding = VocabParallelEmbedding(vocab_size, 256) @@ -286,15 +304,18 @@ def create_random_embedding_layer(): inputs, index_mapping, prompt_mapping = create_random_inputs( active_lora_ids=list(lora_dict.keys()), num_inputs=num_loras * 3, - input_size=(200, ), + input_size=(200,), input_range=(1, vocab_size), - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) - punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, - vocab_size, - lora_config.lora_extra_vocab_size) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) lora_result = lora_embedding(torch.cat(inputs)) @@ -306,15 +327,12 @@ def create_random_embedding_layer(): input_, lora.lora_a, ) - result += (after_a @ lora.lora_b) + result += after_a @ lora.lora_b expected_results.append(result) expected_result = torch.cat(expected_results) rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) # Check that resetting the lora weights succeeds @@ -324,24 +342,24 @@ def create_random_embedding_layer(): inputs, index_mapping, prompt_mapping = create_random_inputs( active_lora_ids=[0], num_inputs=num_loras * 3, - input_size=(200, ), + input_size=(200,), input_range=(1, vocab_size), - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) - punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, - vocab_size, - lora_config.lora_extra_vocab_size) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) lora_result = lora_embedding(torch.cat(inputs)) expected_result = embedding(torch.cat(inputs)) rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) @torch.inference_mode() @@ -351,9 +369,9 @@ def create_random_embedding_layer(): @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) 
-def test_embeddings_with_new_embeddings(dist_init, num_loras, device, - vocab_size, stage) -> None: - +def test_embeddings_with_new_embeddings( + dist_init, num_loras, device, vocab_size, stage +) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -361,9 +379,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, max_loras = 8 punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16) + lora_config = LoRAConfig( + max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16 + ) def create_random_embedding_layer(): embedding = VocabParallelEmbedding(vocab_size, 256) @@ -373,12 +391,12 @@ def create_random_embedding_layer(): expanded_embedding = VocabParallelEmbedding( vocab_size + lora_config.lora_extra_vocab_size * max_loras, 256, - org_num_embeddings=vocab_size) + org_num_embeddings=vocab_size, + ) expanded_embedding.weight.data[:vocab_size, :] = embedding_data # We need to deepcopy the embedding as it will be modified # in place - lora_embedding = VocabParallelEmbeddingWithLoRA( - deepcopy(expanded_embedding)) + lora_embedding = VocabParallelEmbeddingWithLoRA(deepcopy(expanded_embedding)) lora_embedding.create_lora_weights(max_loras, lora_config) return expanded_embedding, lora_embedding @@ -392,7 +410,8 @@ def create_random_embedding_layer(): id_to_index, layer=lora_embedding, layer_weights=torch.zeros( - (256, vocab_size + lora_config.lora_extra_vocab_size)), + (256, vocab_size + lora_config.lora_extra_vocab_size) + ), generate_embeddings_tensor=256, ) @@ -410,52 +429,53 @@ def create_random_embedding_layer(): inputs, index_mapping, prompt_mapping = create_random_inputs( active_lora_ids=list(lora_dict.keys()), num_inputs=num_loras * 3, - input_size=(200, ), + input_size=(200,), input_range=(1, vocab_size), - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) - punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, - vocab_size, - lora_config.lora_extra_vocab_size) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) original_inputs = deepcopy(inputs) # Force some of the inputs to be in the extended embeddings range # to guarantee that their behavior is tested. 
- for input_, original_input_, lora_id in zip(inputs, original_inputs, - prompt_mapping): + for input_, original_input_, lora_id in zip( + inputs, original_inputs, prompt_mapping + ): embedding_id = lora_id - 1 input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len) original_input_[-1] = vocab_size - input_[-2] = vocab_size + ( - (embedding_id + 1) * embeddings_tensor_len - 1) + input_[-2] = vocab_size + ((embedding_id + 1) * embeddings_tensor_len - 1) original_input_[-2] = vocab_size + embeddings_tensor_len - 1 - expanded_embedding.weight[vocab_size:vocab_size + - (embeddings_tensor_len * - max_loras)] = torch.cat(embeddings_tensors) + expanded_embedding.weight[ + vocab_size : vocab_size + (embeddings_tensor_len * max_loras) + ] = torch.cat(embeddings_tensors) lora_result = lora_embedding(torch.cat(original_inputs)) expected_results: list[torch.Tensor] = [] - for input_, original_input_, lora_id in zip(inputs, original_inputs, - prompt_mapping): + for input_, original_input_, lora_id in zip( + inputs, original_inputs, prompt_mapping + ): lora = lora_dict[lora_id] result = expanded_embedding(input_) after_a = F.embedding( original_input_, lora.lora_a, ) - result += (after_a @ lora.lora_b) + result += after_a @ lora.lora_b expected_results.append(result) expected_result = torch.cat(expected_results) rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) # Check that resetting the lora weights succeeds @@ -465,24 +485,24 @@ def create_random_embedding_layer(): inputs, index_mapping, prompt_mapping = create_random_inputs( active_lora_ids=[0], num_inputs=num_loras * 3, - input_size=(200, ), + input_size=(200,), input_range=(1, vocab_size), - device=device) + device=device, + ) original_inputs = deepcopy(inputs) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) - punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, - vocab_size, - lora_config.lora_extra_vocab_size) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) lora_result = lora_embedding(torch.cat(original_inputs)) expected_result = expanded_embedding(torch.cat(inputs)) rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) @torch.inference_mode() @@ -490,9 +510,9 @@ def create_random_embedding_layer(): @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) @pytest.mark.parametrize("stage", STAGES) -def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, - stage) -> None: - +def test_lm_head_logits_processor( + dist_init, num_loras, device, vocab_size, stage +) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -500,22 +520,25 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, max_loras = 8 punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16) + lora_config = LoRAConfig( + max_loras=max_loras, max_lora_rank=8, 
lora_dtype=torch.float16 + ) def _pretest(): - linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size, - 1024, - vocab_size, - params_dtype=torch.float16) + linear = ParallelLMHead( + vocab_size + lora_config.lora_extra_vocab_size, + 1024, + vocab_size, + params_dtype=torch.float16, + ) linear.weight.data = torch.rand_like(linear.weight.data) linear.weight.data[:, vocab_size:] = 0 logits_processor = LogitsProcessor( - vocab_size + lora_config.lora_extra_vocab_size, vocab_size) + vocab_size + lora_config.lora_extra_vocab_size, vocab_size + ) lora_logits_processor = LogitsProcessorWithLoRA( - logits_processor, 1024, linear.weight.dtype, linear.weight.device, - None) + logits_processor, 1024, linear.weight.dtype, linear.weight.device, None + ) lora_logits_processor.create_lora_weights(max_loras, lora_config) return linear, logits_processor, lora_logits_processor @@ -542,10 +565,9 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( lora_mapping, id_to_index, @@ -556,25 +578,24 @@ def _pretest(): input_ = torch.rand(20, 1024) lora_result = lora_logits_processor._get_logits( - hidden_states=torch.cat(inputs), - lm_head=linear, - embedding_bias=None) + hidden_states=torch.cat(inputs), lm_head=linear, embedding_bias=None + ) original_lm_head = deepcopy(linear) - linear.weight[logits_processor. - org_vocab_size:logits_processor.org_vocab_size + - embeddings_tensor_len] = embeddings_tensor + linear.weight[ + logits_processor.org_vocab_size : logits_processor.org_vocab_size + + embeddings_tensor_len + ] = embeddings_tensor - logits_processor.org_vocab_size = (vocab_size + - lora_config.lora_extra_vocab_size) + logits_processor.org_vocab_size = vocab_size + lora_config.lora_extra_vocab_size expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] - result = logits_processor._get_logits(hidden_states=input_, - lm_head=linear, - embedding_bias=None) - result[:, vocab_size + embeddings_tensor_len:] = float("-inf") + result = logits_processor._get_logits( + hidden_states=input_, lm_head=linear, embedding_bias=None + ) + result[:, vocab_size + embeddings_tensor_len :] = float("-inf") result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling expected_results.append(result) expected_result = torch.cat(expected_results) @@ -591,10 +612,9 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( lora_mapping, id_to_index, @@ -606,17 +626,16 @@ def _pretest(): lora_result = lora_logits_processor._get_logits( hidden_states=torch.cat(inputs), lm_head=original_lm_head, - embedding_bias=None)[:, :vocab_size] + embedding_bias=None, + )[:, :vocab_size] expected_result = logits_processor._get_logits( hidden_states=torch.cat(inputs), lm_head=original_lm_head, - embedding_bias=None) + embedding_bias=None, + ) rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) 
@torch.inference_mode() @@ -624,9 +643,7 @@ def _pretest(): @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("bias_enabled", [True, False]) -def test_linear_replicated(dist_init, num_loras, device, stage, - bias_enabled) -> None: - +def test_linear_replicated(dist_init, num_loras, device, stage, bias_enabled) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -634,23 +651,25 @@ def test_linear_replicated(dist_init, num_loras, device, stage, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.float16, + bias_enabled=bias_enabled, + ) def create_random_linear_replicated_layer(): - - linear = ReplicatedLinear(4096, - 4096, - bias=False, - params_dtype=torch.float16) + linear = ReplicatedLinear(4096, 4096, bias=False, params_dtype=torch.float16) linear.weight.data = torch.rand_like(linear.weight.data) lora_linear = ReplicatedLinearWithLoRA(linear) lora_linear.create_lora_weights(max_loras, lora_config) - assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( - lora_linear.lora_b_stacked) == 1) + assert ( + lora_linear.n_slices + == len(lora_linear.lora_a_stacked) + == len(lora_linear.lora_b_stacked) + == 1 + ) if bias_enabled: assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: @@ -676,10 +695,9 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( lora_mapping, id_to_index, @@ -699,10 +717,7 @@ def create_random_linear_replicated_layer(): expected_result = torch.cat(expected_results) rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) # Check that resetting the lora weights succeeds @@ -715,22 +730,19 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) - punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, - 512, lora_config.lora_extra_vocab_size) + punica_wrapper.update_metadata( + lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size + ) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) @torch.inference_mode() @@ -740,9 +752,9 @@ def create_random_linear_replicated_layer(): @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("bias_enabled", [True, False]) -def test_linear_parallel(dist_init, num_loras, 
orientation, fully_shard, - device, stage, bias_enabled) -> None: - +def test_linear_parallel( + dist_init, num_loras, orientation, fully_shard, device, stage, bias_enabled +) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -750,33 +762,42 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - fully_sharded_loras=fully_shard, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.float16, + bias_enabled=bias_enabled, + ) def create_random_linear_parallel_layer(): if orientation == "row": - linear = RowParallelLinear(4096, - 4096, - bias=False, - params_dtype=torch.float16) + linear = RowParallelLinear( + 4096, 4096, bias=False, params_dtype=torch.float16 + ) linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = (RowParallelLinearWithLoRA(linear) if not fully_shard - else RowParallelLinearWithShardedLoRA(linear)) + lora_linear = ( + RowParallelLinearWithLoRA(linear) + if not fully_shard + else RowParallelLinearWithShardedLoRA(linear) + ) else: - linear = ColumnParallelLinear(4096, - 4096, - bias=False, - params_dtype=torch.float16) + linear = ColumnParallelLinear( + 4096, 4096, bias=False, params_dtype=torch.float16 + ) linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = (ColumnParallelLinearWithLoRA(linear) - if not fully_shard else - ColumnParallelLinearWithShardedLoRA(linear)) + lora_linear = ( + ColumnParallelLinearWithLoRA(linear) + if not fully_shard + else ColumnParallelLinearWithShardedLoRA(linear) + ) lora_linear.create_lora_weights(max_loras, lora_config) - assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( - lora_linear.lora_b_stacked) == 1) + assert ( + lora_linear.n_slices + == len(lora_linear.lora_a_stacked) + == len(lora_linear.lora_b_stacked) + == 1 + ) if bias_enabled: assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: @@ -802,10 +823,9 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( lora_mapping, id_to_index, @@ -825,10 +845,7 @@ def create_random_linear_parallel_layer(): expected_result = torch.cat(expected_results) rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) # Check that resetting the lora weights succeeds @@ -841,22 +858,19 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) - punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, - 512, lora_config.lora_extra_vocab_size) + punica_wrapper.update_metadata( + lora_mapping, id_to_index, max_loras, 512, 
lora_config.lora_extra_vocab_size + ) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) @torch.inference_mode() @@ -866,9 +880,9 @@ def create_random_linear_parallel_layer(): @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("bias_enabled", [True, False]) -def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device, stage, bias_enabled) -> None: - +def test_column_parallel_packed( + dist_init, num_loras, repeats, fully_shard, device, stage, bias_enabled +) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -876,41 +890,45 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - fully_sharded_loras=fully_shard, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.float16, + bias_enabled=bias_enabled, + ) def create_column_parallel_packed_layer(): if repeats == 2: - linear = MergedColumnParallelLinear(4096, [4096] * repeats, - bias=False, - params_dtype=torch.float16) + linear = MergedColumnParallelLinear( + 4096, [4096] * repeats, bias=False, params_dtype=torch.float16 + ) linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = (MergedColumnParallelLinearWithLoRA(linear) - if not fully_shard else - MergedColumnParallelLinearWithShardedLoRA(linear)) + lora_linear = ( + MergedColumnParallelLinearWithLoRA(linear) + if not fully_shard + else MergedColumnParallelLinearWithShardedLoRA(linear) + ) elif repeats == 3: - linear = QKVParallelLinear(4096, - 64, - 32, - bias=False, - params_dtype=torch.float16) + linear = QKVParallelLinear( + 4096, 64, 32, bias=False, params_dtype=torch.float16 + ) linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = (MergedQKVParallelLinearWithLoRA(linear) - if not fully_shard else - MergedQKVParallelLinearWithShardedLoRA(linear)) + lora_linear = ( + MergedQKVParallelLinearWithLoRA(linear) + if not fully_shard + else MergedQKVParallelLinearWithShardedLoRA(linear) + ) else: - linear = QKVParallelLinear(4096, - 64, - 32, - bias=False, - params_dtype=torch.float16) + linear = QKVParallelLinear( + 4096, 64, 32, bias=False, params_dtype=torch.float16 + ) linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = QKVParallelLinearWithLoRA( - linear - ) if not fully_shard else QKVParallelLinearWithShardedLoRA(linear) + lora_linear = ( + QKVParallelLinearWithLoRA(linear) + if not fully_shard + else QKVParallelLinearWithShardedLoRA(linear) + ) @dataclass class FakeConfig: @@ -919,11 +937,15 @@ class FakeConfig: num_attention_heads = 32 n_slices = repeats - lora_linear.create_lora_weights(max_loras, - lora_config, - model_config=FakeConfig()) - assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( - lora_linear.lora_b_stacked) == n_slices) + lora_linear.create_lora_weights( + max_loras, lora_config, model_config=FakeConfig() + ) + assert ( + lora_linear.n_slices + == 
len(lora_linear.lora_a_stacked) + == len(lora_linear.lora_b_stacked) + == n_slices + ) if bias_enabled: assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: @@ -951,10 +973,9 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( lora_mapping, @@ -971,17 +992,14 @@ class FakeConfig: result = linear(input_)[0] subloras = sublora_dict[lora_id] for i, sublora in enumerate(subloras): - result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * - (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b * - sublora.scaling) + result[ + :, sublora.lora_b.shape[1] * i : sublora.lora_b.shape[1] * (i + 1) + ] += input_ @ sublora.lora_a @ sublora.lora_b * sublora.scaling expected_results.append(result) expected_result = torch.cat(expected_results) rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) for slot_idx in range(max_loras): lora_linear.reset_lora(slot_idx) @@ -992,10 +1010,9 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - device=device) - lora_mapping = LoRAMapping(index_mapping, - prompt_mapping, - is_prefill=stage) + device=device, + ) + lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( lora_mapping, @@ -1009,15 +1026,13 @@ class FakeConfig: expected_result = linear(torch.cat(inputs))[0] rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, - expected_result, - rtol=rtol, - atol=atol) + torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) @pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) @pytest.mark.parametrize( - "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))) + "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)) +) def test_vocab_parallel_embedding_indices(tp_size, seed): random.seed(seed) vocab_size = random.randint(4000, 64000) @@ -1035,20 +1050,24 @@ def test_vocab_parallel_embedding_indices(tp_size, seed): token_ids: list[int] = [] for tp_rank in range(tp_size): - with patch( + with ( + patch( "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank", - return_value=tp_rank - ), patch( + return_value=tp_rank, + ), + patch( "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size", - return_value=tp_size): + return_value=tp_size, + ), + ): vocab_embedding = VocabParallelEmbedding( - vocab_size, 1, org_num_embeddings=org_vocab_size) + vocab_size, 1, org_num_embeddings=org_vocab_size + ) vocab_size_padded = vocab_embedding.num_embeddings_padded shard_indices = vocab_embedding.shard_indices # Assert that the ranges are contiguous assert shard_indices.org_vocab_start_index == last_org_vocab_end_index - assert (shard_indices.added_vocab_start_index == - last_added_vocab_end_index) + assert shard_indices.added_vocab_start_index == last_added_vocab_end_index # Ensure that we are not exceeding the vocab size computed_vocab_size += shard_indices.num_elements_padded @@ -1057,22 +1076,39 @@ def test_vocab_parallel_embedding_indices(tp_size, seed): # Ensure that the ranges are not overlapping 
all_org_tokens.extend( - range(shard_indices.org_vocab_start_index, - shard_indices.org_vocab_end_index)) + range( + shard_indices.org_vocab_start_index, shard_indices.org_vocab_end_index + ) + ) all_added_tokens.extend( - range(shard_indices.added_vocab_start_index, - shard_indices.added_vocab_end_index)) + range( + shard_indices.added_vocab_start_index, + shard_indices.added_vocab_end_index, + ) + ) token_ids.extend( - range(shard_indices.org_vocab_start_index, - shard_indices.org_vocab_end_index)) - token_ids.extend([-1] * (shard_indices.num_org_elements_padded - - shard_indices.num_org_elements)) + range( + shard_indices.org_vocab_start_index, shard_indices.org_vocab_end_index + ) + ) + token_ids.extend( + [-1] + * (shard_indices.num_org_elements_padded - shard_indices.num_org_elements) + ) token_ids.extend( - range(shard_indices.added_vocab_start_index, - shard_indices.added_vocab_end_index)) - token_ids.extend([-1] * (shard_indices.num_added_elements_padded - - shard_indices.num_added_elements)) + range( + shard_indices.added_vocab_start_index, + shard_indices.added_vocab_end_index, + ) + ) + token_ids.extend( + [-1] + * ( + shard_indices.num_added_elements_padded + - shard_indices.num_added_elements + ) + ) last_org_vocab_end_index = shard_indices.org_vocab_end_index last_added_vocab_end_index = shard_indices.added_vocab_end_index @@ -1100,130 +1136,165 @@ def test_get_masked_input_and_mask(): x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) # base tp 1 case, no padding - modified_x, _ = get_masked_input_and_mask(x, - org_vocab_start_index=0, - org_vocab_end_index=8, - added_vocab_start_index=8, - added_vocab_end_index=12, - num_org_vocab_padding=0) + modified_x, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=0, + org_vocab_end_index=8, + added_vocab_start_index=8, + added_vocab_end_index=12, + num_org_vocab_padding=0, + ) assert torch.equal(x, modified_x) # tp 2 case, no padding - modified_x_rank_0, _ = get_masked_input_and_mask(x, - org_vocab_start_index=0, - org_vocab_end_index=4, - added_vocab_start_index=8, - added_vocab_end_index=10, - num_org_vocab_padding=0) + modified_x_rank_0, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=0, + org_vocab_end_index=4, + added_vocab_start_index=8, + added_vocab_end_index=10, + num_org_vocab_padding=0, + ) modified_x_rank_1, _ = get_masked_input_and_mask( x, org_vocab_start_index=4, org_vocab_end_index=8, added_vocab_start_index=10, added_vocab_end_index=12, - num_org_vocab_padding=0) - assert torch.equal(modified_x_rank_0, - torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0])) - assert torch.equal(modified_x_rank_1, - torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5])) + num_org_vocab_padding=0, + ) + assert torch.equal( + modified_x_rank_0, torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0]) + ) + assert torch.equal( + modified_x_rank_1, torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5]) + ) # tp 4 case, no padding - modified_x_rank_0, _ = get_masked_input_and_mask(x, - org_vocab_start_index=0, - org_vocab_end_index=2, - added_vocab_start_index=8, - added_vocab_end_index=9, - num_org_vocab_padding=0) - modified_x_rank_1, _ = get_masked_input_and_mask(x, - org_vocab_start_index=2, - org_vocab_end_index=4, - added_vocab_start_index=9, - added_vocab_end_index=10, - num_org_vocab_padding=0) + modified_x_rank_0, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=0, + org_vocab_end_index=2, + added_vocab_start_index=8, + added_vocab_end_index=9, + num_org_vocab_padding=0, + ) + modified_x_rank_1, _ = 
get_masked_input_and_mask( + x, + org_vocab_start_index=2, + org_vocab_end_index=4, + added_vocab_start_index=9, + added_vocab_end_index=10, + num_org_vocab_padding=0, + ) modified_x_rank_2, _ = get_masked_input_and_mask( x, org_vocab_start_index=4, org_vocab_end_index=6, added_vocab_start_index=10, added_vocab_end_index=11, - num_org_vocab_padding=0) + num_org_vocab_padding=0, + ) modified_x_rank_3, _ = get_masked_input_and_mask( x, org_vocab_start_index=6, org_vocab_end_index=8, added_vocab_start_index=11, added_vocab_end_index=12, - num_org_vocab_padding=0) - assert torch.equal(modified_x_rank_0, - torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0])) - assert torch.equal(modified_x_rank_1, - torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0])) - assert torch.equal(modified_x_rank_2, - torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0])) - assert torch.equal(modified_x_rank_3, - torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2])) + num_org_vocab_padding=0, + ) + assert torch.equal( + modified_x_rank_0, torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]) + ) + assert torch.equal( + modified_x_rank_1, torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0]) + ) + assert torch.equal( + modified_x_rank_2, torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0]) + ) + assert torch.equal( + modified_x_rank_3, torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2]) + ) # base tp 1 case, with padding - modified_x, _ = get_masked_input_and_mask(x, - org_vocab_start_index=0, - org_vocab_end_index=8, - added_vocab_start_index=8, - added_vocab_end_index=12, - num_org_vocab_padding=2) - assert torch.equal(modified_x, - torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13])) + modified_x, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=0, + org_vocab_end_index=8, + added_vocab_start_index=8, + added_vocab_end_index=12, + num_org_vocab_padding=2, + ) + assert torch.equal( + modified_x, torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13]) + ) # tp 2 case, with padding - modified_x_rank_0, _ = get_masked_input_and_mask(x, - org_vocab_start_index=0, - org_vocab_end_index=4, - added_vocab_start_index=8, - added_vocab_end_index=10, - num_org_vocab_padding=2) + modified_x_rank_0, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=0, + org_vocab_end_index=4, + added_vocab_start_index=8, + added_vocab_end_index=10, + num_org_vocab_padding=2, + ) modified_x_rank_1, _ = get_masked_input_and_mask( x, org_vocab_start_index=4, org_vocab_end_index=8, added_vocab_start_index=10, added_vocab_end_index=12, - num_org_vocab_padding=2) - assert torch.equal(modified_x_rank_0, - torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0])) - assert torch.equal(modified_x_rank_1, - torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7])) + num_org_vocab_padding=2, + ) + assert torch.equal( + modified_x_rank_0, torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0]) + ) + assert torch.equal( + modified_x_rank_1, torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7]) + ) # tp 4 case, with padding - modified_x_rank_0, _ = get_masked_input_and_mask(x, - org_vocab_start_index=0, - org_vocab_end_index=2, - added_vocab_start_index=8, - added_vocab_end_index=9, - num_org_vocab_padding=2) - modified_x_rank_1, _ = get_masked_input_and_mask(x, - org_vocab_start_index=2, - org_vocab_end_index=4, - added_vocab_start_index=9, - added_vocab_end_index=10, - num_org_vocab_padding=2) + modified_x_rank_0, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=0, + org_vocab_end_index=2, + added_vocab_start_index=8, + added_vocab_end_index=9, + 
num_org_vocab_padding=2, + ) + modified_x_rank_1, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=2, + org_vocab_end_index=4, + added_vocab_start_index=9, + added_vocab_end_index=10, + num_org_vocab_padding=2, + ) modified_x_rank_2, _ = get_masked_input_and_mask( x, org_vocab_start_index=4, org_vocab_end_index=6, added_vocab_start_index=10, added_vocab_end_index=11, - num_org_vocab_padding=2) + num_org_vocab_padding=2, + ) modified_x_rank_3, _ = get_masked_input_and_mask( x, org_vocab_start_index=6, org_vocab_end_index=8, added_vocab_start_index=11, added_vocab_end_index=12, - num_org_vocab_padding=2) - assert torch.equal(modified_x_rank_0, - torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0])) - assert torch.equal(modified_x_rank_1, - torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0])) - assert torch.equal(modified_x_rank_2, - torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0])) - assert torch.equal(modified_x_rank_3, - torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4])) + num_org_vocab_padding=2, + ) + assert torch.equal( + modified_x_rank_0, torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0]) + ) + assert torch.equal( + modified_x_rank_1, torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0]) + ) + assert torch.equal( + modified_x_rank_2, torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0]) + ) + assert torch.equal( + modified_x_rank_3, torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4]) + ) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index bebf44b6dfd7..d7824178626f 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -27,27 +27,28 @@ " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501 ] -def do_sample(llm: vllm.LLM, - lora_path: str, - lora_id: int, - tensorizer_config_dict: Union[dict, None] = None) -> list[str]: +def do_sample( + llm: vllm.LLM, + lora_path: str, + lora_id: int, + tensorizer_config_dict: Union[dict, None] = None, +) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? 
[/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]", # noqa: E501 ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=256, - skip_special_tokens=False, - stop=["[/assistant]"]) + sampling_params = vllm.SamplingParams( + temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"] + ) if tensorizer_config_dict is not None: outputs = llm.generate( @@ -57,14 +58,19 @@ def do_sample(llm: vllm.LLM, str(lora_id), lora_id, lora_path, - tensorizer_config_dict=tensorizer_config_dict) - if lora_id else None) + tensorizer_config_dict=tensorizer_config_dict, + ) + if lora_id + else None, + ) else: outputs = llm.generate( prompts, sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + if lora_id + else None, + ) # Print the outputs. 
generated_texts: list[str] = [] for output in outputs: @@ -75,53 +81,72 @@ def do_sample(llm: vllm.LLM, return generated_texts -def generate_and_test(llm, - sql_lora_files, - tensorizer_config_dict: Union[dict, None] = None): +def generate_and_test( + llm, sql_lora_files, tensorizer_config_dict: Union[dict, None] = None +): print("lora adapter created") - assert do_sample(llm, - sql_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=0) == EXPECTED_NO_LORA_OUTPUT + assert ( + do_sample( + llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0, + ) + == EXPECTED_NO_LORA_OUTPUT + ) print("lora 1") - assert do_sample(llm, - sql_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=1) == EXPECTED_LORA_OUTPUT + assert ( + do_sample( + llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=1, + ) + == EXPECTED_LORA_OUTPUT + ) print("no lora") - assert do_sample(llm, - sql_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=0) == EXPECTED_NO_LORA_OUTPUT + assert ( + do_sample( + llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0, + ) + == EXPECTED_NO_LORA_OUTPUT + ) print("lora 2") - assert do_sample(llm, - sql_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=2) == EXPECTED_LORA_OUTPUT + assert ( + do_sample( + llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=2, + ) + == EXPECTED_LORA_OUTPUT + ) print("removing lora") @create_new_process_for_each_test() def test_llama_lora(sql_lora_files): - llm = vllm.LLM( MODEL_PATH, enable_lora=True, # also test odd max_num_seqs max_num_seqs=13, max_loras=4, - enable_chunked_prefill=True) + enable_chunked_prefill=True, + ) generate_and_test(llm, sql_lora_files) @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_llama_lora_tp4(sql_lora_files): - llm = vllm.LLM( MODEL_PATH, enable_lora=True, @@ -136,7 +161,6 @@ def test_llama_lora_tp4(sql_lora_files): @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): - llm = vllm.LLM( MODEL_PATH, enable_lora=True, @@ -151,9 +175,9 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): @multi_gpu_test(num_gpus=2) @create_new_process_for_each_test() -def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, - sql_lora_huggingface_id): - +def test_tp2_serialize_and_deserialize_lora( + tmp_path, sql_lora_files, sql_lora_huggingface_id +): # Run the tensorizing of the LoRA adapter and the model in a subprocess # to guarantee cleanup @@ -164,17 +188,28 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, lora_path = sql_lora_huggingface_id suffix = "test" try: - result = subprocess.run([ - sys.executable, - f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model", - MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size", - str(tp_size), "serialize", "--serialized-directory", - str(tmp_path), "--suffix", suffix, "--serialization-kwargs", - '{"limit_cpu_concurrency": 4}' - ], - check=True, - capture_output=True, - text=True) + result = subprocess.run( + [ + sys.executable, + f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", + "--model", + MODEL_PATH, + "--lora-path", + lora_path, + "--tensor-parallel-size", + str(tp_size), + "serialize", + "--serialized-directory", + str(tmp_path), + "--suffix", + suffix, + "--serialization-kwargs", + '{"limit_cpu_concurrency": 4}', + ], + 
check=True, + capture_output=True, + text=True, + ) except subprocess.CalledProcessError as e: print("Tensorizing failed.") print("STDOUT:\n", e.stdout) @@ -186,25 +221,37 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, model_uri = tmp_path / "vllm" / model_ref / suffix / model_name tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) - loaded_vllm_model = LLM(model=model_ref, - load_format="tensorizer", - enable_lora=True, - enforce_eager=True, - model_loader_extra_config=tensorizer_config, - max_num_seqs=13, - tensor_parallel_size=2, - max_loras=2) + loaded_vllm_model = LLM( + model=model_ref, + load_format="tensorizer", + enable_lora=True, + enforce_eager=True, + model_loader_extra_config=tensorizer_config, + max_num_seqs=13, + tensor_parallel_size=2, + max_loras=2, + ) tc_as_dict = tensorizer_config.to_serializable() print("lora adapter created") - assert do_sample(loaded_vllm_model, - sql_lora_files, - tensorizer_config_dict=tc_as_dict, - lora_id=0) == EXPECTED_NO_LORA_OUTPUT + assert ( + do_sample( + loaded_vllm_model, + sql_lora_files, + tensorizer_config_dict=tc_as_dict, + lora_id=0, + ) + == EXPECTED_NO_LORA_OUTPUT + ) print("lora 1") - assert do_sample(loaded_vllm_model, - sql_lora_files, - tensorizer_config_dict=tc_as_dict, - lora_id=1) == EXPECTED_LORA_OUTPUT + assert ( + do_sample( + loaded_vllm_model, + sql_lora_files, + tensorizer_config_dict=tc_as_dict, + lora_id=1, + ) + == EXPECTED_LORA_OUTPUT + ) diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py index 01bc102bd112..ed23d8278488 100644 --- a/tests/lora/test_lora_allowed_token_ids.py +++ b/tests/lora/test_lora_allowed_token_ids.py @@ -3,16 +3,16 @@ import pytest -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - VllmConfig) +from vllm.config import CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, VllmConfig from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.v1.engine.processor import Processor -def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id, - sql_lora_files): +def test_allowed_token_ids_with_lora_vocab( + llama_2_7b_base_huggingface_id, sql_lora_files +): """ Test that we properly resolve the range of allowed token ids for lora adapters that define additional tokens. 
@@ -36,7 +36,8 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id, tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) + lora_config=vllm_config.lora_config, + ) processor = Processor(vllm_config, tokenizer) lora_request = LoRARequest("1", 1, str(sql_lora_files)) @@ -49,7 +50,8 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id, request_id, prompt, params=SamplingParams(allowed_token_ids=lora_token_ids), - lora_request=lora_request) + lora_request=lora_request, + ) # tokens in the base model should not raise an error base_token_ids = [1000, 1001, 1002, 1003] @@ -57,7 +59,8 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id, request_id, prompt, params=SamplingParams(allowed_token_ids=base_token_ids), - lora_request=lora_request) + lora_request=lora_request, + ) # tokens not in the lora adapter should raise an error invalid_token_ids = [35000, 35001, 35002, 35003] @@ -66,7 +69,8 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id, request_id, prompt, params=SamplingParams(allowed_token_ids=invalid_token_ids), - lora_request=lora_request) + lora_request=lora_request, + ) # tokens in the lora adapter with no lora request should raise an error with pytest.raises(ValueError): @@ -78,7 +82,8 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id, def test_allowed_token_ids_with_lora_adapter_no_vocab( - qwen25vl_base_huggingface_id, qwen25vl_lora_files): + qwen25vl_base_huggingface_id, qwen25vl_lora_files +): """ Test that we properly resolve the range of allowed token ids for lora adapters that do not define additional tokens. 
@@ -102,7 +107,8 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab( tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) + lora_config=vllm_config.lora_config, + ) processor = Processor(vllm_config, tokenizer) lora_request = LoRARequest("1", 1, str(qwen25vl_lora_files)) @@ -115,7 +121,8 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab( request_id, prompt, params=SamplingParams(allowed_token_ids=base_token_ids), - lora_request=lora_request) + lora_request=lora_request, + ) # tokens in the base model with no lora request should not raise an error base_token_ids = [1000, 1001, 1002, 1003] @@ -132,4 +139,5 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab( request_id, prompt, params=SamplingParams(allowed_token_ids=invalid_token_ids), - lora_request=lora_request) + lora_request=lora_request, + ) diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index ebc0f26378d2..2219d470e91a 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -8,9 +8,7 @@ from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.utils import WeightsMapper -lora_lst = [ - "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b" -] +lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"] BAICHUAN_LORA_MODULES = [ "W_pack", "o_proj", @@ -37,8 +35,9 @@ def test_load_checkpoints( else: expected_lora_modules.append(module) if lora_name == "baichuan7B": - peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, - max_position_embeddings=4096) + peft_helper = PEFTHelper.from_local_dir( + baichuan_lora_files, max_position_embeddings=4096 + ) # For the baichuan7B model, load it's LoRA, # and the test should pass. LoRAModel.from_local_checkpoint( @@ -48,13 +47,15 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules) + embedding_padding_modules=embed_padding_modules, + ) elif lora_name == "baichuan7B-zero": # Test that the target_modules contain prefix # such as "model.layers.0.self_atten.W_pack", and # the test should pass. - peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files, - max_position_embeddings=4096) + peft_helper = PEFTHelper.from_local_dir( + baichuan_zero_lora_files, max_position_embeddings=4096 + ) LoRAModel.from_local_checkpoint( baichuan_zero_lora_files, expected_lora_modules, @@ -62,12 +63,14 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules) + embedding_padding_modules=embed_padding_modules, + ) elif lora_name == "baichuan7B-zero-regex": # Test that the `target_modules` in the form of regular expressions, # such as `model\\..*(W_pack|o_proj)`, and the test should pass. 
- peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files, - max_position_embeddings=4096) + peft_helper = PEFTHelper.from_local_dir( + baichuan_regex_lora_files, max_position_embeddings=4096 + ) LoRAModel.from_local_checkpoint( baichuan_regex_lora_files, expected_lora_modules, @@ -75,13 +78,15 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules) + embedding_padding_modules=embed_padding_modules, + ) else: # For the baichuan7B model, load chatglm3-6b's LoRA, # and the test should raise the following error. expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501 - peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files, - max_position_embeddings=4096) + peft_helper = PEFTHelper.from_local_dir( + chatglm3_lora_files, max_position_embeddings=4096 + ) with pytest.raises(ValueError, match=expected_error): LoRAModel.from_local_checkpoint( chatglm3_lora_files, @@ -90,11 +95,11 @@ def test_load_checkpoints( lora_model_id=1, device="cpu", embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules) + embedding_padding_modules=embed_padding_modules, + ) def test_lora_weights_mapping(baichuan_lora_files): - packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules @@ -113,8 +118,9 @@ def test_lora_weights_mapping(baichuan_lora_files): ".layers.": ".baichuan_layers.", }, ) - peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, - max_position_embeddings=4096) + peft_helper = PEFTHelper.from_local_dir( + baichuan_lora_files, max_position_embeddings=4096 + ) lora_model = LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index 50c60341f0d8..bc90a88dc226 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -3,12 +3,14 @@ """ Script to test add_lora, remove_lora, pin_lora, list_loras functions. """ + import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) + build_async_engine_client_from_engine_args, +) from vllm.lora.request import LoRARequest MODEL_PATH = "meta-llama/Llama-2-7b-hf" @@ -17,23 +19,24 @@ def make_lora_request(lora_id: int): - return LoRARequest(lora_name=f"{lora_id}", - lora_int_id=lora_id, - lora_path=LORA_MODULE_PATH) + return LoRARequest( + lora_name=f"{lora_id}", lora_int_id=lora_id, lora_path=LORA_MODULE_PATH + ) def test_lora_functions_sync(): - max_loras = 4 # Create engine in eager-mode. Due to high max_loras, the CI can # OOM during cuda-graph capture. 
- engine_args = EngineArgs(model=MODEL_PATH, - enable_lora=True, - max_loras=max_loras, - max_lora_rank=LORA_RANK, - max_model_len=128, - gpu_memory_utilization=0.8, - enforce_eager=True) + engine_args = EngineArgs( + model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) llm = LLMEngine.from_engine_args(engine_args) @@ -70,15 +73,16 @@ def run_check(fn, args, expected: list): @pytest.mark.asyncio async def test_lora_functions_async(): - max_loras = 4 - engine_args = AsyncEngineArgs(model=MODEL_PATH, - enable_lora=True, - max_loras=max_loras, - max_lora_rank=LORA_RANK, - max_model_len=128, - gpu_memory_utilization=0.8, - enforce_eager=True) + engine_args = AsyncEngineArgs( + model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) async def run_check(fn, args, expected: list): await fn(args) diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index b46d81f1651a..7d20faef541a 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -11,8 +11,12 @@ # Provide absolute path and huggingface lora ids lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"] LLAMA_LORA_MODULES = [ - "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens", - "lm_head" + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + "embed_tokens", + "lm_head", ] @@ -40,7 +44,8 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_model_id=1, device="cpu", embedding_modules=embedding_modules, - embedding_padding_modules=embed_padding_modules) + embedding_padding_modules=embed_padding_modules, + ) # Assertions to ensure the model is loaded correctly assert lora_model is not None, "LoRAModel is not loaded correctly" diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 8f8a27006cf6..77317d63dc21 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -9,16 +9,21 @@ from torch import nn from vllm.config import LoRAConfig -from vllm.lora.layers import (ColumnParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA, - RowParallelLinearWithLoRA) +from vllm.lora.layers import ( + ColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithLoRA, + RowParallelLinearWithLoRA, +) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, - LRUCacheLoRAModelManager) +from vllm.lora.models import ( + LoRAMapping, + LoRAModel, + LoRAModelManager, + LRUCacheLoRAModelManager, +) from vllm.lora.peft_helper import PEFTHelper from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, - WorkerLoRAManager) +from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager, WorkerLoRAManager from vllm.platforms import current_platform EMBEDDING_MODULES = { @@ -28,9 +33,11 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] -DEVICES = ([ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] if current_platform.is_cuda_alike() else ["cpu"]) +DEVICES = ( + [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + if current_platform.is_cuda_alike() + else ["cpu"] +) DEFAULT_DTYPE = torch.get_default_dtype() @@ -42,19 +49,20 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch): LoRAModelManager it 
is okay to just test V0. """ with monkeypatch.context() as m: - m.setenv('VLLM_USE_V1', '0') + m.setenv("VLLM_USE_V1", "0") yield @pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): - tensors = load_file( - os.path.join(sql_lora_files, "adapter_model.safetensors")) + tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( - os.path.join(sql_lora_files, "new_embeddings.safetensors")) + os.path.join(sql_lora_files, "new_embeddings.safetensors") + ) - peft_helper = PEFTHelper.from_local_dir(sql_lora_files, - max_position_embeddings=4096) + peft_helper = PEFTHelper.from_local_dir( + sql_lora_files, max_position_embeddings=4096 + ) lora_model = LoRAModel.from_lora_tensors( 1, tensors, @@ -62,7 +70,8 @@ def test_from_lora_tensors(sql_lora_files, device): device=device, embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, - embedding_padding_modules=EMBEDDING_PADDING_MODULES) + embedding_padding_modules=EMBEDDING_PADDING_MODULES, + ) for module_name, lora in lora_model.loras.items(): assert lora.module_name == module_name assert lora.rank == 8 @@ -71,22 +80,27 @@ def test_from_lora_tensors(sql_lora_files, device): assert lora.lora_b is not None assert lora.lora_a.device == torch.device(device) assert lora.lora_b.device == torch.device(device) - assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] - ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" + assert lora.lora_a.shape[1] == lora.lora_b.shape[0], ( + f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" + ) assert lora.lora_a.shape[1] == 8 embeddings_module = next( - (k for k in EMBEDDING_MODULES if k in module_name), None) + (k for k in EMBEDDING_MODULES if k in module_name), None + ) if embeddings_module: assert torch.equal( lora.embeddings_tensor, new_embeddings[EMBEDDING_MODULES[embeddings_module]].to( - device=lora.embeddings_tensor.device)) + device=lora.embeddings_tensor.device + ), + ) else: assert lora.embeddings_tensor is None -def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str], - device: torch.device) -> LoRAModel: +def create_lora( + lora_id: int, model: nn.Module, sub_modules: list[str], device: torch.device +) -> LoRAModel: loras: dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight @@ -118,8 +132,7 @@ def create_packed_lora( 8, 16, torch.rand([w.shape[1], 8], device=device), - torch.rand([8, w.shape[0] // len(replaced_module_names)], - device=device), + torch.rand([8, w.shape[0] // len(replaced_module_names)], device=device), ) return LoRAModel(lora_id, 8, loras) @@ -127,42 +140,42 @@ def create_packed_lora( def test_replace_submodules(dist_init, dummy_model): model = dummy_model manager = LoRAModelManager( - model, 1, 1, 1, - LoRAConfig(max_lora_rank=8, - max_cpu_loras=8, - max_loras=8, - lora_dtype=DEFAULT_DTYPE), torch.device(DEVICES[0])) + model, + 1, + 1, + 1, + LoRAConfig( + max_lora_rank=8, max_cpu_loras=8, max_loras=8, lora_dtype=DEFAULT_DTYPE + ), + torch.device(DEVICES[0]), + ) model = manager.model - assert isinstance(model.get_submodule("dense1"), - ColumnParallelLinearWithLoRA) - assert isinstance(model.get_submodule("layer1.dense1"), - ColumnParallelLinearWithLoRA) + assert isinstance(model.get_submodule("dense1"), ColumnParallelLinearWithLoRA) + assert isinstance( + model.get_submodule("layer1.dense1"), ColumnParallelLinearWithLoRA + ) assert isinstance(model.get_submodule("dense2"), RowParallelLinearWithLoRA) - assert 
isinstance(model.get_submodule("layer1.dense2"), - RowParallelLinearWithLoRA) + assert isinstance(model.get_submodule("layer1.dense2"), RowParallelLinearWithLoRA) @pytest.mark.parametrize("device", DEVICES) def test_lora_model_manager(dist_init, dummy_model, device): model = dummy_model - model_lora1 = create_lora(1, - model, ["layer1.dense1", "dense2", "lm_head"], - device=device) - model_lora2 = create_lora(2, - model, ["dense1", "dense2", "lm_head"], - device=device) - model_lora3 = create_lora(3, - model, ["dense1", "dense2", "lm_head"], - device=device) - manager = LoRAModelManager(model, - 2, - 2, - 2, - LoRAConfig(max_lora_rank=8, - max_cpu_loras=3, - max_loras=2, - lora_dtype=DEFAULT_DTYPE), - device=device) + model_lora1 = create_lora( + 1, model, ["layer1.dense1", "dense2", "lm_head"], device=device + ) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device) + manager = LoRAModelManager( + model, + 2, + 2, + 2, + LoRAConfig( + max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE + ), + device=device, + ) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -212,24 +225,21 @@ def test_lora_model_manager(dist_init, dummy_model, device): @pytest.mark.parametrize("device", DEVICES) def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): model = dummy_model - model_lora1 = create_lora(1, - model, ["layer1.dense1", "dense2", "lm_head"], - device=device) - model_lora2 = create_lora(2, - model, ["dense1", "dense2", "lm_head"], - device=device) - model_lora3 = create_lora(3, - model, ["dense1", "dense2", "lm_head"], - device=device) - manager = LRUCacheLoRAModelManager(model, - 2, - 2, - 2, - LoRAConfig(max_lora_rank=8, - max_cpu_loras=3, - max_loras=2, - lora_dtype=DEFAULT_DTYPE), - device=device) + model_lora1 = create_lora( + 1, model, ["layer1.dense1", "dense2", "lm_head"], device=device + ) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device) + manager = LRUCacheLoRAModelManager( + model, + 2, + 2, + 2, + LoRAConfig( + max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE + ), + device=device, + ) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -305,27 +315,22 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager model = dummy_model - model_lora1 = create_lora(1, - model, ["layer1.dense1", "dense2", "lm_head"], - device=device) - model_lora2 = create_lora(2, - model, ["dense1", "dense2", "lm_head"], - device=device) - model_lora3 = create_lora(3, - model, ["dense1", "dense2", "lm_head"], - device=device) - model_lora4 = create_lora(4, - model, ["dense1", "dense2", "lm_head"], - device=device) - manager = LRUCacheLoRAModelManager(model, - 2, - 2, - 2, - LoRAConfig(max_lora_rank=8, - max_cpu_loras=2, - max_loras=2, - lora_dtype=DEFAULT_DTYPE), - device=device) + model_lora1 = create_lora( + 1, model, ["layer1.dense1", "dense2", "lm_head"], device=device + ) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device) 
+ model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"], device=device) + manager = LRUCacheLoRAModelManager( + model, + 2, + 2, + 2, + LoRAConfig( + max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE + ), + device=device, + ) assert all(x is None for x in manager.lora_index_to_id) @@ -430,66 +435,83 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): @pytest.mark.parametrize("device", DEVICES) -def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files, device): - lora_config = LoRAConfig(max_lora_rank=8, - max_cpu_loras=4, - max_loras=4, - lora_dtype=DEFAULT_DTYPE) +def test_lru_cache_worker_adapter_manager( + llama_2_7b_model_extra_embeddings, sql_lora_files, device +): + lora_config = LoRAConfig( + max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE + ) worker_adapter_manager = LRUCacheWorkerLoRAManager( - 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, device, - EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) - worker_adapter_manager.create_lora_manager( - llama_2_7b_model_extra_embeddings) + 4, + 2, + llama_2_7b_model_extra_embeddings.unpadded_vocab_size + - lora_config.lora_extra_vocab_size, + lora_config, + device, + EMBEDDING_MODULES, + EMBEDDING_PADDING_MODULES, + ) + worker_adapter_manager.create_lora_manager(llama_2_7b_model_extra_embeddings) mapping = LoRAMapping([], []) - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [LoRARequest("1", 1, sql_lora_files), LoRARequest("2", 2, sql_lora_files)], + mapping, + ) assert worker_adapter_manager.list_adapters() == {1, 2} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("3", 3, sql_lora_files), - LoRARequest("4", 4, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("3", 3, sql_lora_files), + LoRARequest("4", 4, sql_lora_files), + ], + mapping, + ) assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3 assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files), - LoRARequest("5", 5, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files), + LoRARequest("5", 5, sql_lora_files), + ], + mapping, + ) assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files) - ], mapping) 
+ worker_adapter_manager.set_active_adapters( + [ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + ], + mapping, + ) assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 - worker_adapter_manager.set_active_adapters([ - LoRARequest("6", 6, sql_lora_files), - LoRARequest("7", 7, sql_lora_files), - LoRARequest("8", 8, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [ + LoRARequest("6", 6, sql_lora_files), + LoRARequest("7", 7, sql_lora_files), + LoRARequest("8", 8, sql_lora_files), + ], + mapping, + ) assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7 @@ -498,78 +520,97 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, # Over capacity with pytest.raises(RuntimeError): - worker_adapter_manager.set_active_adapters([ - LoRARequest("10", 10, sql_lora_files), - LoRARequest("11", 11, sql_lora_files), - LoRARequest("12", 12, sql_lora_files), - LoRARequest("13", 13, sql_lora_files), - LoRARequest("14", 14, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [ + LoRARequest("10", 10, sql_lora_files), + LoRARequest("11", 11, sql_lora_files), + LoRARequest("12", 12, sql_lora_files), + LoRARequest("13", 13, sql_lora_files), + LoRARequest("14", 14, sql_lora_files), + ], + mapping, + ) assert worker_adapter_manager.device == device - assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == - device) + assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device @pytest.mark.parametrize("device", DEVICES) -def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files, device): +def test_worker_adapter_manager( + llama_2_7b_model_extra_embeddings, sql_lora_files, device +): # Should remove every LoRA not specified in the request. 
- lora_config = LoRAConfig(max_lora_rank=8, - max_cpu_loras=4, - max_loras=4, - lora_dtype=DEFAULT_DTYPE) + lora_config = LoRAConfig( + max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE + ) worker_adapter_manager = WorkerLoRAManager( - 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, device, - EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) - worker_adapter_manager.create_lora_manager( - llama_2_7b_model_extra_embeddings) + 4, + 2, + llama_2_7b_model_extra_embeddings.unpadded_vocab_size + - lora_config.lora_extra_vocab_size, + lora_config, + device, + EMBEDDING_MODULES, + EMBEDDING_PADDING_MODULES, + ) + worker_adapter_manager.create_lora_manager(llama_2_7b_model_extra_embeddings) mapping = LoRAMapping([], []) - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [LoRARequest("1", 1, sql_lora_files), LoRARequest("2", 2, sql_lora_files)], + mapping, + ) assert worker_adapter_manager.list_adapters() == {1, 2} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("3", 3, sql_lora_files), - LoRARequest("4", 4, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("3", 3, sql_lora_files), + LoRARequest("4", 4, sql_lora_files), + ], + mapping, + ) assert worker_adapter_manager.list_adapters() == {1, 3, 4} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3 assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4 - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files), - LoRARequest("5", 5, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files), + LoRARequest("5", 5, sql_lora_files), + ], + mapping, + ) assert worker_adapter_manager.list_adapters() == {1, 2, 5} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + ], + mapping, + ) assert worker_adapter_manager.list_adapters() == {1} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None - worker_adapter_manager.set_active_adapters([ - LoRARequest("6", 6, sql_lora_files), - LoRARequest("7", 7, sql_lora_files), - LoRARequest("8", 8, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [ + LoRARequest("6", 6, sql_lora_files), + LoRARequest("7", 7, sql_lora_files), + LoRARequest("8", 8, sql_lora_files), + ], + 
mapping, + ) assert worker_adapter_manager.list_adapters() == {6, 7, 8} assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8 assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6 @@ -577,17 +618,19 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, # Over capacity with pytest.raises(RuntimeError): - worker_adapter_manager.set_active_adapters([ - LoRARequest("10", 10, sql_lora_files), - LoRARequest("11", 11, sql_lora_files), - LoRARequest("12", 12, sql_lora_files), - LoRARequest("13", 13, sql_lora_files), - LoRARequest("14", 14, sql_lora_files) - ], mapping) + worker_adapter_manager.set_active_adapters( + [ + LoRARequest("10", 10, sql_lora_files), + LoRARequest("11", 11, sql_lora_files), + LoRARequest("12", 12, sql_lora_files), + LoRARequest("13", 13, sql_lora_files), + LoRARequest("14", 14, sql_lora_files), + ], + mapping, + ) assert worker_adapter_manager.device == device - assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == - device) + assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device @pytest.mark.parametrize("device", DEVICES) @@ -598,7 +641,8 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device): model, module_name="gate_up_proj", replaced_module_names=["gate_proj", "up_proj"], - device=device) + device=device, + ) model_lora1 = create_packed_lora( 2, model, @@ -608,19 +652,21 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device): empty_replaced_module_name="gate_proj", ) - manager = LoRAModelManager(model, - 2, - 2, - 2, - LoRAConfig(max_lora_rank=8, - max_cpu_loras=2, - max_loras=2, - lora_dtype=DEFAULT_DTYPE), - device=device) + manager = LoRAModelManager( + model, + 2, + 2, + 2, + LoRAConfig( + max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE + ), + device=device, + ) model = manager.model - assert isinstance(model.get_submodule("gate_up_proj"), - MergedColumnParallelLinearWithLoRA) + assert isinstance( + model.get_submodule("gate_up_proj"), MergedColumnParallelLinearWithLoRA + ) # Verify packed lora is correct model_lora_clone = model_lora.clone(1) model_lora_clone1 = model_lora1.clone(1) @@ -633,21 +679,27 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device): packed_lora = model_lora.get_lora("gate_up_proj") assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights) - torch.testing.assert_close(packed_lora.lora_a[0], - model_lora_clone.get_lora("gate_proj").lora_a) - torch.testing.assert_close(packed_lora.lora_b[0], - model_lora_clone.get_lora("gate_proj").lora_b) - torch.testing.assert_close(packed_lora.lora_a[1], - model_lora_clone.get_lora("up_proj").lora_a) - torch.testing.assert_close(packed_lora.lora_b[1], - model_lora_clone.get_lora("up_proj").lora_b) + torch.testing.assert_close( + packed_lora.lora_a[0], model_lora_clone.get_lora("gate_proj").lora_a + ) + torch.testing.assert_close( + packed_lora.lora_b[0], model_lora_clone.get_lora("gate_proj").lora_b + ) + torch.testing.assert_close( + packed_lora.lora_a[1], model_lora_clone.get_lora("up_proj").lora_a + ) + torch.testing.assert_close( + packed_lora.lora_b[1], model_lora_clone.get_lora("up_proj").lora_b + ) packed_lora1 = model_lora1.get_lora("gate_up_proj") assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights) assert packed_lora1.lora_a[0] is None assert packed_lora1.lora_b[0] is None - torch.testing.assert_close(packed_lora1.lora_a[1], - model_lora_clone1.get_lora("up_proj").lora_a) - 
torch.testing.assert_close(packed_lora1.lora_b[1], - model_lora_clone1.get_lora("up_proj").lora_b) + torch.testing.assert_close( + packed_lora1.lora_a[1], model_lora_clone1.get_lora("up_proj").lora_a + ) + torch.testing.assert_close( + packed_lora1.lora_b[1], model_lora_clone1.get_lora("up_proj").lora_b + ) diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 99fe951bbf07..ce98fe2f8613 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -15,7 +15,8 @@ PROMPT_TEMPLATE = ( "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" "(./)\nWhat is in the image?<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n") + "<|start_header_id|>assistant<|end_header_id|>\n\n" +) IMAGE_ASSETS = [ ImageAsset("stop_sign"), @@ -34,18 +35,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: stop_token_ids=[128001, 128009], # eos_id, eot_id ) - inputs = [{ - "prompt": PROMPT_TEMPLATE, - "multi_modal_data": { - "image": asset.pil_image - }, - } for asset in IMAGE_ASSETS] + inputs = [ + { + "prompt": PROMPT_TEMPLATE, + "multi_modal_data": {"image": asset.pil_image}, + } + for asset in IMAGE_ASSETS + ] outputs = llm.generate( inputs, sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, ) # Print the outputs. generated_texts: list[str] = [] @@ -58,7 +59,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: @pytest.mark.xfail( current_platform.is_rocm(), - reason="MiniCPM-V dependency xformers incompatible with ROCm") + reason="MiniCPM-V dependency xformers incompatible with ROCm", +) def test_minicpmv_lora(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -68,10 +70,7 @@ def test_minicpmv_lora(minicpmv_lora_files): max_lora_rank=8, enforce_eager=True, max_model_len=2048, - limit_mm_per_prompt={ - "image": 2, - "video": 0 - }, + limit_mm_per_prompt={"image": 2, "video": 0}, trust_remote_code=True, ) output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) @@ -82,11 +81,13 @@ def test_minicpmv_lora(minicpmv_lora_files): assert EXPECTED_OUTPUT[i].startswith(output2[i]) -@pytest.mark.skipif(current_platform.is_cuda_alike(), - reason="Skipping to avoid redundant model tests") +@pytest.mark.skipif( + current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests" +) @pytest.mark.xfail( current_platform.is_rocm(), - reason="MiniCPM-V dependency xformers incompatible with ROCm") + reason="MiniCPM-V dependency xformers incompatible with ROCm", +) @create_new_process_for_each_test() def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( @@ -96,10 +97,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=4, - limit_mm_per_prompt={ - "image": 2, - "video": 0 - }, + limit_mm_per_prompt={"image": 2, "video": 0}, trust_remote_code=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) @@ -107,11 +105,13 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) -@pytest.mark.skipif(current_platform.is_cuda_alike(), - reason="Skipping to avoid redundant model tests") +@pytest.mark.skipif( + current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests" +) @pytest.mark.xfail( current_platform.is_rocm(), - reason="MiniCPM-V dependency xformers incompatible with 
ROCm") + reason="MiniCPM-V dependency xformers incompatible with ROCm", +) @create_new_process_for_each_test() def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( @@ -122,10 +122,7 @@ def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): max_lora_rank=8, tensor_parallel_size=4, trust_remote_code=True, - limit_mm_per_prompt={ - "image": 1, - "video": 0 - }, + limit_mm_per_prompt={"image": 1, "video": 0}, fully_sharded_loras=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 0ea07793311c..f80b496d1b34 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -11,15 +11,15 @@ MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, - prompts: list[str]) -> list[str]: - +def do_sample( + llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str] +) -> list[str]: sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, + ) # Print the outputs. generated_texts: list[str] = [] for output in outputs: @@ -33,8 +33,11 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, @pytest.mark.parametrize("tp_size", [4]) def test_mixtral_lora(mixtral_lora_files, tp_size): """Original test, the LoRA model has the common target modules, not all""" - if torch.cuda.device_count( - ) < tp_size and tp_size > 1 and current_platform.is_cuda_alike(): + if ( + torch.cuda.device_count() < tp_size + and tp_size > 1 + and current_platform.is_cuda_alike() + ): pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") prompts = [ @@ -58,7 +61,11 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): "give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])", # noqa: E501 "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501 ] - assert do_sample(llm, mixtral_lora_files, lora_id=1, - prompts=prompts) == expected_lora_output - assert do_sample(llm, mixtral_lora_files, lora_id=2, - prompts=prompts) == expected_lora_output + assert ( + do_sample(llm, mixtral_lora_files, lora_id=1, prompts=prompts) + == expected_lora_output + ) + assert ( + do_sample(llm, mixtral_lora_files, lora_id=2, prompts=prompts) + == expected_lora_output + ) diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py index f16589e06b2d..d05c1bd22c9f 100644 --- a/tests/lora/test_peft_helper.py +++ b/tests/lora/test_peft_helper.py @@ -13,34 +13,27 @@ ERROR_CASES = [ ( "test_rank", - { - "r": 1024 - }, + {"r": 1024}, "is greater than max_lora_rank", ), ( "test_bias", - { - "bias": "all" - }, + {"bias": "all"}, "Adapter bias cannot be used without bias_enabled", ), - ("test_dora", { - "use_dora": True - }, "does not yet support DoRA"), + ("test_dora", {"use_dora": True}, "does not yet support DoRA"), ( "test_modules_to_save", - { - "modules_to_save": ["lm_head"] - }, + {"modules_to_save": ["lm_head"]}, "only supports modules_to_save being None", ), ] def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): - peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1, - 
max_position_embeddings=4096) + peft_helper = PEFTHelper.from_local_dir( + long_context_lora_files_16k_1, max_position_embeddings=4096 + ) lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) peft_helper.validate_legal(lora_config) assert peft_helper.r == 8 @@ -59,8 +52,8 @@ def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): assert peft_helper.context_length == 16384 assert peft_helper.vllm_max_position_embeddings == 4096 assert peft_helper.vllm_long_context_scaling_factor == float( - math.ceil(peft_helper.context_length / - peft_helper.vllm_max_position_embeddings)) + math.ceil(peft_helper.context_length / peft_helper.vllm_max_position_embeddings) + ) # test RSLoRA rslora_config = dict(use_rslora=True) test_dir = tmp_path / "test_rslora" @@ -77,8 +70,7 @@ def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): with open(config_path, "w") as f: json.dump(adapter_config, f) - peft_helper = PEFTHelper.from_local_dir(test_dir, - max_position_embeddings=4096) + peft_helper = PEFTHelper.from_local_dir(test_dir, max_position_embeddings=4096) peft_helper.validate_legal(lora_config) scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 @@ -109,4 +101,5 @@ def test_peft_helper_error( # Test loading the adapter with pytest.raises(ValueError, match=expected_error): PEFTHelper.from_local_dir( - test_dir, max_position_embeddings=4096).validate_legal(lora_config) + test_dir, max_position_embeddings=4096 + ).validate_legal(lora_config) diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 3090941e6367..ebc027aab384 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -12,30 +12,23 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format( - sql_prompt= - "Which catalog publisher has published the most catalogs?", - context="CREATE TABLE catalogs (catalog_publisher VARCHAR);"), + sql_prompt="Which catalog publisher has published the most catalogs?", + context="CREATE TABLE catalogs (catalog_publisher VARCHAR);", + ), PROMPT_TEMPLATE.format( - sql_prompt= - "Which trip started from the station with the largest dock count? Give me the trip id.", # noqa: E501 - context= - "CREATE TABLE trip (id VARCHAR, start_station_id VARCHAR); CREATE TABLE station (id VARCHAR, dock_count VARCHAR);" # noqa: E501 + sql_prompt="Which trip started from the station with the largest dock count? Give me the trip id.", # noqa: E501 + context="CREATE TABLE trip (id VARCHAR, start_station_id VARCHAR); CREATE TABLE station (id VARCHAR, dock_count VARCHAR);", # noqa: E501 ), PROMPT_TEMPLATE.format( - sql_prompt= - "How many marine species are found in the Southern Ocean?", # noqa: E501 - context= - "CREATE TABLE marine_species (name VARCHAR(50), common_name VARCHAR(50), location VARCHAR(50));" # noqa: E501 + sql_prompt="How many marine species are found in the Southern Ocean?", # noqa: E501 + context="CREATE TABLE marine_species (name VARCHAR(50), common_name VARCHAR(50), location VARCHAR(50));", # noqa: E501 ), ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=64, - stop="### End") + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64, stop="### End") outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, ) # Print the outputs. 
generated_texts: list[str] = [] @@ -50,12 +43,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def test_phi2_lora(phi2_lora_files): # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, # Otherwise, the lora-test will fail due to CUDA OOM. - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=2, - enforce_eager=True, - enable_chunked_prefill=True) + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=2, + enforce_eager=True, + enable_chunked_prefill=True, + ) expected_lora_output = [ "SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;", # noqa: E501 diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 14fa79ae5b44..e4df9751077d 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -21,11 +21,18 @@ def reset_device(reset_default_device): # Utility shrink and expand operations used as reference implementations. def sgmv_shrink_for_nslices( - nslices: int, inputs_tensor: torch.Tensor, - lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor, - b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, - prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int, - num_tokens: int, scaling: float): + nslices: int, + inputs_tensor: torch.Tensor, + lora_weights_lst: list[torch.Tensor], + out_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + prompt_lora_mapping: torch.Tensor, + batches: int, + max_seq_length: int, + num_tokens: int, + scaling: float, +): """ Wrapper around torch_ops.sgmv_shrink that handles any nslices. """ @@ -44,15 +51,20 @@ def sgmv_shrink_for_nslices( ) -def sgmv_expand_for_nslices(nslices: int, hidden_size: int, - inputs_tensor: torch.Tensor, - lora_weights_lst: list[torch.Tensor], - out_tensor: torch.Tensor, - b_seq_start_loc: torch.Tensor, - seq_len_tensor: torch.Tensor, - prompt_lora_mapping: torch.Tensor, batches: int, - max_seq_length: int, num_tokens: int, - add_inputs: bool) -> None: +def sgmv_expand_for_nslices( + nslices: int, + hidden_size: int, + inputs_tensor: torch.Tensor, + lora_weights_lst: list[torch.Tensor], + out_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + prompt_lora_mapping: torch.Tensor, + batches: int, + max_seq_length: int, + num_tokens: int, + add_inputs: bool, +) -> None: """ Wrapper around torch_ops.sgmv_expand that handles any nslices. """ @@ -94,10 +106,17 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int, _dict_lock = Lock() -def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int, - hidden_size: int, nslices: int, - dtype: torch.dtype, device: str, seq_length: int, - scaling: float): +def check_lora_shrink_kernel( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + nslices: int, + dtype: torch.dtype, + device: str, + seq_length: int, + scaling: float, +): """ Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink kernels. 
@@ -116,14 +135,19 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int, max_seq_length, token_nums = data.meta() # Setup metadata information for SGMV and reference kernels - sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor, - data.prompt_lora_mapping, batches, max_seq_length, - token_nums) + sgmv_meta_args = ( + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + ) # Setup metadata information for the LoRA kernel. - lora_meta = LoRAKernelMeta.make(max_loras=num_loras, - max_num_tokens=token_nums, - device='cuda') + lora_meta = LoRAKernelMeta.make( + max_loras=num_loras, max_num_tokens=token_nums, device="cuda" + ) lora_meta.prepare_tensors(data.token_lora_mapping) ref_out_tensor = data.ref_out_tensor @@ -154,10 +178,17 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int, assert_close(out_tensor, ref_out_tensor) -def check_lora_expand_kernel(batches: int, num_loras: int, rank: int, - hidden_size: int, nslices: int, - dtype: torch.dtype, device: str, seq_length: int, - add_inputs: bool): +def check_lora_expand_kernel( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + nslices: int, + dtype: torch.dtype, + device: str, + seq_length: int, + add_inputs: bool, +): """ Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand kernels. @@ -177,14 +208,19 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int, max_seq_length, token_nums = data.meta() # Setup metadata information for SGMV and reference kernels - sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor, - data.prompt_lora_mapping, batches, max_seq_length, - token_nums) + sgmv_meta_args = ( + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + ) # Setup metadata information for the LoRA kernel. 
- lora_meta = LoRAKernelMeta.make(max_loras=num_loras, - max_num_tokens=token_nums, - device='cuda') + lora_meta = LoRAKernelMeta.make( + max_loras=num_loras, max_num_tokens=token_nums, device="cuda" + ) lora_meta.prepare_tensors(data.token_lora_mapping) # Setup output tensors @@ -194,21 +230,25 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int, with _dict_lock: # lora_expand kernel _LORA_B_PTR_DICT.clear() - triton_ops.lora_expand(data.inputs_tensor, - data.lora_weights, - out_tensor, - *lora_meta.meta_args(token_nums=token_nums), - offset_start=0, - add_inputs=add_inputs) + triton_ops.lora_expand( + data.inputs_tensor, + data.lora_weights, + out_tensor, + *lora_meta.meta_args(token_nums=token_nums), + offset_start=0, + add_inputs=add_inputs, + ) # Reference - sgmv_expand_for_nslices(nslices, - hidden_size, - data.inputs_tensor, - data.lora_weights, - ref_out_tensor, - *sgmv_meta_args, - add_inputs=add_inputs) + sgmv_expand_for_nslices( + nslices, + hidden_size, + data.inputs_tensor, + data.lora_weights, + ref_out_tensor, + *sgmv_meta_args, + add_inputs=add_inputs, + ) assert_close(out_tensor, ref_out_tensor) @@ -299,7 +339,7 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int, 128000, 128256, ] -#The size of TP +# The size of TP divisibility = [1, 2, 8, 16, 64] all_hidden_size = [] @@ -331,10 +371,10 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int, SEED = [0] -@pytest.mark.parametrize("batches", test_params['batches']) -@pytest.mark.parametrize("num_loras", test_params['num_loras']) -@pytest.mark.parametrize("rank", test_params['max_ranks']) -@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes']) +@pytest.mark.parametrize("batches", test_params["batches"]) +@pytest.mark.parametrize("num_loras", test_params["num_loras"]) +@pytest.mark.parametrize("rank", test_params["max_ranks"]) +@pytest.mark.parametrize("hidden_size", test_params["hidden_sizes"]) @pytest.mark.parametrize("nslices", [1, 2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("device", DEVICES) @@ -358,31 +398,35 @@ def test_kernels( current_platform.seed_everything(seed) if op_type == "shrink": - check_lora_shrink_kernel(batches=batches, - num_loras=num_loras, - rank=rank, - hidden_size=hidden_size, - nslices=nslices, - dtype=dtype, - device=device, - seq_length=128, - scaling=0.5) + check_lora_shrink_kernel( + batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + scaling=0.5, + ) else: - check_lora_expand_kernel(batches=batches, - num_loras=num_loras, - rank=rank, - hidden_size=hidden_size, - nslices=nslices, - dtype=dtype, - device=device, - seq_length=128, - add_inputs=True) - - -@pytest.mark.parametrize("batches", hs_test_params['batches']) -@pytest.mark.parametrize("num_loras", hs_test_params['num_loras']) -@pytest.mark.parametrize("rank", hs_test_params['max_ranks']) -@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes']) + check_lora_expand_kernel( + batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + add_inputs=True, + ) + + +@pytest.mark.parametrize("batches", hs_test_params["batches"]) +@pytest.mark.parametrize("num_loras", hs_test_params["num_loras"]) +@pytest.mark.parametrize("rank", hs_test_params["max_ranks"]) +@pytest.mark.parametrize("hidden_size", hs_test_params["hidden_sizes"]) 
@pytest.mark.parametrize("nslices", [1, 2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("device", DEVICES) @@ -406,22 +450,26 @@ def test_kernels_hidden_size( current_platform.seed_everything(seed) if op_type == "shrink": - check_lora_shrink_kernel(batches=batches, - num_loras=num_loras, - rank=rank, - hidden_size=hidden_size, - nslices=nslices, - dtype=dtype, - device=device, - seq_length=128, - scaling=0.5) + check_lora_shrink_kernel( + batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + scaling=0.5, + ) else: - check_lora_expand_kernel(batches=batches, - num_loras=num_loras, - rank=rank, - hidden_size=hidden_size, - nslices=nslices, - dtype=dtype, - device=device, - seq_length=128, - add_inputs=True) + check_lora_expand_kernel( + batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + add_inputs=True, + ) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index caa31fdb0e73..6b1180ea68d0 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -20,28 +20,27 @@ class ModelWithQuantization: MODELS: list[ModelWithQuantization] -#AWQ quantization is currently not supported in ROCm. +# AWQ quantization is currently not supported in ROCm. if current_platform.is_rocm(): MODELS = [ ModelWithQuantization( - model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - quantization="gptq"), + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq" + ), ] else: MODELS = [ ModelWithQuantization( - model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - quantization="awq"), + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization="awq" + ), ModelWithQuantization( - model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - quantization="gptq"), + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq" + ), ] -def do_sample(llm: vllm.LLM, - lora_path: str, - lora_id: int, - max_tokens: int = 256) -> list[str]: +def do_sample( + llm: vllm.LLM, lora_path: str, lora_id: int, max_tokens: int = 256 +) -> list[str]: raw_prompts = [ "Give me an orange-ish brown color", "Give me a neon pink color", @@ -52,14 +51,14 @@ def format_prompt_tuples(prompt): prompts = [format_prompt_tuples(p) for p in raw_prompts] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=max_tokens, - stop=["<|im_end|>"]) + sampling_params = vllm.SamplingParams( + temperature=0, max_tokens=max_tokens, stop=["<|im_end|>"] + ) outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, + ) # Print the outputs. 
generated_texts: list[str] = [] for output in outputs: @@ -72,22 +71,22 @@ def format_prompt_tuples(prompt): @pytest.mark.parametrize("model", MODELS) def test_quant_model_lora(tinyllama_lora_files, model): - llm = vllm.LLM( model=model.model_path, enable_lora=True, max_num_seqs=16, max_loras=4, max_model_len=400, - gpu_memory_utilization=0.2, #avoid OOM + gpu_memory_utilization=0.2, # avoid OOM quantization=model.quantization, trust_remote_code=True, - enable_chunked_prefill=True) + enable_chunked_prefill=True, + ) if model.quantization is None: expected_no_lora_output = [ "Here are some examples of orange-brown colors", - "I'm sorry, I don't have" + "I'm sorry, I don't have", ] expected_lora_output = [ "#ff8050", @@ -115,43 +114,31 @@ def test_quant_model_lora(tinyllama_lora_files, model): def expect_match(output, expected_output): # HACK: GPTQ lora outputs are just incredibly unstable. # Assert that the outputs changed. - if (model.quantization == "gptq" - and expected_output is expected_lora_output): + if model.quantization == "gptq" and expected_output is expected_lora_output: assert output != expected_no_lora_output for i, o in enumerate(output): - assert o.startswith( - '#'), f"Expected example {i} to start with # but got {o}" + assert o.startswith("#"), ( + f"Expected example {i} to start with # but got {o}" + ) return assert output == expected_output max_tokens = 10 print("lora adapter created") - output = do_sample(llm, - tinyllama_lora_files, - lora_id=0, - max_tokens=max_tokens) + output = do_sample(llm, tinyllama_lora_files, lora_id=0, max_tokens=max_tokens) expect_match(output, expected_no_lora_output) print("lora 1") - output = do_sample(llm, - tinyllama_lora_files, - lora_id=1, - max_tokens=max_tokens) + output = do_sample(llm, tinyllama_lora_files, lora_id=1, max_tokens=max_tokens) expect_match(output, expected_lora_output) print("no lora") - output = do_sample(llm, - tinyllama_lora_files, - lora_id=0, - max_tokens=max_tokens) + output = do_sample(llm, tinyllama_lora_files, lora_id=0, max_tokens=max_tokens) expect_match(output, expected_no_lora_output) print("lora 2") - output = do_sample(llm, - tinyllama_lora_files, - lora_id=2, - max_tokens=max_tokens) + output = do_sample(llm, tinyllama_lora_files, lora_id=2, max_tokens=max_tokens) expect_match(output, expected_lora_output) print("removing lora") @@ -161,8 +148,7 @@ def expect_match(output, expected_output): @pytest.mark.parametrize("model", MODELS) -def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, - model): +def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model): if num_gpus_available < 2: pytest.skip(f"Not enough GPUs for tensor parallelism {2}") if model.quantization == "gptq": @@ -172,10 +158,11 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, enable_lora=True, max_num_seqs=16, max_loras=4, - gpu_memory_utilization=0.2, #avoid OOM + gpu_memory_utilization=0.2, # avoid OOM quantization=model.quantization, trust_remote_code=True, - enable_chunked_prefill=True) + enable_chunked_prefill=True, + ) output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) del llm_tp1 @@ -187,9 +174,10 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, max_num_seqs=16, max_loras=4, tensor_parallel_size=2, - gpu_memory_utilization=0.2, #avoid OOM + gpu_memory_utilization=0.2, # avoid OOM quantization=model.quantization, - enable_chunked_prefill=True) + enable_chunked_prefill=True, + ) output_tp2 = do_sample(llm_tp2, 
tinyllama_lora_files, lora_id=1) del llm_tp2 diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 604bb307b889..9d3e2b265e3a 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -39,7 +39,8 @@ class Qwen2VLTester: "<|im_start|>system\nYou are a helpful assistant.<|im_end|>" "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" "What is in the image?<|im_end|>\n" - "<|im_start|>assistant\n") + "<|im_start|>assistant\n" + ) def __init__(self, config: TestConfig): self.config = config @@ -58,68 +59,68 @@ def _initialize_llm(self) -> vllm.LLM: max_model_len=self.config.max_model_len, ) - def run_test(self, - images: list[ImageAsset], - expected_outputs: list[str], - lora_id: Optional[int] = None, - temperature: float = 0, - max_tokens: int = 5): - + def run_test( + self, + images: list[ImageAsset], + expected_outputs: list[str], + lora_id: Optional[int] = None, + temperature: float = 0, + max_tokens: int = 5, + ): sampling_params = vllm.SamplingParams( temperature=temperature, max_tokens=max_tokens, ) - inputs = [{ - "prompt": self.PROMPT_TEMPLATE, - "multi_modal_data": { - "image": asset.pil_image - }, - } for asset in images] - - lora_request = LoRARequest(str(lora_id), lora_id, - self.config.lora_path) - outputs = self.llm.generate(inputs, - sampling_params, - lora_request=lora_request) - generated_texts = [ - output.outputs[0].text.strip() for output in outputs + inputs = [ + { + "prompt": self.PROMPT_TEMPLATE, + "multi_modal_data": {"image": asset.pil_image}, + } + for asset in images ] + lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path) + outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request) + generated_texts = [output.outputs[0].text.strip() for output in outputs] + # Validate outputs for generated, expected in zip(generated_texts, expected_outputs): - assert expected.startswith( - generated), f"Generated text {generated} doesn't " + assert expected.startswith(generated), ( + f"Generated text {generated} doesn't " + ) f"match expected pattern {expected}" - def run_beam_search_test(self, - images: list[ImageAsset], - expected_outputs: list[list[str]], - lora_id: Optional[int] = None, - temperature: float = 0, - beam_width: int = 2, - max_tokens: int = 5): - - beam_search_params = BeamSearchParams(beam_width=beam_width, - max_tokens=max_tokens, - temperature=temperature) - - inputs = [{ - "prompt": self.PROMPT_TEMPLATE, - "multi_modal_data": { - "image": asset.pil_image - }, - } for asset in images] - - lora_request = LoRARequest(str(lora_id), lora_id, - self.config.lora_path) - outputs = self.llm.beam_search(inputs, - beam_search_params, - lora_request=lora_request) + def run_beam_search_test( + self, + images: list[ImageAsset], + expected_outputs: list[list[str]], + lora_id: Optional[int] = None, + temperature: float = 0, + beam_width: int = 2, + max_tokens: int = 5, + ): + beam_search_params = BeamSearchParams( + beam_width=beam_width, max_tokens=max_tokens, temperature=temperature + ) + + inputs = [ + { + "prompt": self.PROMPT_TEMPLATE, + "multi_modal_data": {"image": asset.pil_image}, + } + for asset in images + ] + + lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path) + outputs = self.llm.beam_search( + inputs, beam_search_params, lora_request=lora_request + ) for output_obj, expected_outs in zip(outputs, expected_outputs): output_texts = [seq.text for seq in output_obj.sequences] - assert output_texts == expected_outs, \ - f"Generated texts {output_texts} 
do not match expected {expected_outs}" # noqa: E501 + assert output_texts == expected_outs, ( + f"Generated texts {output_texts} do not match expected {expected_outs}" + ) # noqa: E501 TEST_IMAGES = [ @@ -146,27 +147,25 @@ def run_beam_search_test(self, @pytest.mark.xfail( current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm") + reason="Qwen2-VL dependency xformers incompatible with ROCm", +) def test_qwen2vl_lora(qwen2vl_lora_files): """Test Qwen 2.0 VL model with LoRA""" - config = TestConfig(model_path=QWEN2VL_MODEL_PATH, - lora_path=qwen2vl_lora_files) + config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files) tester = Qwen2VLTester(config) # Test with different LoRA IDs for lora_id in [1, 2]: - tester.run_test(TEST_IMAGES, - expected_outputs=EXPECTED_OUTPUTS, - lora_id=lora_id) + tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id) @pytest.mark.xfail( current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm") + reason="Qwen2-VL dependency xformers incompatible with ROCm", +) def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): """Test Qwen 2.0 VL model with LoRA through beam search.""" - config = TestConfig(model_path=QWEN2VL_MODEL_PATH, - lora_path=qwen2vl_lora_files) + config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files) tester = Qwen2VLTester(config) # Test with different LoRA IDs @@ -178,7 +177,8 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): tester.run_beam_search_test( [ImageAsset("cherry_blossom")], expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS, - lora_id=lora_id) + lora_id=lora_id, + ) @pytest.mark.xfail( @@ -191,12 +191,9 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): ) def test_qwen25vl_lora(qwen25vl_lora_files): """Test Qwen 2.5 VL model with LoRA""" - config = TestConfig(model_path=QWEN25VL_MODEL_PATH, - lora_path=qwen25vl_lora_files) + config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files) tester = Qwen2VLTester(config) # Test with different LoRA IDs for lora_id in [1, 2]: - tester.run_test(TEST_IMAGES, - expected_outputs=EXPECTED_OUTPUTS, - lora_id=lora_id) + tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id) diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py index 6c93e577611f..c70e58a375c7 100644 --- a/tests/lora/test_resolver.py +++ b/tests/lora/test_resolver.py @@ -12,13 +12,15 @@ class DummyLoRAResolver(LoRAResolver): """A dummy LoRA resolver for testing.""" - async def resolve_lora(self, base_model_name: str, - lora_name: str) -> Optional[LoRARequest]: + async def resolve_lora( + self, base_model_name: str, lora_name: str + ) -> Optional[LoRARequest]: if lora_name == "test_lora": return LoRARequest( lora_name=lora_name, lora_path=f"/dummy/path/{base_model_name}/{lora_name}", - lora_int_id=abs(hash(lora_name))) + lora_int_id=abs(hash(lora_name)), + ) return None @@ -70,6 +72,5 @@ async def test_dummy_resolver_resolve(): assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}" # Test failed resolution - result = await dummy_resolver.resolve_lora(base_model_name, - "nonexistent_lora") + result = await dummy_resolver.resolve_lora(base_model_name, "nonexistent_lora") assert result is None diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 6cfdaf50d33c..740da5e35529 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -22,22 +22,25 
@@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): ) lora_request = LoRARequest("1", 1, sql_lora_files) assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( - prompt="prompt", lora_request=lora_request) - assert reference_tokenizer.encode( - "prompt") == await tokenizer_group.encode_async( - prompt="prompt", lora_request=lora_request) - assert isinstance(tokenizer_group.get_lora_tokenizer(None), - PreTrainedTokenizerBase) + prompt="prompt", lora_request=lora_request + ) + assert reference_tokenizer.encode("prompt") == await tokenizer_group.encode_async( + prompt="prompt", lora_request=lora_request + ) + assert isinstance(tokenizer_group.get_lora_tokenizer(None), PreTrainedTokenizerBase) assert tokenizer_group.get_lora_tokenizer( - None) == await tokenizer_group.get_lora_tokenizer_async(None) + None + ) == await tokenizer_group.get_lora_tokenizer_async(None) - assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request), - PreTrainedTokenizerBase) + assert isinstance( + tokenizer_group.get_lora_tokenizer(lora_request), PreTrainedTokenizerBase + ) assert tokenizer_group.get_lora_tokenizer( - lora_request) != tokenizer_group.get_lora_tokenizer(None) + lora_request + ) != tokenizer_group.get_lora_tokenizer(None) assert tokenizer_group.get_lora_tokenizer( - lora_request) == await tokenizer_group.get_lora_tokenizer_async( - lora_request) + lora_request + ) == await tokenizer_group.get_lora_tokenizer_async(lora_request) def test_get_lora_tokenizer(sql_lora_files, tmp_path): @@ -66,7 +69,6 @@ def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras): max_input_length=None, ) if enable_lora: - assert tokenizer_group.lora_tokenizers.capacity == max( - max_num_seqs, max_loras) + assert tokenizer_group.lora_tokenizers.capacity == max(max_num_seqs, max_loras) else: assert tokenizer_group.lora_tokenizers.capacity == 0 diff --git a/tests/lora/test_transformers_model.py b/tests/lora/test_transformers_model.py index 5065a2fb7164..da924485c3e2 100644 --- a/tests/lora/test_transformers_model.py +++ b/tests/lora/test_transformers_model.py @@ -24,20 +24,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( - query= - "What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 + query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 ), PROMPT_TEMPLATE.format( - query= - "What are all distinct countries where singers above age 20 are from?" # noqa: E501 + query="What are all distinct countries where singers above age 20 are from?" # noqa: E501 ), ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, + ) # Print the outputs. 
generated_texts: list[str] = [] for output in outputs: @@ -49,13 +47,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def test_ilama_lora(ilama_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=16, - trust_remote_code=True, - enable_chunked_prefill=True) + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=16, + trust_remote_code=True, + enable_chunked_prefill=True, + ) output1 = do_sample(llm, ilama_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): @@ -65,20 +65,23 @@ def test_ilama_lora(ilama_lora_files): assert output2[i] == EXPECTED_LORA_OUTPUT[i] -@pytest.mark.skipif(current_platform.is_cuda_alike(), - reason="Skipping to avoid redundant model tests") +@pytest.mark.skipif( + current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests" +) @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_ilama_lora_tp4(ilama_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=16, - tensor_parallel_size=4, - trust_remote_code=True, - fully_sharded_loras=False, - enable_chunked_prefill=True) + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=16, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=False, + enable_chunked_prefill=True, + ) output1 = do_sample(llm, ilama_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): @@ -88,20 +91,23 @@ def test_ilama_lora_tp4(ilama_lora_files): assert output2[i] == EXPECTED_LORA_OUTPUT[i] -@pytest.mark.skipif(current_platform.is_cuda_alike(), - reason="Skipping to avoid redundant model tests") +@pytest.mark.skipif( + current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests" +) @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=16, - tensor_parallel_size=4, - trust_remote_code=True, - fully_sharded_loras=True, - enable_chunked_prefill=True) + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=16, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=True, + enable_chunked_prefill=True, + ) output1 = do_sample(llm, ilama_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index b343bef0a920..aed91d98ddbd 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -9,8 +9,11 @@ from huggingface_hub.utils import HfHubHTTPError from torch import nn -from vllm.lora.utils import (get_adapter_absolute_path, - parse_fine_tuned_lora_name, replace_submodule) +from vllm.lora.utils import ( + get_adapter_absolute_path, + parse_fine_tuned_lora_name, + replace_submodule, +) from vllm.model_executor.models.utils import WeightsMapper @@ -24,10 +27,12 @@ class LoRANameParserTestConfig(NamedTuple): def test_parse_fine_tuned_lora_name_valid(): fixture = [ - LoRANameParserTestConfig("base_model.model.lm_head.lora_A.weight", - "lm_head", True, False), - LoRANameParserTestConfig("base_model.model.lm_head.lora_B.weight", - "lm_head", False, False), + LoRANameParserTestConfig( + "base_model.model.lm_head.lora_A.weight", "lm_head", 
True, False + ), + LoRANameParserTestConfig( + "base_model.model.lm_head.lora_B.weight", "lm_head", False, False + ), LoRANameParserTestConfig( "base_model.model.model.embed_tokens.lora_embedding_A", "model.embed_tokens", @@ -71,7 +76,8 @@ def test_parse_fine_tuned_lora_name_valid(): True, False, weights_mapper=WeightsMapper( - orig_to_new_prefix={"model.": "language_model.model."}), + orig_to_new_prefix={"model.": "language_model.model."} + ), ), LoRANameParserTestConfig( "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight", @@ -79,7 +85,8 @@ def test_parse_fine_tuned_lora_name_valid(): False, False, weights_mapper=WeightsMapper( - orig_to_new_prefix={"model.": "language_model.model."}), + orig_to_new_prefix={"model.": "language_model.model."} + ), ), LoRANameParserTestConfig( "model.layers.9.mlp.down_proj.lora_A.weight", @@ -87,7 +94,8 @@ def test_parse_fine_tuned_lora_name_valid(): True, False, weights_mapper=WeightsMapper( - orig_to_new_prefix={"model.": "language_model.model."}), + orig_to_new_prefix={"model.": "language_model.model."} + ), ), LoRANameParserTestConfig( "model.layers.9.mlp.down_proj.lora_B.weight", @@ -95,12 +103,14 @@ def test_parse_fine_tuned_lora_name_valid(): False, False, weights_mapper=WeightsMapper( - orig_to_new_prefix={"model.": "language_model.model."}), + orig_to_new_prefix={"model.": "language_model.model."} + ), ), ] for name, module_name, is_lora_a, is_bias, weights_mapper in fixture: - assert (module_name, is_lora_a, - is_bias) == parse_fine_tuned_lora_name(name, weights_mapper) + assert (module_name, is_lora_a, is_bias) == parse_fine_tuned_lora_name( + name, weights_mapper + ) def test_parse_fine_tuned_lora_name_invalid(): @@ -115,22 +125,28 @@ def test_parse_fine_tuned_lora_name_invalid(): def test_replace_submodule(): model = nn.Sequential( - OrderedDict([ - ("dense1", nn.Linear(764, 100)), - ("act1", nn.ReLU()), - ("dense2", nn.Linear(100, 50)), - ( - "seq1", - nn.Sequential( - OrderedDict([ - ("dense1", nn.Linear(100, 10)), - ("dense2", nn.Linear(10, 50)), - ])), - ), - ("act2", nn.ReLU()), - ("output", nn.Linear(50, 10)), - ("outact", nn.Sigmoid()), - ])) + OrderedDict( + [ + ("dense1", nn.Linear(764, 100)), + ("act1", nn.ReLU()), + ("dense2", nn.Linear(100, 50)), + ( + "seq1", + nn.Sequential( + OrderedDict( + [ + ("dense1", nn.Linear(100, 10)), + ("dense2", nn.Linear(10, 50)), + ] + ) + ), + ), + ("act2", nn.ReLU()), + ("output", nn.Linear(50, 10)), + ("outact", nn.Sigmoid()), + ] + ) + ) sigmoid = nn.Sigmoid() @@ -143,52 +159,51 @@ def test_replace_submodule(): # Unit tests for get_adapter_absolute_path -@patch('os.path.isabs') +@patch("os.path.isabs") def test_get_adapter_absolute_path_absolute(mock_isabs): - path = '/absolute/path/to/lora' + path = "/absolute/path/to/lora" mock_isabs.return_value = True assert get_adapter_absolute_path(path) == path -@patch('os.path.expanduser') +@patch("os.path.expanduser") def test_get_adapter_absolute_path_expanduser(mock_expanduser): # Path with ~ that needs to be expanded - path = '~/relative/path/to/lora' - absolute_path = '/home/user/relative/path/to/lora' + path = "~/relative/path/to/lora" + absolute_path = "/home/user/relative/path/to/lora" mock_expanduser.return_value = absolute_path assert get_adapter_absolute_path(path) == absolute_path -@patch('os.path.exists') -@patch('os.path.abspath') +@patch("os.path.exists") +@patch("os.path.abspath") def test_get_adapter_absolute_path_local_existing(mock_abspath, mock_exist): # Relative path that exists locally - path = 'relative/path/to/lora' - 
absolute_path = '/absolute/path/to/lora' + path = "relative/path/to/lora" + absolute_path = "/absolute/path/to/lora" mock_exist.return_value = True mock_abspath.return_value = absolute_path assert get_adapter_absolute_path(path) == absolute_path -@patch('huggingface_hub.snapshot_download') -@patch('os.path.exists') -def test_get_adapter_absolute_path_huggingface(mock_exist, - mock_snapshot_download): +@patch("huggingface_hub.snapshot_download") +@patch("os.path.exists") +def test_get_adapter_absolute_path_huggingface(mock_exist, mock_snapshot_download): # Hugging Face model identifier - path = 'org/repo' - absolute_path = '/mock/snapshot/path' + path = "org/repo" + absolute_path = "/mock/snapshot/path" mock_exist.return_value = False mock_snapshot_download.return_value = absolute_path assert get_adapter_absolute_path(path) == absolute_path -@patch('huggingface_hub.snapshot_download') -@patch('os.path.exists') -def test_get_adapter_absolute_path_huggingface_error(mock_exist, - mock_snapshot_download): +@patch("huggingface_hub.snapshot_download") +@patch("os.path.exists") +def test_get_adapter_absolute_path_huggingface_error( + mock_exist, mock_snapshot_download +): # Hugging Face model identifier with download error - path = 'org/repo' + path = "org/repo" mock_exist.return_value = False - mock_snapshot_download.side_effect = HfHubHTTPError( - "failed to query model info") + mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info") assert get_adapter_absolute_path(path) == path diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 9999c1be54ea..172e8a440d3f 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -8,9 +8,16 @@ from unittest.mock import patch import vllm.envs as envs -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig, - VllmConfig) +from vllm.config import ( + CacheConfig, + DeviceConfig, + LoadConfig, + LoRAConfig, + ModelConfig, + ParallelConfig, + SchedulerConfig, + VllmConfig, +) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.v1.worker.gpu_worker import Worker as V1Worker @@ -21,9 +28,9 @@ @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): - - def set_active_loras(worker: Union[Worker, V1Worker], - lora_requests: list[LoRARequest]): + def set_active_loras( + worker: Union[Worker, V1Worker], lora_requests: list[LoRARequest] + ): lora_mapping = LoRAMapping([], []) if isinstance(worker, Worker): # v0 case @@ -31,7 +38,8 @@ def set_active_loras(worker: Union[Worker, V1Worker], else: # v1 case worker.model_runner.lora_manager.set_active_adapters( - lora_requests, lora_mapping) + lora_requests, lora_mapping + ) worker_cls = V1Worker if envs.VLLM_USE_V1 else Worker @@ -63,9 +71,9 @@ def set_active_loras(worker: Union[Worker, V1Worker], swap_space=0, cache_dtype="auto", ), - lora_config=LoRAConfig(max_lora_rank=8, - max_cpu_loras=NUM_LORAS, - max_loras=NUM_LORAS), + lora_config=LoRAConfig( + max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS + ), ) worker = worker_cls( vllm_config=vllm_config, @@ -81,23 +89,22 @@ def set_active_loras(worker: Union[Worker, V1Worker], assert worker.list_loras() == set() lora_requests = [ - LoRARequest(str(i + 1), i + 1, sql_lora_files) - for i in range(NUM_LORAS) + LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(NUM_LORAS) ] set_active_loras(worker, lora_requests) assert worker.list_loras() == { - 
lora_request.lora_int_id - for lora_request in lora_requests + lora_request.lora_int_id for lora_request in lora_requests } for i in range(NUM_LORAS): random.seed(i) - iter_lora_requests = random.choices(lora_requests, - k=random.randint(1, NUM_LORAS)) + iter_lora_requests = random.choices( + lora_requests, k=random.randint(1, NUM_LORAS) + ) random.shuffle(iter_lora_requests) - iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)] + iter_lora_requests = iter_lora_requests[: -random.randint(0, NUM_LORAS)] set_active_loras(worker, lora_requests) assert worker.list_loras().issuperset( - {lora_request.lora_int_id - for lora_request in iter_lora_requests}) + {lora_request.lora_int_id for lora_request in iter_lora_requests} + ) diff --git a/tests/lora/utils.py b/tests/lora/utils.py index cc1b0d81955b..841d663a5e3c 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -10,7 +10,6 @@ class DummyLoRAManager: - def __init__(self, device: torch.device = "cuda:0"): super().__init__() self._loras: dict[str, LoRALayerWeights] = {} @@ -33,12 +32,12 @@ def init_random_lora( module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([weight.shape[1], rank], - dtype=weight.dtype, - device=self._device), - lora_b=torch.rand([rank, weight.shape[0]], - dtype=weight.dtype, - device=self._device), + lora_a=torch.rand( + [weight.shape[1], rank], dtype=weight.dtype, device=self._device + ), + lora_b=torch.rand( + [rank, weight.shape[0]], dtype=weight.dtype, device=self._device + ), ) if generate_embeddings_tensor: lora.embeddings_tensor = torch.rand( @@ -143,27 +142,26 @@ def generate_data( op_type, device, ) -> PunicaTensors: - seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batches, )).to(device) + seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, ).to(device) total_tokens = seq_len_tensor.sum() if op_type == "shrink": - inputs_tensor = torch.rand((total_tokens, hidden_size), - dtype=dtype).to(device) + inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device) lora_weights = torch.rand( (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, ).to(device) # shrink op need atomic_add, so output is initinized by 0 - ref_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=dtype, - device=inputs_tensor.device) + ref_out_tensor = torch.zeros( + (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device + ) # NOTE shrink kernel using torch.float32 as output type - our_out_tensor = torch.zeros((total_tokens, max_rank), - dtype=torch.float32).to(device) + our_out_tensor = torch.zeros((total_tokens, max_rank), dtype=torch.float32).to( + device + ) else: inputs_tensor = torch.rand( (total_tokens, max_rank), @@ -181,15 +179,16 @@ def generate_data( ).to(device) # Ensure the same input. 
our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batches, )).to(device) + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batches,) + ).to(device) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batches): lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]].copy_(lora_index) + indices[current_offset : current_offset + seq_len_tensor[b_id]].copy_( + lora_index + ) current_offset += seq_len_tensor[b_id].item() return PunicaTensors( @@ -214,8 +213,7 @@ def generate_data_for_expand_nslices( nslices, device, ) -> PunicaTensors: - seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batches, )).to(device) + seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -231,22 +229,25 @@ def generate_data_for_expand_nslices( torch.rand( (lora_nums, hidden_size, max_rank), # col-major dtype=dtype, - ).to(device)) + ).to(device) + ) # expand op needs to complete y+=a@lora_b, so output is # initinized randomly - ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), - dtype=dtype).to(device) + ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), dtype=dtype).to( + device + ) # Ensure the same input. our_out_tensor = ref_out_tensor.clone() - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batches, )) + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batches,) + ) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batches): lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = (lora_index.item()) + indices[current_offset : current_offset + seq_len_tensor[b_id]] = ( + lora_index.item() + ) current_offset += seq_len_tensor[b_id].item() lora_indices_tensor = lora_indices_tensor.to(device) @@ -273,8 +274,7 @@ def generate_data_for_nslices( op_type, device, ) -> PunicaTensors: - seq_len_tensor = torch.randint(seq_length, seq_length + 1, - (batches, )).to(device) + seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device) b_seq_start_loc = torch.cumsum( torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), dim=0, @@ -283,9 +283,7 @@ def generate_data_for_nslices( lora_weights_lst = [] if op_type == "shrink": - - inputs_tensor = torch.rand((total_tokens, hidden_size), - dtype=dtype).to(device) + inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device) for _ in range(nslices): if op_type == "shrink": @@ -293,7 +291,8 @@ def generate_data_for_nslices( torch.rand( (lora_nums, max_rank, hidden_size), # col-major dtype=dtype, - ).to(device)) + ).to(device) + ) # NOTE shrink kernel using torch.float32 as output type # shrink op need atomic_add, so output is initinized by 0 our_out_tensor = torch.zeros( @@ -310,23 +309,26 @@ def generate_data_for_nslices( torch.rand( (lora_nums, hidden_size, max_rank), # col-major dtype=dtype, - ).to(device)) + ).to(device) + ) # expand op needs to complete y+=a@lora_b, so output is # initinized randomly - our_out_tensor = torch.rand((total_tokens, hidden_size * nslices), - dtype=dtype).to(device) + our_out_tensor = torch.rand( + (total_tokens, hidden_size * nslices), 
dtype=dtype + ).to(device) # Ensure the same input. ref_out_tensor = our_out_tensor.clone() - lora_indices_tensor = torch.randint(0, - lora_nums - 1 if lora_nums > 1 else 1, - (batches, )) + lora_indices_tensor = torch.randint( + 0, lora_nums - 1 if lora_nums > 1 else 1, (batches,) + ) indices = torch.zeros((total_tokens), dtype=torch.long).to(device) current_offset = 0 for b_id in range(batches): lora_index = lora_indices_tensor[b_id] - indices[current_offset:current_offset + - seq_len_tensor[b_id]] = (lora_index.item()) + indices[current_offset : current_offset + seq_len_tensor[b_id]] = ( + lora_index.item() + ) current_offset += seq_len_tensor[b_id].item() lora_indices_tensor = lora_indices_tensor.to(device) diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 7bb5d8980d61..e75fac0481be 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -22,7 +22,7 @@ def use_v0_only(monkeypatch): """ This module tests V0 internals, so set VLLM_USE_V1=0. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") MODELS = [ @@ -40,29 +40,28 @@ def test_metric_counter_prompt_tokens( dtype: str, max_tokens: int, ) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: + with vllm_runner( + model, dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4 + ) as vllm_model: tokenizer = vllm_model.model.get_tokenizer() - prompt_token_counts = [ - len(tokenizer.encode(p)) for p in example_prompts - ] + prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] # This test needs at least 2 prompts in a batch of different lengths to # verify their token count is correct despite padding. assert len(example_prompts) > 1, "at least 2 prompts are required" assert prompt_token_counts[0] != prompt_token_counts[1], ( - "prompts of different lengths are required") + "prompts of different lengths are required" + ) vllm_prompt_token_count = sum(prompt_token_counts) _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.model.llm_engine.stat_loggers["prometheus"] metric_count = stat_logger.metrics.counter_prompt_tokens.labels( - **stat_logger.labels)._value.get() + **stat_logger.labels + )._value.get() assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\n" - f"metric: {metric_count!r}") + f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" + ) @pytest.mark.parametrize("model", MODELS) @@ -75,15 +74,15 @@ def test_metric_counter_generation_tokens( dtype: str, max_tokens: int, ) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: + with vllm_runner( + model, dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4 + ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.model.llm_engine.stat_loggers["prometheus"] metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() + **stat_logger.labels + )._value.get() vllm_generation_count = 0 for i in range(len(example_prompts)): vllm_output_ids, vllm_output_str = vllm_outputs[i] @@ -93,8 +92,8 @@ def test_metric_counter_generation_tokens( 
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") + f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" + ) @pytest.mark.parametrize("model", MODELS) @@ -109,17 +108,18 @@ def test_metric_counter_generation_tokens_multi_step( ) -> None: num_scheduler_steps = 8 with vllm_runner( - model, - disable_log_stats=False, - gpu_memory_utilization=0.4, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, + model, + disable_log_stats=False, + gpu_memory_utilization=0.4, + num_scheduler_steps=num_scheduler_steps, + disable_async_output_proc=disable_async_output_proc, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.model.llm_engine.stat_loggers["prometheus"] metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() + **stat_logger.labels + )._value.get() vllm_generation_count = 0 for i in range(len(example_prompts)): vllm_output_ids, vllm_output_str = vllm_outputs[i] @@ -130,25 +130,29 @@ def test_metric_counter_generation_tokens_multi_step( # The multi-step scheduling will continue to execute forward even when # encountering EOS, leading to slightly imprecise metrics. - assert abs(vllm_generation_count - metric_count) <\ - len(example_prompts) * num_scheduler_steps, \ - (f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") + assert ( + abs(vllm_generation_count - metric_count) + < len(example_prompts) * num_scheduler_steps + ), f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( "served_model_name", - [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) -def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, - served_model_name: list[str]) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.3, - served_model_name=served_model_name) as vllm_model: - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]], +) +def test_metric_set_tag_model_name( + vllm_runner, model: str, dtype: str, served_model_name: list[str] +) -> None: + with vllm_runner( + model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.3, + served_model_name=served_model_name, + ) as vllm_model: + stat_logger = vllm_model.model.llm_engine.stat_loggers["prometheus"] metrics_tag_content = stat_logger.labels["model_name"] if envs.VLLM_CI_USE_S3: @@ -156,12 +160,14 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, if served_model_name is None or served_model_name == []: assert metrics_tag_content == model, ( f"Metrics tag model_name is wrong! expect: {model!r}\n" - f"actual: {metrics_tag_content!r}") + f"actual: {metrics_tag_content!r}" + ) else: assert metrics_tag_content == served_model_name[0], ( f"Metrics tag model_name is wrong! 
expect: " f"{served_model_name[0]!r}\n" - f"actual: {metrics_tag_content!r}") + f"actual: {metrics_tag_content!r}" + ) @pytest.mark.parametrize("model", MODELS) @@ -197,8 +203,7 @@ async def test_async_engine_log_metrics_regression( async for _ in results: pass - assert_metrics(model, async_engine.engine, disable_log_stats, - len(example_prompts)) + assert_metrics(model, async_engine.engine, disable_log_stats, len(example_prompts)) @pytest.mark.parametrize("model", MODELS) @@ -245,18 +250,17 @@ def test_metric_spec_decode( k = 5 with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4, - speculative_config={ - "model": model, - "num_speculative_tokens": k, - }, + model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4, + speculative_config={ + "model": model, + "num_speculative_tokens": k, + }, ) as vllm_model: - # Force log interval to be 0 to catch all metrics. - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.model.llm_engine.stat_loggers["prometheus"] stat_logger.local_interval = 0 # Note that the purpose of this test is to verify spec decode @@ -267,8 +271,7 @@ def test_metric_spec_decode( "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1, "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k, "counter_spec_decode_num_draft_tokens": lambda v: v == k, - "counter_spec_decode_num_emitted_tokens": - lambda v: 0 <= v <= k + 1, + "counter_spec_decode_num_emitted_tokens": lambda v: 0 <= v <= k + 1, } # Use one request to better inspect the metrics. @@ -276,12 +279,15 @@ def test_metric_spec_decode( _ = vllm_model.generate_greedy(prompts, max_tokens) for metric_name, is_expected in metric_name_to_expected_fn.items(): - metric_val = getattr( - stat_logger.metrics, - metric_name).labels(**stat_logger.labels)._value.get() + metric_val = ( + getattr(stat_logger.metrics, metric_name) + .labels(**stat_logger.labels) + ._value.get() + ) assert is_expected(metric_val), ( f"the value of metric {metric_name} ({metric_val}) " - "does not meet expectation") + "does not meet expectation" + ) @pytest.mark.parametrize("model", MODELS) @@ -313,7 +319,6 @@ def test_metric_spec_decode_interval( engine = LLMEngine.from_engine_args(engine_args) try: - engine.add_request( "request-id-0", example_prompts[0], @@ -321,7 +326,7 @@ def test_metric_spec_decode_interval( ) # set log internal - stat_logger = engine.stat_loggers['prometheus'] + stat_logger = engine.stat_loggers["prometheus"] stat_logger.local_interval = log_interval # prefill @@ -358,35 +363,37 @@ def test_metric_spec_decode_interval( "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1, "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k, "counter_spec_decode_num_draft_tokens": lambda v: v == k, - "counter_spec_decode_num_emitted_tokens": - lambda v: 0 <= v <= k + 1, + "counter_spec_decode_num_emitted_tokens": lambda v: 0 <= v <= k + 1, } for metric_name, is_expected in metric_name_to_expected_fn.items(): - metric_val = getattr( - stat_logger.metrics, - metric_name).labels(**stat_logger.labels)._value.get() + metric_val = ( + getattr(stat_logger.metrics, metric_name) + .labels(**stat_logger.labels) + ._value.get() + ) assert is_expected(metric_val), ( f"the value of metric {metric_name} ({metric_val}) " - "does not meet expectation") + "does not meet expectation" + ) finally: del engine cleanup_dist_env_and_memory() -def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, - num_requests: int) -> 
None: +def assert_metrics( + model: str, engine: LLMEngine, disable_log_stats: bool, num_requests: int +) -> None: if disable_log_stats: with pytest.raises(AttributeError): _ = engine.stat_loggers else: - assert (engine.stat_loggers - is not None), "engine.stat_loggers should be set" + assert engine.stat_loggers is not None, "engine.stat_loggers should be set" # Ensure the count bucket of request-level histogram metrics matches # the number of requests as a simple sanity check to ensure metrics are # generated - labels = {'model_name': model} + labels = {"model_name": model} request_histogram_metrics = [ "vllm:e2e_request_latency_seconds", "vllm:request_prompt_tokens", @@ -395,10 +402,8 @@ def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, "vllm:request_params_max_tokens", ] for metric_name in request_histogram_metrics: - metric_value = REGISTRY.get_sample_value(f"{metric_name}_count", - labels) - assert ( - metric_value == num_requests), "Metrics should be collected" + metric_value = REGISTRY.get_sample_value(f"{metric_name}_count", labels) + assert metric_value == num_requests, "Metrics should be collected" @pytest.mark.parametrize("model", MODELS) @@ -418,9 +423,7 @@ def test_engine_log_metrics_ray( # We have to run in a Ray task for Ray metrics to be emitted correctly @ray.remote(num_gpus=1) def _inner(): - class _RayPrometheusStatLogger(RayPrometheusStatLogger): - def __init__(self, *args, **kwargs): self._i = 0 super().__init__(*args, **kwargs) @@ -438,7 +441,8 @@ def log(self, *args, **kwargs): logger = _RayPrometheusStatLogger( local_interval=0.5, labels=dict(model_name=engine.model_config.served_model_name), - vllm_config=engine.vllm_config) + vllm_config=engine.vllm_config, + ) engine.add_logger("ray", logger) for i, prompt in enumerate(example_prompts): engine.add_request( diff --git a/tests/mistral_tool_use/conftest.py b/tests/mistral_tool_use/conftest.py index e89e60c5a02e..d476e709a8c5 100644 --- a/tests/mistral_tool_use/conftest.py +++ b/tests/mistral_tool_use/conftest.py @@ -17,8 +17,9 @@ def server_config(request): config = CONFIGS[request.param] if current_platform.is_rocm() and not config.get("supports_rocm", True): - pytest.skip("The {} model can't be tested on the ROCm platform".format( - config["model"])) + pytest.skip( + "The {} model can't be tested on the ROCm platform".format(config["model"]) + ) # download model and tokenizer using transformers snapshot_download(config["model"]) @@ -30,8 +31,9 @@ def server_config(request): def server(request, server_config: ServerConfig): model = server_config["model"] args_for_model = server_config["arguments"] - with RemoteOpenAIServer(model, ARGS + args_for_model, - max_wait_seconds=480) as server: + with RemoteOpenAIServer( + model, ARGS + args_for_model, max_wait_seconds=480 + ) as server: yield server diff --git a/tests/mistral_tool_use/test_mistral_tool_calls.py b/tests/mistral_tool_use/test_mistral_tool_calls.py index 9bf6863f3f2b..3c4a543abe41 100644 --- a/tests/mistral_tool_use/test_mistral_tool_calls.py +++ b/tests/mistral_tool_use/test_mistral_tool_calls.py @@ -19,12 +19,12 @@ async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI): model=model_name, tools=[WEATHER_TOOL], tool_choice=WEATHER_TOOL, - logprobs=False) + logprobs=False, + ) choice = chat_completion.choices[0] assert choice.finish_reason != "tool_calls" # "stop" or "length" assert choice.message.role == "assistant" - assert choice.message.tool_calls is None \ - or len(choice.message.tool_calls) == 1 + assert 
choice.message.tool_calls is None or len(choice.message.tool_calls) == 1 assert len(choice.message.tool_calls[0].id) == 9 # length of 9 for mistral diff --git a/tests/mistral_tool_use/utils.py b/tests/mistral_tool_use/utils.py index 7a026cd9bb61..13a234f8e26b 100644 --- a/tests/mistral_tool_use/utils.py +++ b/tests/mistral_tool_use/utils.py @@ -18,17 +18,16 @@ class ServerConfig(TypedDict, total=False): CONFIGS: dict[str, ServerConfig] = { "mistral": { - "model": - "mistralai/Mistral-7B-Instruct-v0.3", + "model": "mistralai/Mistral-7B-Instruct-v0.3", "arguments": [ - "--tokenizer-mode", "mistral", - "--ignore-patterns=\"consolidated.safetensors\"" + "--tokenizer-mode", + "mistral", + '--ignore-patterns="consolidated.safetensors"', ], - "system_prompt": - "You are a helpful assistant with access to tools. If a tool" + "system_prompt": "You are a helpful assistant with access to tools. If a tool" " that you have would be helpful to answer a user query, " "call the tool. Otherwise, answer the user's query directly " "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " - "to the user's question - just respond to it normally." + "to the user's question - just respond to it normally.", }, } diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py index c6d89d849e9f..7539c8990db6 100644 --- a/tests/model_executor/conftest.py +++ b/tests/model_executor/conftest.py @@ -6,8 +6,10 @@ @pytest.fixture def sample_regex(): - return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + return ( + r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" + ) @pytest.fixture @@ -15,38 +17,25 @@ def sample_json_schema(): return { "type": "object", "properties": { - "name": { - "type": "string" - }, - "age": { - "type": "integer" - }, + "name": {"type": "string"}, + "age": {"type": "integer"}, "skills": { "type": "array", - "items": { - "type": "string", - "maxLength": 10 - }, - "minItems": 3 + "items": {"type": "string", "maxLength": 10}, + "minItems": 3, }, "work_history": { "type": "array", "items": { "type": "object", "properties": { - "company": { - "type": "string" - }, - "duration": { - "type": "number" - }, - "position": { - "type": "string" - } + "company": {"type": "string"}, + "duration": {"type": "number"}, + "position": {"type": "string"}, }, - "required": ["company", "position"] - } - } + "required": ["company", "position"], + }, + }, }, - "required": ["name", "age", "skills", "work_history"] + "required": ["name", "age", "skills", "work_history"], } diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 140f00294765..0f2cb71fcd24 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -6,18 +6,31 @@ from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.activation import (GeluAndMul, - ReLUSquaredActivation, - SiluAndMul) -from vllm.model_executor.layers.fused_moe.fused_moe import (dispatch_topk_func, - vllm_topk_softmax) +from vllm.model_executor.layers.activation import ( + GeluAndMul, + ReLUSquaredActivation, + SiluAndMul, +) +from vllm.model_executor.layers.fused_moe.fused_moe import ( + dispatch_topk_func, + vllm_topk_softmax, +) from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - is_rocm_aiter_moe_enabled) + is_rocm_aiter_moe_enabled, +) from 
vllm.model_executor.layers.layernorm import ( - RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm, - rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm) + RMSNorm, + dispatch_cuda_rmsnorm_func, + fused_add_rms_norm, + rms_norm, + rocm_aiter_fused_add_rms_norm, + rocm_aiter_rms_norm, +) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul) + cutlass_scaled_mm, + dispatch_w8a8_blockscale_func, + w8a8_block_fp8_matmul, +) from vllm.platforms import current_platform @@ -64,13 +77,22 @@ class Relu3(ReLUSquaredActivation): ("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False), # All but RMSNorm ("all,-rms_norm", 4, True, [0, 1, 1, 1], True), - ]) -def test_enabled_ops(env: str, torch_level: int, use_inductor: bool, - ops_enabled: list[int], default_on: bool): + ], +) +def test_enabled_ops( + env: str, + torch_level: int, + use_inductor: bool, + ops_enabled: list[int], + default_on: bool, +): vllm_config = VllmConfig( - compilation_config=CompilationConfig(use_inductor=bool(use_inductor), - level=torch_level, - custom_ops=env.split(","))) + compilation_config=CompilationConfig( + use_inductor=bool(use_inductor), + level=torch_level, + custom_ops=env.split(","), + ) + ) with set_current_vllm_config(vllm_config): assert CustomOp.default_on() == default_on @@ -98,39 +120,49 @@ class SiluAndMul2(SiluAndMul): @pytest.mark.parametrize( - "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"]) + "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"] +) def test_enabled_ops_invalid(env: str): with pytest.raises(Exception): # noqa - vllm_config = VllmConfig(compilation_config=CompilationConfig( - custom_ops=env.split(","))) + vllm_config = VllmConfig( + compilation_config=CompilationConfig(custom_ops=env.split(",")) + ) with set_current_vllm_config(vllm_config): RMSNorm(1024).enabled() @pytest.mark.skipif( not current_platform.is_rocm() or not current_platform.is_fp8_fnuz(), - reason="AITER is a feature exclusive for ROCm and FP8_FNUZ") + reason="AITER is a feature exclusive for ROCm and FP8_FNUZ", +) @pytest.mark.parametrize("use_cutlass", [True, False]) @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) @pytest.mark.parametrize("use_rocm_aiter_gemm_w8a8_blockscale", ["0", "1"]) -def test_w8a8_blockscale_dispatch(use_cutlass: bool, use_rocm_aiter: str, - use_rocm_aiter_gemm_w8a8_blockscale: str, - monkeypatch): - +def test_w8a8_blockscale_dispatch( + use_cutlass: bool, + use_rocm_aiter: str, + use_rocm_aiter_gemm_w8a8_blockscale: str, + monkeypatch, +): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) - monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR", - use_rocm_aiter_gemm_w8a8_blockscale) + monkeypatch.setenv( + "VLLM_ROCM_USE_AITER_LINEAR", use_rocm_aiter_gemm_w8a8_blockscale + ) - use_aiter_and_is_supported = (bool(int(use_rocm_aiter)) and bool( - int(use_rocm_aiter_gemm_w8a8_blockscale))) + use_aiter_and_is_supported = bool(int(use_rocm_aiter)) and bool( + int(use_rocm_aiter_gemm_w8a8_blockscale) + ) block_scale_func = dispatch_w8a8_blockscale_func( - use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported) + use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported + ) if use_cutlass: assert block_scale_func == cutlass_scaled_mm - elif current_platform.is_rocm() and int(use_rocm_aiter) and int( - use_rocm_aiter_gemm_w8a8_blockscale): - assert block_scale_func == ( - torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale) + elif ( + 
current_platform.is_rocm() + and int(use_rocm_aiter) + and int(use_rocm_aiter_gemm_w8a8_blockscale) + ): + assert block_scale_func == (torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale) else: assert block_scale_func == w8a8_block_fp8_matmul @@ -142,7 +174,9 @@ def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): is_rocm_aiter_moe_enabled.cache_clear() if current_platform.is_rocm() and int(use_rocm_aiter): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - rocm_aiter_topk_softmax) + rocm_aiter_topk_softmax, + ) + assert topk_func == rocm_aiter_topk_softmax else: assert topk_func == vllm_topk_softmax @@ -151,22 +185,28 @@ def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): @pytest.mark.parametrize("add_residual", [True, False]) @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) @pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"]) -@pytest.mark.skipif(not current_platform.is_rocm(), - reason="AITER is a feature exclusive for ROCm") -def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str, - use_rocm_aiter_norm: str, monkeypatch): +@pytest.mark.skipif( + not current_platform.is_rocm(), reason="AITER is a feature exclusive for ROCm" +) +def test_rms_norm_dispatch( + add_residual: bool, use_rocm_aiter: str, use_rocm_aiter_norm: str, monkeypatch +): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm) rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual) if not add_residual: - if current_platform.is_rocm() and int(use_rocm_aiter) and int( - use_rocm_aiter_norm): + if ( + current_platform.is_rocm() + and int(use_rocm_aiter) + and int(use_rocm_aiter_norm) + ): assert rms_norm_func == rocm_aiter_rms_norm else: assert rms_norm_func == rms_norm - elif current_platform.is_rocm() and int(use_rocm_aiter) and int( - use_rocm_aiter_norm): + elif ( + current_platform.is_rocm() and int(use_rocm_aiter) and int(use_rocm_aiter_norm) + ): assert rms_norm_func == rocm_aiter_fused_add_rms_norm else: assert rms_norm_func == fused_add_rms_norm diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index f08c7f7efccb..d5b8de9b490c 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py @@ -11,15 +11,16 @@ from vllm.config import ModelConfig from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor, - get_local_guided_decoding_logits_processor) + get_local_guided_decoding_logits_processor, +) from vllm.model_executor.guided_decoding.outlines_logits_processors import ( - JSONLogitsProcessor, RegexLogitsProcessor) + JSONLogitsProcessor, + RegexLogitsProcessor, +) from vllm.sampling_params import GuidedDecodingParams -MODEL_NAME = 'HuggingFaceH4/zephyr-7b-beta' -GUIDED_DECODING_BACKENDS = [ - "outlines", "lm-format-enforcer", "xgrammar", "guidance" -] +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar", "guidance"] GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT = ["outlines", "xgrammar"] REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" @@ -35,16 +36,12 @@ def deepseek_r1_qwen_tokenizer(): return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) -def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex, - sample_json_schema): +def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex, sample_json_schema): """Basic unit test for RegexLogitsProcessor 
and JSONLogitsProcessor.""" - regex_LP = RegexLogitsProcessor(sample_regex, - zephyr_7B_tokenzer, - reasoner=None) - json_LP = JSONLogitsProcessor(sample_json_schema, - zephyr_7B_tokenzer, - whitespace_pattern=None, - reasoner=None) + regex_LP = RegexLogitsProcessor(sample_regex, zephyr_7B_tokenzer, reasoner=None) + json_LP = JSONLogitsProcessor( + sample_json_schema, zephyr_7B_tokenzer, whitespace_pattern=None, reasoner=None + ) tensor = torch.rand(32000) original_tensor = torch.clone(tensor) @@ -62,11 +59,9 @@ def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex, @pytest.mark.asyncio @pytest.mark.parametrize("backend", GUIDED_DECODING_BACKENDS) @pytest.mark.parametrize("is_local", [True, False]) -async def test_guided_logits_processor_black_box(backend: str, is_local: bool, - sample_regex, - sample_json_schema, - zephyr_7B_tokenzer): - +async def test_guided_logits_processor_black_box( + backend: str, is_local: bool, sample_regex, sample_json_schema, zephyr_7B_tokenzer +): config = ModelConfig( MODEL_NAME, task="generate", @@ -78,10 +73,15 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool, ) regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend) - regex_lp = get_local_guided_decoding_logits_processor( - regex_request, zephyr_7B_tokenzer, config) if is_local else \ - await get_guided_decoding_logits_processor( - regex_request, zephyr_7B_tokenzer, config) + regex_lp = ( + get_local_guided_decoding_logits_processor( + regex_request, zephyr_7B_tokenzer, config + ) + if is_local + else await get_guided_decoding_logits_processor( + regex_request, zephyr_7B_tokenzer, config + ) + ) assert regex_lp is not None tensor = torch.rand(32000) original_tensor = torch.clone(tensor) @@ -90,10 +90,10 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool, assert tensor.shape == original_tensor.shape assert not torch.allclose(tensor, original_tensor) - json_request = GuidedDecodingParams(json=sample_json_schema, - backend=backend) + json_request = GuidedDecodingParams(json=sample_json_schema, backend=backend) json_lp = await get_guided_decoding_logits_processor( - json_request, zephyr_7B_tokenzer, config) + json_request, zephyr_7B_tokenzer, config + ) assert json_lp is not None tensor = torch.rand(32000) original_tensor = torch.clone(tensor) @@ -103,14 +103,17 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool, @pytest.mark.asyncio -@pytest.mark.parametrize("backend", - GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT) +@pytest.mark.parametrize("backend", GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT) @pytest.mark.parametrize("is_local", [True, False]) @pytest.mark.parametrize("reasoning_backend", ["deepseek_r1"]) async def test_guided_logits_processor_with_reasoning( - backend: str, is_local: bool, reasoning_backend: str, sample_regex, - sample_json_schema, deepseek_r1_qwen_tokenizer): - + backend: str, + is_local: bool, + reasoning_backend: str, + sample_regex, + sample_json_schema, + deepseek_r1_qwen_tokenizer, +): config = ModelConfig( REASONING_MODEL_NAME, task="generate", @@ -120,16 +123,18 @@ async def test_guided_logits_processor_with_reasoning( seed=0, dtype="bfloat16", ) - token_ids = deepseek_r1_qwen_tokenizer.encode( - "<think>here is the thinking process") + token_ids = deepseek_r1_qwen_tokenizer.encode("<think>here is the thinking process") regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend) - regex_lp = get_local_guided_decoding_logits_processor(regex_request,
deepseek_r1_qwen_tokenizer, config, - reasoning_backend) if is_local else \ - await get_guided_decoding_logits_processor( - regex_request, deepseek_r1_qwen_tokenizer, config, - reasoning_backend) + regex_lp = ( + get_local_guided_decoding_logits_processor( + regex_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend + ) + if is_local + else await get_guided_decoding_logits_processor( + regex_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend + ) + ) assert regex_lp is not None tensor = torch.rand(151664) original_tensor = torch.clone(tensor) @@ -137,15 +142,17 @@ async def test_guided_logits_processor_with_reasoning( assert tensor.shape == original_tensor.shape assert torch.allclose(tensor, original_tensor) - token_ids = deepseek_r1_qwen_tokenizer.encode( - "<think>here is the thinking process") - json_request = GuidedDecodingParams(json=sample_json_schema, - backend=backend) - json_lp = get_local_guided_decoding_logits_processor( - json_request, deepseek_r1_qwen_tokenizer, config, - reasoning_backend) if is_local else \ - await get_guided_decoding_logits_processor( - json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend) + token_ids = deepseek_r1_qwen_tokenizer.encode("<think>here is the thinking process") + json_request = GuidedDecodingParams(json=sample_json_schema, backend=backend) + json_lp = ( + get_local_guided_decoding_logits_processor( + json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend + ) + if is_local + else await get_guided_decoding_logits_processor( + json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend + ) + ) assert json_lp is not None tensor = torch.rand(151664) original_tensor = torch.clone(tensor) @@ -155,14 +162,18 @@ async def test_guided_logits_processor_with_reasoning( # Thinking is over, so the tensor should change.
token_ids = deepseek_r1_qwen_tokenizer.encode( - "<think>here is the thinking process</think>") - json_request = GuidedDecodingParams(json=sample_json_schema, - backend=backend) - json_lp = get_local_guided_decoding_logits_processor( - json_request, deepseek_r1_qwen_tokenizer, config, - reasoning_backend) if is_local else \ - await get_guided_decoding_logits_processor( - json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend) + "<think>here is the thinking process</think>" + ) + json_request = GuidedDecodingParams(json=sample_json_schema, backend=backend) + json_lp = ( + get_local_guided_decoding_logits_processor( + json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend + ) + if is_local + else await get_guided_decoding_logits_processor( + json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend + ) + ) assert json_lp is not None tensor = torch.rand(151664) original_tensor = torch.clone(tensor) @@ -172,20 +183,16 @@ async def test_guided_logits_processor_with_reasoning( def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex): - with pytest.raises(ValueError, - match="You can only use one kind of guided"): + with pytest.raises(ValueError, match="You can only use one kind of guided"): GuidedDecodingParams(json=sample_json_schema, regex=sample_regex) - with pytest.raises(ValueError, - match="You can only use one kind of guided"): + with pytest.raises(ValueError, match="You can only use one kind of guided"): GuidedDecodingParams(json=sample_json_schema, json_object=True) - with pytest.raises(ValueError, - match="You can only use one kind of guided"): + with pytest.raises(ValueError, match="You can only use one kind of guided"): GuidedDecodingParams(json=sample_json_schema, choice=["a", "b"]) - with pytest.raises(ValueError, - match="You can only use one kind of guided"): + with pytest.raises(ValueError, match="You can only use one kind of guided"): GuidedDecodingParams(json=sample_json_schema, grammar="test grammar") @@ -193,8 +200,7 @@ def test_guided_decoding_backend_options(): """Test backend-specific options""" with pytest.warns(DeprecationWarning): guided_decoding_params = GuidedDecodingParams( - backend= - "xgrammar:no-fallback,disable-any-whitespace,no-additional-properties" + backend="xgrammar:no-fallback,disable-any-whitespace,no-additional-properties" ) assert guided_decoding_params.backend == "xgrammar" assert guided_decoding_params.disable_fallback @@ -208,12 +214,11 @@ def test_pickle_xgrammar_tokenizer_data(): except ImportError: pytest.skip("Could not import xgrammar to run test") - from vllm.model_executor.guided_decoding.xgrammar_decoding import ( - TokenizerData) + from vllm.model_executor.guided_decoding.xgrammar_decoding import TokenizerData + tokenizer_data = TokenizerData( - metadata= - '{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}', - encoded_vocab=['!', '"', '#', '$', '%'], + metadata='{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}', + encoded_vocab=["!", '"', "#", "$", "%"], ) pickled = pickle.dumps(tokenizer_data) @@ -222,5 +227,6 @@ def test_pickle_xgrammar_tokenizer_data(): depickled: TokenizerData = pickle.loads(pickled) assert depickled is not None - assert json.loads( - depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value + assert ( + json.loads(depickled.metadata)["vocab_type"] == xgr.VocabType.BYTE_LEVEL.value + ) diff --git a/tests/model_executor/test_logits_processor.py b/tests/model_executor/test_logits_processor.py index
532ebba038d3..af09cc7b4207 100644 --- a/tests/model_executor/test_logits_processor.py +++ b/tests/model_executor/test_logits_processor.py @@ -15,38 +15,36 @@ class MockLogitsProcessor(LogitsProcessor): - - def __init__(self, vocab_size: int, scale: float, - fake_logits: torch.Tensor): + def __init__(self, vocab_size: int, scale: float, fake_logits: torch.Tensor): super().__init__(vocab_size=vocab_size, scale=scale) self.fake_logits = fake_logits.clone() def forward(self, *args, **kwargs): - with patch( + with ( + patch( "vllm.model_executor.layers.logits_processor._prune_hidden_states", - lambda x, y: x - ), patch( + lambda x, y: x, + ), + patch( "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits", - lambda *args, **kwargs: self.fake_logits): + lambda *args, **kwargs: self.fake_logits, + ), + ): return super().forward(*args, **kwargs) def _prepare_test( - batch_size: int + batch_size: int, ) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: vocab_size = 32000 input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, vocab_size), - 1e-2, - dtype=input_tensor.dtype) + fake_logits = torch.full((batch_size, vocab_size), 1e-2, dtype=input_tensor.dtype) logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits) return input_tensor, fake_logits, logits_processor RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] @pytest.mark.parametrize("seed", RANDOM_SEEDS) @@ -72,10 +70,12 @@ def pick_ith(token_ids, logits): request_id=f"test_{i}", is_prompt=True, seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0, - logits_processors=[pick_ith]), + sampling_params=SamplingParams( + temperature=0, logits_processors=[pick_ith] + ), block_tables={0: [1]}, - )) + ) + ) seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) sampling_metadata = SamplingMetadata.prepare( @@ -83,16 +83,15 @@ def pick_ith(token_ids, logits): seq_lens, query_lens=seq_lens, device=device, - pin_memory=is_pin_memory_available()) + pin_memory=is_pin_memory_available(), + ) logits_processor_output = logits_processor( - lm_head=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) + lm_head=None, hidden_states=input_tensor, sampling_metadata=sampling_metadata + ) assert torch.isinf(logits_processor_output[:, 0]).all() fake_logits *= logits_processor.scale - torch.testing.assert_close(logits_processor_output[:, 1], - fake_logits[:, 1], - rtol=1e-4, - atol=0.0) + torch.testing.assert_close( + logits_processor_output[:, 1], fake_logits[:, 1], rtol=1e-4, atol=0.0 + ) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 4bdb651e5170..43fdacc4b6e6 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -14,23 +14,26 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") -MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", - "intfloat/multilingual-e5-base") +MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", "intfloat/multilingual-e5-base") REVISION_ROBERTA = os.environ.get("REVISION", "main") -@pytest.mark.skipif(current_platform.is_rocm(), - reason="Xformers backend is not supported on ROCm.") +@pytest.mark.skipif( + 
current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." +) def test_model_loading_with_params(vllm_runner): """ Test parameter weight loading with tp>1. """ - with vllm_runner(model_name=MODEL_NAME, - revision=REVISION, - dtype="float16", - max_model_len=MAX_MODEL_LEN) as vllm_model: - output = vllm_model.embed("Write a short story about a robot that" - " dreams for the first time.\n") + with vllm_runner( + model_name=MODEL_NAME, + revision=REVISION, + dtype="float16", + max_model_len=MAX_MODEL_LEN, + ) as vllm_model: + output = vllm_model.embed( + "Write a short story about a robot that dreams for the first time.\n" + ) model_config = vllm_model.model.llm_engine.model_config model_tokenizer = vllm_model.model.llm_engine.tokenizer @@ -57,18 +60,22 @@ def check_model(model): assert output -@pytest.mark.skipif(current_platform.is_rocm(), - reason="Xformers backend is not supported on ROCm.") +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." +) def test_roberta_model_loading_with_params(vllm_runner): """ Test parameter weight loading with tp>1. """ - with vllm_runner(model_name=MODEL_NAME_ROBERTA, - revision=REVISION_ROBERTA, - dtype="float16", - max_model_len=MAX_MODEL_LEN) as vllm_model: - output = vllm_model.embed("Write a short story about a robot that" - " dreams for the first time.\n") + with vllm_runner( + model_name=MODEL_NAME_ROBERTA, + revision=REVISION_ROBERTA, + dtype="float16", + max_model_len=MAX_MODEL_LEN, + ) as vllm_model: + output = vllm_model.embed( + "Write a short story about a robot that dreams for the first time.\n" + ) model_config = vllm_model.model.llm_engine.model_config model_tokenizer = vllm_model.model.llm_engine.tokenizer @@ -95,18 +102,20 @@ def check_model(model): assert output -@pytest.mark.skipif(current_platform.is_rocm(), - reason="Xformers backend is not supported on ROCm.") +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." +) def test_facebook_roberta_model_loading_with_params(vllm_runner): """ Test loading roberta-base model with no lm_head. 
""" model_name = "FacebookAI/roberta-base" - with vllm_runner(model_name=model_name, - dtype="float16", - max_model_len=MAX_MODEL_LEN) as vllm_model: - output = vllm_model.embed("Write a short story about a robot that" - " dreams for the first time.\n") + with vllm_runner( + model_name=model_name, dtype="float16", max_model_len=MAX_MODEL_LEN + ) as vllm_model: + output = vllm_model.embed( + "Write a short story about a robot that dreams for the first time.\n" + ) model_tokenizer = vllm_model.model.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py index df625b8d6004..6dc120ddbac9 100644 --- a/tests/model_executor/test_weight_utils.py +++ b/tests/model_executor/test_weight_utils.py @@ -9,23 +9,24 @@ from huggingface_hub.utils import LocalEntryNotFoundError from vllm.model_executor.model_loader.weight_utils import ( - download_weights_from_hf, enable_hf_transfer) + download_weights_from_hf, + enable_hf_transfer, +) def test_hf_transfer_auto_activation(): if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ: # in case it is already set, we can't test the auto activation - pytest.skip( - "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation") + pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation") enable_hf_transfer() try: # enable hf hub transfer if available import hf_transfer # type: ignore # noqa + HF_TRANSFER_ACTIVE = True except ImportError: HF_TRANSFER_ACTIVE = False - assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == - HF_TRANSFER_ACTIVE) + assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == HF_TRANSFER_ACTIVE def test_download_weights_from_hf(): @@ -34,22 +35,30 @@ def test_download_weights_from_hf(): # if offline is set and model is not cached huggingface_hub.constants.HF_HUB_OFFLINE = True with pytest.raises(LocalEntryNotFoundError): - download_weights_from_hf("facebook/opt-125m", - allow_patterns=["*.safetensors", "*.bin"], - cache_dir=tmpdir) + download_weights_from_hf( + "facebook/opt-125m", + allow_patterns=["*.safetensors", "*.bin"], + cache_dir=tmpdir, + ) # download the model huggingface_hub.constants.HF_HUB_OFFLINE = False - download_weights_from_hf("facebook/opt-125m", - allow_patterns=["*.safetensors", "*.bin"], - cache_dir=tmpdir) + download_weights_from_hf( + "facebook/opt-125m", + allow_patterns=["*.safetensors", "*.bin"], + cache_dir=tmpdir, + ) # now it should work offline huggingface_hub.constants.HF_HUB_OFFLINE = True - assert download_weights_from_hf( - "facebook/opt-125m", - allow_patterns=["*.safetensors", "*.bin"], - cache_dir=tmpdir) is not None + assert ( + download_weights_from_hf( + "facebook/opt-125m", + allow_patterns=["*.safetensors", "*.bin"], + cache_dir=tmpdir, + ) + is not None + ) if __name__ == "__main__": diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py index b4c771840196..2c008bfb7507 100644 --- a/tests/models/language/generation/test_bart.py +++ b/tests/models/language/generation/test_bart.py @@ -7,8 +7,12 @@ from vllm.sequence import SampleLogprobs -from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt, - HfRunner, VllmRunner) +from ....conftest import ( + DecoderPromptType, + ExplicitEncoderDecoderPrompt, + HfRunner, + VllmRunner, +) from ....utils import multi_gpu_test from ...utils import check_logprobs_close @@ -40,7 +44,7 @@ def run_test( tensor_parallel_size: int, distributed_executor_backend: 
Optional[str] = None, ) -> None: - ''' + """ Test the vLLM BART model for a variety of encoder/decoder input prompts, by validating it against HuggingFace (HF) BART. @@ -48,7 +52,7 @@ def run_test( * hf_runner: HuggingFace (HF) test model runner * vllm_runner: vLLM test model runner - * example_encoder_decoder_prompts: test fixture which provides a + * example_encoder_decoder_prompts: test fixture which provides a dictionary of dummy prompts * model: the HF ID of the specific BART variant under test * dtype: the tensor datatype to employ @@ -59,45 +63,45 @@ def run_test( prompt scenarios to test A note on using HF BART as a baseline for validating vLLM BART, - specifically when the decoder prompt is None. - + specifically when the decoder prompt is None. + The HF GenerationMixin's default behavior is to force the first decoded token to be <BOS> if the prompt does not already contain <BOS> (this is accomplished using a logit processor setting.) - + So when we use HF BART as our baseline for comparison, note that when the user provides a request with a None decoder prompt (i.e. a singleton encoder prompt, or else an explicit encoder/ decoder prompt with the decoder sub-prompt set to None), HF and vLLM handle this in different ways: - - * HF will (1) tokenize the None prompt as an empty token-list, + + * HF will (1) tokenize the None prompt as an empty token-list, (2) append <decoder-start-token> to the beginning, yielding [<decoder-start-token>], (3) pass this token list to the model, and then (4) after computing logits during prefill, override the model logits & force <BOS> to be the first generated token. - + * vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder- start-token to the beginning, yielding [<decoder-start-token><BOS>], (3) pass these tokens to the model & proceed with generation. - + The net effect is that compared to vLLM, the list of HF *decoded* tokens will contain one more initial <BOS> than the vLLM generated tokens, because vLLM's <BOS> token is injected into the prompt rather than into the generated output. This is in spite of the fact that overall, the complete sequences (prompt + decoded tokens) produced by vLLM will match HF. - + So when we use HF decoded token output to validate vLLM's decoded token output, the testing process must account for the difference in decoded token sequences between vLLM and HF specifically in the - decoder-prompt-is-None case. - + decoder-prompt-is-None case. + One option is to disable the logit processor feature that forces the <BOS> token to be decoded (forced_bos_token_id = None), eliminating the problem entirely. However this is not "normal" BART usage. - + The other option is - only in the decoder-prompt-is-None case - to discard the first decoded token from the HF output before comparing it to vLLM. @@ -105,7 +109,7 @@ def run_test( To that end, when testing the scenario where the decoder prompt is None (and only in that one scenario), this test skips the first HF decoded token during the process of validating the vLLM decoded output. - ''' + """ # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. @@ -122,13 +126,16 @@ def run_test( # decoder-only unit tests expect), so when testing an encoder/decoder # model we must explicitly specify enforce_eager=True in the VllmRunner # constructor.
- with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + ) as vllm_model: vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - prompts, max_tokens, num_logprobs) + prompts, max_tokens, num_logprobs + ) # Configuration settings for HF baseline hf_kwargs = { @@ -139,20 +146,18 @@ def run_test( "length_penalty": 1.0, "early_stopping": False, "no_repeat_ngram_size": None, - "min_length": 0 + "min_length": 0, } - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( + with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit( prompts, max_tokens, num_logprobs, **hf_kwargs, - )) + ) - hf_skip_tokens = (1 - if decoder_prompt_type == DecoderPromptType.NONE else 0) + hf_skip_tokens = 1 if decoder_prompt_type == DecoderPromptType.NONE else 0 check_logprobs_close( outputs_0_lst=hf_outputs, @@ -169,8 +174,9 @@ def run_test( @pytest.mark.parametrize( "model", [ - pytest.param("facebook/bart-base", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param( + "facebook/bart-base", marks=[pytest.mark.core_model, pytest.mark.cpu_model] + ), pytest.param("facebook/bart-large-cnn"), ], ) @@ -178,9 +184,16 @@ def run_test( @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) -def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, - dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: - +def test_models( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts, + model, + dtype, + max_tokens, + num_logprobs, + decoder_prompt_type, +) -> None: run_test( hf_runner, vllm_runner, @@ -201,11 +214,17 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM]) -def test_models_distributed(hf_runner, vllm_runner, - example_encoder_decoder_prompts, - distributed_executor_backend, model, dtype, - max_tokens, num_logprobs, - decoder_prompt_type) -> None: +def test_models_distributed( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts, + distributed_executor_backend, + model, + dtype, + max_tokens, + num_logprobs, + decoder_prompt_type, +) -> None: run_test( hf_runner, vllm_runner, diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index ea240d227889..816137e2b8e7 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -62,8 +62,7 @@ pytest.param( "openbmb/MiniCPM3-4B", # fused_moe not supported on CPU - marks=[pytest.mark.core_model, - large_gpu_mark(min_gb=32)], + marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)], ), pytest.param( "facebook/opt-125m", # opt @@ -92,16 +91,24 @@ pytest.param( "allenai/OLMoE-1B-7B-0924-Instruct", marks=[pytest.mark.cpu_model], - ) - ]) + ), + ], +) @pytest.mark.parametrize("max_tokens", [32]) 
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize( - "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) -def test_models(hf_runner, vllm_runner, example_prompts, model: str, - max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, - monkeypatch) -> None: - + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False] +) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + max_tokens: int, + num_logprobs: int, + use_rocm_aiter: bool, + monkeypatch, +) -> None: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") @@ -122,34 +129,37 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) - prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds - else None) + prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None prompt_token_ids = [] for prompt in example_prompts: - token_ids = hf_model.tokenizer(prompt, - return_tensors="pt").input_ids.to( - hf_model.model.device) + token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to( + hf_model.model.device + ) prompt_token_ids.append(token_ids) if prompt_embeds is not None: - prompt_embeds.append(hf_model.model.get_input_embeddings()( - token_ids).squeeze(0)) + prompt_embeds.append( + hf_model.model.get_input_embeddings()(token_ids).squeeze(0) + ) with vllm_runner( - model, - tokenizer_name=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - trust_remote_code=model_info.trust_remote_code, - max_num_seqs=2, - enable_prompt_embeds=use_prompt_embeds, + model, + tokenizer_name=model_info.tokenizer or model, + tokenizer_mode=model_info.tokenizer_mode, + trust_remote_code=model_info.trust_remote_code, + max_num_seqs=2, + enable_prompt_embeds=use_prompt_embeds, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) if prompt_embeds is not None: vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs( - prompt_embeds, max_tokens, num_logprobs) + prompt_embeds, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/tests/models/language/generation/test_gemma.py b/tests/models/language/generation/test_gemma.py index 5be4ae874e61..85b6f29b151c 100644 --- a/tests/models/language/generation/test_gemma.py +++ b/tests/models/language/generation/test_gemma.py @@ -11,17 +11,17 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None: with monkeypatch.context() as m: m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner( - model, - load_format="dummy", + model, + load_format="dummy", ) as llm: if model == "google/gemma-3-4b-it": normalizers = llm.model.collective_rpc( - lambda self: self.model_runner.model.language_model.model. 
- normalizer.cpu().item()) + lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() + ) config = llm.model.llm_engine.model_config.hf_config.text_config else: normalizers = llm.model.collective_rpc( - lambda self: self.model_runner.model.model.normalizer.cpu( - ).item()) + lambda self: self.model_runner.model.model.normalizer.cpu().item() + ) config = llm.model.llm_engine.model_config.hf_config assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3) diff --git a/tests/models/language/generation/test_granite.py b/tests/models/language/generation/test_granite.py index 2a39f78a708e..e569e75ff3a8 100644 --- a/tests/models/language/generation/test_granite.py +++ b/tests/models/language/generation/test_granite.py @@ -26,11 +26,13 @@ def test_models( ) -> None: with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index eba14e64553e..5ca37df49051 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -77,7 +77,6 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - try: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") @@ -88,13 +87,15 @@ def test_models( with hf_runner(model) as hf_model: if model not in HF_UNSUPPORTED_MODELS: hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) else: hf_outputs = None with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: vllm_v0_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) if model in V1_SUPPORTED_MODELS: with monkeypatch.context() as m: @@ -102,12 +103,15 @@ def test_models( if model in HYBRID_MODELS: # required due to reorder_batch behaviour m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") - with vllm_runner(model, - max_num_seqs=MAX_NUM_SEQS, - enforce_eager=True, - enable_prefix_caching=False) as vllm_model: + with vllm_runner( + model, + max_num_seqs=MAX_NUM_SEQS, + enforce_eager=True, + enable_prefix_caching=False, + ) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) else: vllm_v1_outputs = None @@ -139,7 +143,6 @@ def test_batching( max_tokens: int, num_logprobs: int, ) -> None: - try: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") @@ -150,13 +153,14 @@ def test_batching( for_loop_outputs = [] with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: for prompt in example_prompts: - single_output, = vllm_model.generate_greedy_logprobs([prompt], - max_tokens, - num_logprobs) + (single_output,) = vllm_model.generate_greedy_logprobs( + [prompt], max_tokens, num_logprobs + ) for_loop_outputs.append(single_output) batched_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, 
num_logprobs) + example_prompts, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=for_loop_outputs, @@ -181,18 +185,22 @@ def test_chunked_prefill( max_num_seqs = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size - with vllm_runner(model, - enable_chunked_prefill=True, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs) as vllm_model: - chunked = vllm_model.generate_greedy_logprobs(example_prompts, - max_tokens, num_logprobs) + with vllm_runner( + model, + enable_chunked_prefill=True, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + ) as vllm_model: + chunked = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs + ) - with vllm_runner(model, - enable_chunked_prefill=False, - max_num_seqs=max_num_seqs) as vllm_model: + with vllm_runner( + model, enable_chunked_prefill=False, max_num_seqs=max_num_seqs + ) as vllm_model: non_chunked = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=chunked, @@ -211,8 +219,8 @@ def test_chunked_prefill_with_parallel_sampling( max_tokens: int, ) -> None: """ - Tests chunked prefill in conjunction with n > 1. - + Tests chunked prefill in conjunction with n > 1. + In this case, prefill is populated with decoding tokens and we test that it doesn't fail. @@ -220,16 +228,13 @@ def test_chunked_prefill_with_parallel_sampling( decoding steps inside a chunked prefill forward pass (where we have both prefill and decode together) """ - sampling_params = SamplingParams(n=3, - temperature=1, - seed=0, - max_tokens=max_tokens) + sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens) with vllm_runner( - model, - enable_chunked_prefill=True, - # forces prefill chunks with decoding - max_num_batched_tokens=MAX_NUM_SEQS * 3, - max_num_seqs=MAX_NUM_SEQS, + model, + enable_chunked_prefill=True, + # forces prefill chunks with decoding + max_num_batched_tokens=MAX_NUM_SEQS * 3, + max_num_seqs=MAX_NUM_SEQS, ) as vllm_model: vllm_model.generate(example_prompts, sampling_params) @@ -247,10 +252,8 @@ def test_mamba_cache_cg_padding( batch size. If it's not, a torch RuntimeError will be raised because tensor dimensions aren't compatible. """ - vllm_config = EngineArgs(model=model, - trust_remote_code=True).create_engine_config() - while len(example_prompts) == vllm_config.pad_for_cudagraph( - len(example_prompts)): + vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config() + while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)): example_prompts.append(example_prompts[0]) try: @@ -260,7 +263,8 @@ def test_mamba_cache_cg_padding( pytest.fail( "Couldn't run batch size which is not equal to a Cuda Graph " "captured batch size. 
" - "Could be related to mamba cache not padded correctly") + "Could be related to mamba cache not padded correctly" + ) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @@ -277,8 +281,7 @@ def test_models_preemption_recompute( with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: scheduler = vllm_model.model.llm_engine.scheduler[0] scheduler.ENABLE_ARTIFICIAL_PREEMPT = True - preempt_vllm_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) + preempt_vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) scheduler.ENABLE_ARTIFICIAL_PREEMPT = False vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) @@ -310,8 +313,10 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: vllm_model.generate_greedy([example_prompts[0]] * 100, 10) except ValueError: - pytest.fail("Hybrid inner state wasn't cleaned up properly between" - "steps finished requests registered unnecessarily ") + pytest.fail( + "Hybrid inner state wasn't cleaned up properly between" + "steps finished requests registered unnecessarily " + ) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @@ -320,10 +325,10 @@ def test_state_cleanup( example_prompts, model: str, ) -> None: - """ + """ This test is for verifying that the Hybrid state is cleaned up between steps. - + If its not cleaned, an error would be expected. """ try: @@ -331,8 +336,10 @@ def test_state_cleanup( for _ in range(10): vllm_model.generate_greedy([example_prompts[0]] * 100, 1) except ValueError: - pytest.fail("Hybrid inner state wasn't cleaned up between states, " - "could be related to finished_requests_ids") + pytest.fail( + "Hybrid inner state wasn't cleaned up between states, " + "could be related to finished_requests_ids" + ) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @@ -343,15 +350,13 @@ def test_multistep_correctness( model: str, max_tokens: int, ) -> None: - with vllm_runner(model, num_scheduler_steps=8, - max_num_seqs=2) as vllm_model: - vllm_outputs_multistep = vllm_model.generate_greedy( - example_prompts, max_tokens) + with vllm_runner(model, num_scheduler_steps=8, max_num_seqs=2) as vllm_model: + vllm_outputs_multistep = vllm_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner(model, num_scheduler_steps=1, - max_num_seqs=2) as vllm_model: + with vllm_runner(model, num_scheduler_steps=1, max_num_seqs=2) as vllm_model: vllm_outputs_single_step = vllm_model.generate_greedy( - example_prompts, max_tokens) + example_prompts, max_tokens + ) check_outputs_equal( outputs_0_lst=vllm_outputs_multistep, @@ -372,15 +377,15 @@ def test_distributed_correctness( max_tokens: int, num_logprobs: int, ) -> None: - with vllm_runner(model, tensor_parallel_size=1, - max_num_seqs=2) as vllm_model: + with vllm_runner(model, tensor_parallel_size=1, max_num_seqs=2) as vllm_model: vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) - with vllm_runner(model, tensor_parallel_size=2, - max_num_seqs=2) as vllm_model: + with vllm_runner(model, tensor_parallel_size=2, max_num_seqs=2) as vllm_model: vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=vllm_outputs_tp_1, diff --git 
a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index c70698ede37a..c477789628b4 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -8,7 +8,9 @@ import pytest from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( - MistralToolCall, MistralToolParser) + MistralToolCall, + MistralToolParser, +) from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.transformers_utils.tokenizer import MistralTokenizer @@ -35,136 +37,114 @@ ] # for function calling -TOOLS = [{ - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": - "string", - "description": - "The city to find the weather for, e.g. 'San Francisco'" - }, - "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the city is" - " in, e.g. 'CA' which would mean 'California'" +TOOLS = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to find the weather for, e.g. 'San Francisco'", + }, + "state": { + "type": "string", + "description": "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"] - } + "required": ["city", "state", "unit"], }, - "required": ["city", "state", "unit"] - } + }, }, -}, { - "type": "function", - "function": { - "name": "rewrite", - "description": "Rewrites text", - "parameters": { - "type": "object", - "required": [], - "properties": { - "text": { - "type": "string", - "description": "The input text to rewrite." - } - } - } - } -}] -MSGS = [ { - "role": "system", - "content": "You are an assistant." + "type": "function", + "function": { + "name": "rewrite", + "description": "Rewrites text", + "parameters": { + "type": "object", + "required": [], + "properties": { + "text": { + "type": "string", + "description": "The input text to rewrite.", + } + }, + }, + }, }, +] +MSGS = [ + {"role": "system", "content": "You are an assistant."}, { - "role": - "user", - "content": - "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa + "role": "user", + "content": "Could you please rewrite the below article? 
\n\n My English needs improvving, maybe I make errors.", # noqa }, { - "role": - "assistant", - "content": - "", - "tool_calls": [{ - "id": "bbc5b7ede", - "type": "function", - "function": { - "name": - "rewrite", - "arguments": - '{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "bbc5b7ede", + "type": "function", + "function": { + "name": "rewrite", + "arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa + }, } - }] + ], }, { "role": "tool", - "content": - "{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa + "content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa "tool_call_id": "bbc5b7ede", - "name": "rewrite" + "name": "rewrite", }, { "role": "assistant", - "content": "---\n\nMy English needs improving, maybe I make errors" + "content": "---\n\nMy English needs improving, maybe I make errors", }, { - "role": - "user", - "content": ("Can you tell me what the temperate" - " will be in Dallas, in fahrenheit?") - } + "role": "user", + "content": ( + "Can you tell me what the temperate will be in Dallas, in fahrenheit?" + ), + }, ] SAMPLE_JSON_SCHEMA = { "type": "object", "properties": { - "name": { - "type": "string" - }, - "age": { - "type": "integer" - }, + "name": {"type": "string"}, + "age": {"type": "integer"}, "skills": { "type": "array", - "items": { - "type": "string", - "maxLength": 10 - }, - "minItems": 3 + "items": {"type": "string", "maxLength": 10}, + "minItems": 3, }, "work_history": { "type": "array", "items": { "type": "object", "properties": { - "company": { - "type": "string" - }, - "duration": { - "type": "number" - }, - "position": { - "type": "string" - } + "company": {"type": "string"}, + "duration": {"type": "number"}, + "position": {"type": "string"}, }, - "required": ["company", "position"] - } - } + "required": ["company", "position"], + }, + }, }, - "required": ["name", "age", "skills", "work_history"] + "required": ["name", "age", "skills", "work_history"], } @@ -172,17 +152,25 @@ @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, example_prompts, model: str, - dtype: str, max_tokens: int, num_logprobs: int) -> None: +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: # TODO(sang): Sliding window should be tested separately. 
with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) - with vllm_runner(model, dtype=dtype, - tokenizer_mode="mistral") as vllm_model: + with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=hf_outputs, @@ -196,27 +184,35 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, - max_tokens: int, num_logprobs: int) -> None: +def test_mistral_format( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: with vllm_runner( - model, - dtype=dtype, - tokenizer_mode="mistral", - load_format="mistral", - config_format="mistral", + model, + dtype=dtype, + tokenizer_mode="mistral", + load_format="mistral", + config_format="mistral", ) as mistral_format_model: mistral_format_outputs = mistral_format_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) with vllm_runner( - model, - dtype=dtype, - tokenizer_mode="auto", - load_format="safetensors", - config_format="hf", + model, + dtype=dtype, + tokenizer_mode="auto", + load_format="safetensors", + config_format="hf", ) as hf_format_model: hf_format_outputs = hf_format_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=hf_format_outputs, @@ -228,34 +224,35 @@ def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_mistral_symbolic_languages(vllm_runner, model: str, - dtype: str) -> None: - with vllm_runner(model, - dtype=dtype, - max_model_len=8192, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") as vllm_model: +def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None: + with vllm_runner( + model, + dtype=dtype, + max_model_len=8192, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral", + ) as vllm_model: for prompt in SYMBOLIC_LANG_PROMPTS: msg = {"role": "user", "content": prompt} - outputs = vllm_model.model.chat([msg], - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.model.chat([msg], sampling_params=SAMPLING_PARAMS) assert "�" not in outputs[0].outputs[0].text.strip() @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: - with vllm_runner(model, - dtype=dtype, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") as vllm_model: - + with vllm_runner( + model, + dtype=dtype, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral", + ) as vllm_model: msgs = copy.deepcopy(MSGS) - outputs = vllm_model.model.chat(msgs, - tools=TOOLS, - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.model.chat( + msgs, tools=TOOLS, 
sampling_params=SAMPLING_PARAMS + ) tokenizer = vllm_model.model.get_tokenizer() tool_parser = MistralToolParser(tokenizer) @@ -267,16 +264,18 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: assert parsed_message.tools_called assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id) - assert parsed_message.tool_calls[ - 0].function.name == "get_current_weather" - assert parsed_message.tool_calls[ - 0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa + assert parsed_message.tool_calls[0].function.name == "get_current_weather" + assert ( + parsed_message.tool_calls[0].function.arguments + == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' + ) # noqa assert parsed_message.content is None @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("guided_backend", - ["outlines", "lm-format-enforcer", "xgrammar"]) +@pytest.mark.parametrize( + "guided_backend", ["outlines", "lm-format-enforcer", "xgrammar"] +) def test_mistral_guided_decoding( monkeypatch: pytest.MonkeyPatch, vllm_runner, @@ -288,26 +287,24 @@ def test_mistral_guided_decoding( m.setenv("VLLM_USE_V1", "0") with vllm_runner( - model, - dtype='bfloat16', - tokenizer_mode="mistral", - guided_decoding_backend=guided_backend, + model, + dtype="bfloat16", + tokenizer_mode="mistral", + guided_decoding_backend=guided_backend, ) as vllm_model: guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA) - params = SamplingParams(max_tokens=512, - temperature=0.7, - guided_decoding=guided_decoding) - - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {SAMPLE_JSON_SCHEMA}" - }] + params = SamplingParams( + max_tokens=512, temperature=0.7, guided_decoding=guided_decoding + ) + + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + { + "role": "user", + "content": f"Give an example JSON for an employee profile that " + f"fits this schema: {SAMPLE_JSON_SCHEMA}", + }, + ] outputs = vllm_model.model.chat(messages, sampling_params=params) generated_text = outputs[0].outputs[0].text @@ -315,8 +312,7 @@ def test_mistral_guided_decoding( assert outputs is not None try: - jsonschema.validate(instance=json_response, - schema=SAMPLE_JSON_SCHEMA) + jsonschema.validate(instance=json_response, schema=SAMPLE_JSON_SCHEMA) except jsonschema.exceptions.ValidationError: pytest.fail("Generated response is not valid with JSON schema") @@ -346,17 +342,10 @@ def get_vocab(): "city": "Dallas", "state": "TX", "unit": "fahrenheit", - "sub_dict": { - "foo": "bar", - "inner": { - "x": 1, - "y": 2 - } - }, + "sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}}, } - model_output = ( - f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}") + model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}" parsed = parser.extract_tool_calls(model_output, None) diff --git a/tests/models/language/generation/test_phimoe.py b/tests/models/language/generation/test_phimoe.py index 6c9cc2821c30..e640655784cc 100644 --- a/tests/models/language/generation/test_phimoe.py +++ b/tests/models/language/generation/test_phimoe.py @@ -15,62 +15,56 @@ def test_phimoe_routing_function(): from vllm.model_executor.models.phimoe import phimoe_routing_function + test_case = { 0: { - "hidden_states": - torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], - dtype=torch.float32, - requires_grad=False).view(4, 2), - 
"gating_output": - torch.tensor([0.1, 0.2, 0.3, 0.4], - dtype=torch.float32, - requires_grad=False), - "topk": - 2, - "renormalize": - False, + "hidden_states": torch.tensor( + [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False + ).view(4, 2), + "gating_output": torch.tensor( + [0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False + ), + "topk": 2, + "renormalize": False, }, 1: { - "hidden_states": - torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], - dtype=torch.float32, - requires_grad=False).view(4, 2), - "gating_output": - torch.tensor([0.4, 0.2, 0.3, 0.4], - dtype=torch.float32, - requires_grad=False), - "topk": - 2, - "renormalize": - False, - } + "hidden_states": torch.tensor( + [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False + ).view(4, 2), + "gating_output": torch.tensor( + [0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False + ), + "topk": 2, + "renormalize": False, + }, } ground_truth = { 0: { - "topk_weights": - torch.tensor([1., 1.], dtype=torch.float32, requires_grad=False), - "topk_ids": - torch.tensor([3, 2], dtype=torch.long, requires_grad=False), + "topk_weights": torch.tensor( + [1.0, 1.0], dtype=torch.float32, requires_grad=False + ), + "topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False), }, 1: { - "topk_weights": - torch.tensor([0.5, 1.], dtype=torch.float32, requires_grad=False), - "topk_ids": - torch.tensor([0, 3], dtype=torch.long, requires_grad=False), - } + "topk_weights": torch.tensor( + [0.5, 1.0], dtype=torch.float32, requires_grad=False + ), + "topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False), + }, } for test_id in test_case: topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id]) - assert torch.allclose(topk_weights, - ground_truth[test_id]["topk_weights"]) + assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"]) assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="This test takes a lot time to run on CPU, " - "and vllm CI's disk space is not enough for this model.") +@pytest.mark.skipif( + condition=current_platform.is_cpu(), + reason="This test takes a lot time to run on CPU, " + "and vllm CI's disk space is not enough for this model.", +) @large_gpu_test(min_gb=80) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @@ -87,11 +81,13 @@ def test_models( ) -> None: with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + example_prompts, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index a663679a9c7c..66cdb5dd2a20 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -6,8 +6,7 @@ import pytest from tests.conftest import HfRunner -from tests.models.utils import (EmbedModelInfo, check_embeddings_close, - matryoshka_fy) +from tests.models.utils import EmbedModelInfo, check_embeddings_close, matryoshka_fy def run_embedding_correctness_test( @@ -29,12 +28,14 @@ def run_embedding_correctness_test( ) -def 
correctness_test_embed_models(hf_runner, - vllm_runner, - model_info: EmbedModelInfo, - example_prompts, - vllm_extra_kwargs=None, - hf_model_callback=None): +def correctness_test_embed_models( + hf_runner, + vllm_runner, + model_info: EmbedModelInfo, + example_prompts, + vllm_extra_kwargs=None, + hf_model_callback=None, +): if not model_info.enable_test: # A model family has many models with the same architecture, # and we don't need to test each one. @@ -51,18 +52,16 @@ def correctness_test_embed_models(hf_runner, vllm_extra_kwargs = vllm_extra_kwargs or {} vllm_extra_kwargs["dtype"] = model_info.dtype - with vllm_runner(model_info.name, - task="embed", - max_model_len=None, - **vllm_extra_kwargs) as vllm_model: + with vllm_runner( + model_info.name, task="embed", max_model_len=None, **vllm_extra_kwargs + ) as vllm_model: vllm_outputs = vllm_model.embed(example_prompts) with hf_runner( - model_info.name, - dtype="float32", - is_sentence_transformer=True, + model_info.name, + dtype="float32", + is_sentence_transformer=True, ) as hf_model: - if hf_model_callback is not None: hf_model_callback(hf_model) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 6c4fde5fdfa9..65c82f9a03c4 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -27,7 +27,6 @@ class VllmMtebEncoder(mteb.Encoder): - def __init__(self, vllm_model): super().__init__() self.model = vllm_model @@ -50,8 +49,7 @@ def encode( def predict( self, - sentences: list[tuple[str, str, - Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt *args, **kwargs, ) -> np.ndarray: @@ -61,17 +59,15 @@ def predict( queries = [s[0] for s in sentences] corpus = [s[1] for s in sentences] - outputs = self.model.score(queries, - corpus, - truncate_prompt_tokens=-1, - use_tqdm=False) + outputs = self.model.score( + queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False + ) scores = np.array(outputs) scores = scores[np.argsort(r)] return scores class OpenAIClientMtebEncoder(mteb.Encoder): - def __init__(self, model_name: str, client): super().__init__() self.model_name = model_name @@ -84,8 +80,9 @@ def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray: r = self.rng.permutation(len(sentences)) sentences = [sentences[i] for i in r] - embeddings = self.client.embeddings.create(model=self.model_name, - input=sentences) + embeddings = self.client.embeddings.create( + model=self.model_name, input=sentences + ) outputs = [d.embedding for d in embeddings.data] embeds = np.array(outputs) embeds = embeds[np.argsort(r)] @@ -93,7 +90,6 @@ def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray: class ScoreClientMtebEncoder(mteb.Encoder): - def __init__(self, model_name: str, url): super().__init__() self.model_name = model_name @@ -102,8 +98,7 @@ def __init__(self, model_name: str, url): def predict( self, - sentences: list[tuple[str, str, - Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt *args, **kwargs, ) -> np.ndarray: @@ -119,27 +114,30 @@ def predict( return scores def get_score(self, query, corpus): - response = requests.post(self.url, - json={ - "model": self.model_name, - "text_1": query, - "text_2": corpus, - "truncate_prompt_tokens": -1, - }).json() - return response['data'][0]["score"] + response = requests.post( + self.url, + json={ + "model": self.model_name, + 
"text_1": query, + "text_2": corpus, + "truncate_prompt_tokens": -1, + }, + ).json() + return response["data"][0]["score"] class RerankClientMtebEncoder(ScoreClientMtebEncoder): - def get_score(self, query, corpus): - response = requests.post(self.url, - json={ - "model": self.model_name, - "query": query, - "documents": [corpus], - "truncate_prompt_tokens": -1, - }).json() - return response['results'][0]["relevance_score"] + response = requests.post( + self.url, + json={ + "model": self.model_name, + "query": query, + "documents": [corpus], + "truncate_prompt_tokens": -1, + }, + ).json() + return response["results"][0]["relevance_score"] def run_mteb_embed_task(encoder, tasks): @@ -158,11 +156,13 @@ def run_mteb_embed_task(encoder, tasks): return main_score -def mteb_test_embed_models(hf_runner, - vllm_runner, - model_info: EmbedModelInfo, - vllm_extra_kwargs=None, - hf_model_callback=None): +def mteb_test_embed_models( + hf_runner, + vllm_runner, + model_info: EmbedModelInfo, + vllm_extra_kwargs=None, + hf_model_callback=None, +): if not model_info.enable_test: # A model family has many models with the same architecture, # and we don't need to test each one. @@ -171,23 +171,23 @@ def mteb_test_embed_models(hf_runner, vllm_extra_kwargs = vllm_extra_kwargs or {} vllm_extra_kwargs["dtype"] = model_info.dtype - with vllm_runner(model_info.name, - task="embed", - max_model_len=None, - **vllm_extra_kwargs) as vllm_model: - + with vllm_runner( + model_info.name, task="embed", max_model_len=None, **vllm_extra_kwargs + ) as vllm_model: if model_info.architecture: - assert (model_info.architecture - in vllm_model.model.llm_engine.model_config.architectures) + assert ( + model_info.architecture + in vllm_model.model.llm_engine.model_config.architectures + ) - vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), - MTEB_EMBED_TASKS) + vllm_main_score = run_mteb_embed_task( + VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS + ) vllm_dtype = vllm_model.model.llm_engine.model_config.dtype - with hf_runner(model_info.name, - is_sentence_transformer=True, - dtype="float32") as hf_model: - + with hf_runner( + model_info.name, is_sentence_transformer=True, dtype="float32" + ) as hf_model: if hf_model_callback is not None: hf_model_callback(hf_model) @@ -226,8 +226,7 @@ def run_mteb_rerank(cross_encoder, tasks, languages): top_k=10, save_predictions=True, output_folder=f"{results_folder}/stage2", - previous_results= - f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json", + previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json", encode_kwargs={"show_progress_bar": False}, ) main_score = results[0].scores["test"][0]["main_score"] @@ -235,14 +234,11 @@ def run_mteb_rerank(cross_encoder, tasks, languages): def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None): - with hf_runner(model_name, is_cross_encoder=True, - dtype="float32") as hf_model: - + with hf_runner(model_name, is_cross_encoder=True, dtype="float32") as hf_model: original_predict = hf_model.predict def _predict( - sentences: list[tuple[str, str, - Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt *args, **kwargs, ): @@ -256,20 +252,22 @@ def _predict( if hf_model_callback is not None: hf_model_callback(hf_model) - st_main_score = run_mteb_rerank(hf_model, - tasks=MTEB_RERANK_TASKS, - languages=MTEB_RERANK_LANGS) + st_main_score = run_mteb_rerank( + hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS + ) 
st_dtype = next(hf_model.model.model.parameters()).dtype return st_main_score, st_dtype -def mteb_test_rerank_models(hf_runner, - vllm_runner, - model_info: RerankModelInfo, - vllm_extra_kwargs=None, - hf_model_callback=None, - vllm_mteb_encoder=VllmMtebEncoder, - atol=MTEB_RERANK_TOL): +def mteb_test_rerank_models( + hf_runner, + vllm_runner, + model_info: RerankModelInfo, + vllm_extra_kwargs=None, + hf_model_callback=None, + vllm_mteb_encoder=VllmMtebEncoder, + atol=MTEB_RERANK_TOL, +): if not model_info.enable_test: # A model family has many models with the same architecture, # and we don't need to test each one. @@ -278,25 +276,29 @@ def mteb_test_rerank_models(hf_runner, vllm_extra_kwargs = vllm_extra_kwargs or {} vllm_extra_kwargs["dtype"] = model_info.dtype - with vllm_runner(model_info.name, - task="score", - max_model_len=None, - max_num_seqs=8, - **vllm_extra_kwargs) as vllm_model: - + with vllm_runner( + model_info.name, + task="score", + max_model_len=None, + max_num_seqs=8, + **vllm_extra_kwargs, + ) as vllm_model: model_config = vllm_model.model.llm_engine.model_config if model_info.architecture: - assert (model_info.architecture in model_config.architectures) + assert model_info.architecture in model_config.architectures assert model_config.hf_config.num_labels == 1 - vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model), - tasks=MTEB_RERANK_TASKS, - languages=MTEB_RERANK_LANGS) + vllm_main_score = run_mteb_rerank( + vllm_mteb_encoder(vllm_model), + tasks=MTEB_RERANK_TASKS, + languages=MTEB_RERANK_LANGS, + ) vllm_dtype = model_config.dtype st_main_score, st_dtype = mteb_test_rerank_models_hf( - hf_runner, model_info.name, hf_model_callback) + hf_runner, model_info.name, hf_model_callback + ) print("VLLM:", vllm_dtype, vllm_main_score) print("SentenceTransformers:", st_dtype, st_main_score) diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py index 64a8f25220da..9859a8b197db 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling/test_baai.py @@ -8,85 +8,75 @@ MODELS = [ ########## BertModel - EmbedModelInfo("BAAI/bge-base-en", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("BAAI/bge-base-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-en", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-en", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh-noinstruct", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-base-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-base-zh-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-zh-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh-v1.5", - architecture="BertModel", - enable_test=False), + EmbedModelInfo("BAAI/bge-base-en", architecture="BertModel", enable_test=True), + EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", 
enable_test=False), + EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False), + EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo( + "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False + ), + EmbedModelInfo( + "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False + ), + EmbedModelInfo( + "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False + ), + EmbedModelInfo( + "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False + ), + EmbedModelInfo( + "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False + ), + EmbedModelInfo( + "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False + ), + EmbedModelInfo( + "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False + ), ########## XLMRobertaModel - EmbedModelInfo("BAAI/bge-m3", - architecture="XLMRobertaModel", - enable_test=True), + EmbedModelInfo("BAAI/bge-m3", architecture="XLMRobertaModel", enable_test=True), ########## Qwen2Model - EmbedModelInfo("BAAI/bge-code-v1", - architecture="Qwen2Model", - dtype="float32", - enable_test=True), + EmbedModelInfo( + "BAAI/bge-code-v1", architecture="Qwen2Model", dtype="float32", enable_test=True + ), ] RERANK_MODELS = [ ########## XLMRobertaForSequenceClassification - RerankModelInfo("BAAI/bge-reranker-base", - architecture="XLMRobertaForSequenceClassification", - enable_test=True), - RerankModelInfo("BAAI/bge-reranker-large", - architecture="XLMRobertaForSequenceClassification", - enable_test=False), - RerankModelInfo("BAAI/bge-reranker-v2-m3", - architecture="XLMRobertaForSequenceClassification", - enable_test=False) + RerankModelInfo( + "BAAI/bge-reranker-base", + architecture="XLMRobertaForSequenceClassification", + enable_test=True, + ), + RerankModelInfo( + "BAAI/bge-reranker-large", + architecture="XLMRobertaForSequenceClassification", + enable_test=False, + ), + RerankModelInfo( + "BAAI/bge-reranker-v2-m3", + architecture="XLMRobertaForSequenceClassification", + enable_test=False, + ), ] @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: +def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_correctness(hf_runner, vllm_runner, - model_info: EmbedModelInfo, - example_prompts) -> None: - correctness_test_embed_models(hf_runner, vllm_runner, model_info, - example_prompts) +def test_embed_models_correctness( + hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts +) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts) @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb(hf_runner, vllm_runner, - model_info: RerankModelInfo) -> None: +def test_rerank_models_mteb( + hf_runner, vllm_runner, model_info: RerankModelInfo +) -> None: mteb_test_rerank_models(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py index 7fa9485dbc7f..972eb88d5d3e 100644 --- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py @@ -8,45 +8,40 @@ 
from tests.conftest import HfRunner -from .mteb_utils import (RerankModelInfo, VllmMtebEncoder, - mteb_test_rerank_models) +from .mteb_utils import RerankModelInfo, VllmMtebEncoder, mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("BAAI/bge-reranker-v2-gemma", - architecture="GemmaForSequenceClassification"), + RerankModelInfo( + "BAAI/bge-reranker-v2-gemma", architecture="GemmaForSequenceClassification" + ), ] PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501 class GemmaRerankerHfRunner(HfRunner): - - def __init__(self, - model_name: str, - dtype: str = "auto", - *args: Any, - **kwargs: Any) -> None: + def __init__( + self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any + ) -> None: from transformers import AutoModelForCausalLM, AutoTokenizer + super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, - padding_side='left') + self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes") @torch.no_grad() - def predict(self, prompts: list[list[str]], *args, - **kwargs) -> torch.Tensor: - + def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor: def get_inputs(pairs, tokenizer, prompt=None): if prompt is None: prompt = PROMPT sep = "\n" - prompt_inputs = tokenizer(prompt, - return_tensors=None, - add_special_tokens=False)["input_ids"] - sep_inputs = tokenizer(sep, - return_tensors=None, - add_special_tokens=False)["input_ids"] + prompt_inputs = tokenizer( + prompt, return_tensors=None, add_special_tokens=False + )["input_ids"] + sep_inputs = tokenizer(sep, return_tensors=None, add_special_tokens=False)[ + "input_ids" + ] inputs = [] for query, passage in pairs: query_inputs = tokenizer( @@ -70,8 +65,7 @@ def get_inputs(pairs, tokenizer, prompt=None): return_token_type_ids=False, add_special_tokens=False, ) - item["input_ids"] = item[ - "input_ids"] + sep_inputs + prompt_inputs + item["input_ids"] = item["input_ids"] + sep_inputs + prompt_inputs item["attention_mask"] = [1] * len(item["input_ids"]) inputs.append(item) return tokenizer.pad( @@ -87,14 +81,19 @@ def get_inputs(pairs, tokenizer, prompt=None): inputs = inputs.to(self.model.device) _n_tokens = inputs["input_ids"].shape[1] logits = self.model(**inputs, return_dict=True).logits - _scores = (logits[:, -1, - self.yes_loc].view(-1, ).float().sigmoid()) + _scores = ( + logits[:, -1, self.yes_loc] + .view( + -1, + ) + .float() + .sigmoid() + ) scores.append(_scores[0].item()) return torch.Tensor(scores) class GemmaMtebEncoder(VllmMtebEncoder): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.prompt = PROMPT @@ -103,12 +102,10 @@ def __init__(self, *args, **kwargs): def predict( self, - sentences: list[tuple[str, str, - Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt *args, **kwargs, ) -> np.ndarray: - _sentences = [] for query, corpus, prompt in sentences: query = self.query_template.format(query=query) @@ -119,8 +116,9 @@ def predict( @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo, - monkeypatch) -> None: +def test_rerank_models_mteb( + vllm_runner, model_info: RerankModelInfo, monkeypatch +) -> None: monkeypatch.setenv("VLLM_USE_V1", "0") assert 
model_info.architecture == "GemmaForSequenceClassification" @@ -133,8 +131,10 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo, } } - mteb_test_rerank_models(GemmaRerankerHfRunner, - vllm_runner, - model_info, - vllm_extra_kwargs, - vllm_mteb_encoder=GemmaMtebEncoder) + mteb_test_rerank_models( + GemmaRerankerHfRunner, + vllm_runner, + model_info, + vllm_extra_kwargs, + vllm_mteb_encoder=GemmaMtebEncoder, + ) diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index 77df6d16a367..23cb01356938 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ -18,12 +18,13 @@ @pytest.mark.parametrize( "model", [ - pytest.param("jason9693/Qwen2.5-1.5B-apeach", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param( + "jason9693/Qwen2.5-1.5B-apeach", + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), ], ) -@pytest.mark.parametrize("dtype", - ["half"] if current_platform.is_rocm() else ["float"]) +@pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"]) def test_models( hf_runner, vllm_runner, @@ -40,9 +41,9 @@ def test_models( with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) - with hf_runner(model, - dtype=dtype, - auto_cls=AutoModelForSequenceClassification) as hf_model: + with hf_runner( + model, dtype=dtype, auto_cls=AutoModelForSequenceClassification + ) as hf_model: hf_outputs = hf_model.classify(example_prompts) # check logits difference @@ -53,5 +54,6 @@ def test_models( # the tolerance value of 1e-2 is selected based on the # half datatype tests in # tests/models/language/pooling/test_embedding.py - assert torch.allclose(hf_output, vllm_output, - 1e-3 if dtype == "float" else 1e-2) + assert torch.allclose( + hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2 + ) diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling/test_cross_encoder.py index 9a33063d7b46..c47c9c903b2a 100644 --- a/tests/models/language/pooling/test_cross_encoder.py +++ b/tests/models/language/pooling/test_cross_encoder.py @@ -5,14 +5,19 @@ from .mteb_utils import RerankModelInfo, mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2", - architecture="BertForSequenceClassification"), - RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", - architecture="Qwen3ForSequenceClassification") + RerankModelInfo( + "cross-encoder/ms-marco-TinyBERT-L-2-v2", + architecture="BertForSequenceClassification", + ), + RerankModelInfo( + "tomaarsen/Qwen3-Reranker-0.6B-seq-cls", + architecture="Qwen3ForSequenceClassification", + ), ] @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb(hf_runner, vllm_runner, - model_info: RerankModelInfo) -> None: +def test_rerank_models_mteb( + hf_runner, vllm_runner, model_info: RerankModelInfo +) -> None: mteb_test_rerank_models(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index cc9e4102d5b7..cbd0f2d4efbd 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -26,35 +26,40 @@ def v1(run_with_both_engines): # case won't pass because gte-Qwen2-1.5B-instruct will cache custom # model code with bidirectional attention. 
# [Decoder-only] - pytest.param("BAAI/bge-multilingual-gemma2", - marks=[pytest.mark.core_model]), + pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), pytest.param( "intfloat/e5-mistral-7b-instruct", # CPU v1 doesn't support sliding window - marks=[pytest.mark.core_model]), + marks=[pytest.mark.core_model], + ), # the qwen models interfere with each other (see PR # https://github.com/vllm-project/vllm/pull/18720). # To avoid this problem, for now we skip v0 since it will be # deprecated anyway. - pytest.param("ssmits/Qwen2-7B-Instruct-embed-base", - marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]), + pytest.param( + "ssmits/Qwen2-7B-Instruct-embed-base", + marks=[pytest.mark.skip_v0, pytest.mark.cpu_model], + ), # [Encoder-only] pytest.param( "BAAI/bge-base-en-v1.5", marks=[ # CPU only supports V1 pytest.mark.core_model, - pytest.mark.skip_v1 - ]), - pytest.param("sentence-transformers/all-MiniLM-L12-v2", - marks=[pytest.mark.skip_v1]), - pytest.param("intfloat/multilingual-e5-small", - marks=[pytest.mark.skip_v1]), - pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - marks=[pytest.mark.skip_v1]), + pytest.mark.skip_v1, + ], + ), + pytest.param( + "sentence-transformers/all-MiniLM-L12-v2", marks=[pytest.mark.skip_v1] + ), + pytest.param("intfloat/multilingual-e5-small", marks=[pytest.mark.skip_v1]), + pytest.param( + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", marks=[pytest.mark.skip_v1] + ), # [Cross-Encoder] - pytest.param("sentence-transformers/stsb-roberta-base-v2", - marks=[pytest.mark.skip_v1]), + pytest.param( + "sentence-transformers/stsb-roberta-base-v2", marks=[pytest.mark.skip_v1] + ), ], ) def test_models( @@ -71,13 +76,14 @@ def test_models( vllm_extra_kwargs = {} if model == "ssmits/Qwen2-7B-Instruct-embed-base": - vllm_extra_kwargs["override_pooler_config"] = \ - PoolerConfig(pooling_type="MEAN", normalize=False) + vllm_extra_kwargs["override_pooler_config"] = PoolerConfig( + pooling_type="MEAN", normalize=False + ) max_model_len: Optional[int] = 512 if model in [ - "sentence-transformers/all-MiniLM-L12-v2", - "sentence-transformers/stsb-roberta-base-v2" + "sentence-transformers/all-MiniLM-L12-v2", + "sentence-transformers/stsb-roberta-base-v2", ]: max_model_len = None @@ -92,10 +98,9 @@ def test_models( with hf_runner(model, is_sentence_transformer=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) - with vllm_runner(model, - task="embed", - max_model_len=max_model_len, - **vllm_extra_kwargs) as vllm_model: + with vllm_runner( + model, task="embed", max_model_len=max_model_len, **vllm_extra_kwargs + ) as vllm_model: vllm_outputs = vllm_model.embed(example_prompts) check_embeddings_close( diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index c2f70bb647a4..26a680b81325 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -15,8 +15,9 @@ from ....utils import RemoteOpenAIServer # GritLM embedding implementation is only supported by XFormers backend. 
-pytestmark = pytest.mark.skipif(not importlib.util.find_spec("xformers"), - reason="GritLM requires XFormers") +pytestmark = pytest.mark.skipif( + not importlib.util.find_spec("xformers"), reason="GritLM requires XFormers" +) MODEL_NAME = "parasail-ai/GritLM-7B-vllm" MAX_MODEL_LEN = 4000 @@ -76,8 +77,9 @@ async def run_client_embeddings( def gritlm_instruction(instruction): - return ("<|user|>\n" + instruction + - "\n<|embed|>\n" if instruction else "<|embed|>\n") + return ( + "<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n" + ) def get_test_data(): @@ -86,7 +88,8 @@ def get_test_data(): README.md in https://github.com/ContextualAI/gritlm """ q_instruction = gritlm_instruction( - "Given a scientific paper title, retrieve the paper's abstract", ) + "Given a scientific paper title, retrieve the paper's abstract", + ) queries = [ "Bitcoin: A Peer-to-Peer Electronic Cash System", "Generative Representational Instruction Tuning", @@ -120,9 +123,9 @@ def test_gritlm_offline_embedding(vllm_runner): queries, q_instruction, documents, d_instruction = get_test_data() with vllm_runner( - MODEL_NAME, - task="embed", - max_model_len=MAX_MODEL_LEN, + MODEL_NAME, + task="embed", + max_model_len=MAX_MODEL_LEN, ) as vllm_model: llm = vllm_model.model @@ -167,9 +170,9 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner): input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" with vllm_runner( - MODEL_NAME, - task="generate", - max_model_len=MAX_MODEL_LEN, + MODEL_NAME, + task="generate", + max_model_len=MAX_MODEL_LEN, ) as vllm_model: llm = vllm_model.model diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 0ad54785308e..58cf44dda226 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -9,61 +9,65 @@ MODELS = [ ########## BertModel - EmbedModelInfo("thenlper/gte-large", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("thenlper/gte-base", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-large-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-base-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small-zh", - architecture="BertModel", - enable_test=False), + EmbedModelInfo("thenlper/gte-large", architecture="BertModel", enable_test=True), + EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False), + EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False), + EmbedModelInfo( + "thenlper/gte-large-zh", architecture="BertModel", enable_test=False + ), + EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False), + EmbedModelInfo( + "thenlper/gte-small-zh", architecture="BertModel", enable_test=False + ), ########### NewModel - EmbedModelInfo("Alibaba-NLP/gte-multilingual-base", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", - architecture="GteNewModel", - enable_test=True), + EmbedModelInfo( + "Alibaba-NLP/gte-multilingual-base", + architecture="GteNewModel", + enable_test=True, + ), + EmbedModelInfo( + "Alibaba-NLP/gte-base-en-v1.5", architecture="GteNewModel", enable_test=True + ), + EmbedModelInfo( + 
"Alibaba-NLP/gte-large-en-v1.5", architecture="GteNewModel", enable_test=True + ), ########### Qwen2ForCausalLM - EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - architecture="Qwen2ForCausalLM", - enable_test=True), + EmbedModelInfo( + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", + architecture="Qwen2ForCausalLM", + enable_test=True, + ), ########## ModernBertModel - EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", - architecture="ModernBertModel", - enable_test=True), + EmbedModelInfo( + "Alibaba-NLP/gte-modernbert-base", + architecture="ModernBertModel", + enable_test=True, + ), ########## Qwen3ForCausalLM - EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=True), - EmbedModelInfo("Qwen/Qwen3-Embedding-4B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=False), + EmbedModelInfo( + "Qwen/Qwen3-Embedding-0.6B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=True, + ), + EmbedModelInfo( + "Qwen/Qwen3-Embedding-4B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=False, + ), ] V1FlashAttentionImpNotSupported = [ - "Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base" + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", + "Alibaba-NLP/gte-modernbert-base", ] @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo, - monkeypatch) -> None: +def test_embed_models_mteb( + hf_runner, vllm_runner, model_info: EmbedModelInfo, monkeypatch +) -> None: if model_info.name in V1FlashAttentionImpNotSupported: monkeypatch.setenv("VLLM_USE_V1", "0") @@ -71,14 +75,13 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo, if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} - mteb_test_embed_models(hf_runner, vllm_runner, model_info, - vllm_extra_kwargs) + mteb_test_embed_models(hf_runner, vllm_runner, model_info, vllm_extra_kwargs) @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_correctness(hf_runner, vllm_runner, - model_info: EmbedModelInfo, example_prompts, - monkeypatch) -> None: +def test_embed_models_correctness( + hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts, monkeypatch +) -> None: if model_info.name in V1FlashAttentionImpNotSupported: monkeypatch.setenv("VLLM_USE_V1", "0") @@ -86,5 +89,6 @@ def test_embed_models_correctness(hf_runner, vllm_runner, if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} - correctness_test_embed_models(hf_runner, vllm_runner, model_info, - example_prompts, vllm_extra_kwargs) + correctness_test_embed_models( + hf_runner, vllm_runner, model_info, example_prompts, vllm_extra_kwargs + ) diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py index d899aaada262..ab135c4540b7 100644 --- a/tests/models/language/pooling/test_intfloat.py +++ b/tests/models/language/pooling/test_intfloat.py @@ -8,40 +8,38 @@ MODELS = [ ########## BertModel - EmbedModelInfo("intfloat/e5-small", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("intfloat/e5-base", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("intfloat/e5-large", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("intfloat/multilingual-e5-small", - architecture="BertModel", - enable_test=False), + EmbedModelInfo("intfloat/e5-small", 
architecture="BertModel", enable_test=True), + EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False), + EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False), + EmbedModelInfo( + "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False + ), ########## XLMRobertaModel - EmbedModelInfo("intfloat/multilingual-e5-base", - architecture="XLMRobertaModel", - enable_test=True), - EmbedModelInfo("intfloat/multilingual-e5-large", - architecture="XLMRobertaModel", - enable_test=False), - EmbedModelInfo("intfloat/multilingual-e5-large-instruct", - architecture="XLMRobertaModel", - enable_test=False), + EmbedModelInfo( + "intfloat/multilingual-e5-base", + architecture="XLMRobertaModel", + enable_test=True, + ), + EmbedModelInfo( + "intfloat/multilingual-e5-large", + architecture="XLMRobertaModel", + enable_test=False, + ), + EmbedModelInfo( + "intfloat/multilingual-e5-large-instruct", + architecture="XLMRobertaModel", + enable_test=False, + ), ] @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: +def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_correctness(hf_runner, vllm_runner, - model_info: EmbedModelInfo, - example_prompts) -> None: - correctness_test_embed_models(hf_runner, vllm_runner, model_info, - example_prompts) +def test_embed_models_correctness( + hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts +) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts) diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 9bfe7411e16b..9d6b21b1a3b8 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -7,53 +7,57 @@ from vllm import PoolingParams from ...utils import EmbedModelInfo, RerankModelInfo -from .embed_utils import (check_embeddings_close, - correctness_test_embed_models, matryoshka_fy) +from .embed_utils import ( + check_embeddings_close, + correctness_test_embed_models, + matryoshka_fy, +) from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models EMBEDDING_MODELS = [ - EmbedModelInfo("jinaai/jina-embeddings-v3", - architecture="XLMRobertaModel", - is_matryoshka=True) + EmbedModelInfo( + "jinaai/jina-embeddings-v3", architecture="XLMRobertaModel", is_matryoshka=True + ) ] RERANK_MODELS = [ - RerankModelInfo("jinaai/jina-reranker-v2-base-multilingual", - architecture="XLMRobertaForSequenceClassification") + RerankModelInfo( + "jinaai/jina-reranker-v2-base-multilingual", + architecture="XLMRobertaForSequenceClassification", + ) ] @pytest.mark.parametrize("model_info", EMBEDDING_MODELS) -def test_embed_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: - +def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: def hf_model_callback(model): model.encode = partial(model.encode, task="text-matching") - mteb_test_embed_models(hf_runner, - vllm_runner, - model_info, - hf_model_callback=hf_model_callback) + mteb_test_embed_models( + hf_runner, vllm_runner, model_info, hf_model_callback=hf_model_callback + ) @pytest.mark.parametrize("model_info", EMBEDDING_MODELS) -def test_embed_models_correctness(hf_runner, vllm_runner, - model_info: EmbedModelInfo, - 
example_prompts) -> None: - +def test_embed_models_correctness( + hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts +) -> None: def hf_model_callback(model): model.encode = partial(model.encode, task="text-matching") - correctness_test_embed_models(hf_runner, - vllm_runner, - model_info, - example_prompts, - hf_model_callback=hf_model_callback) + correctness_test_embed_models( + hf_runner, + vllm_runner, + model_info, + example_prompts, + hf_model_callback=hf_model_callback, + ) @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb(hf_runner, vllm_runner, - model_info: RerankModelInfo) -> None: +def test_rerank_models_mteb( + hf_runner, vllm_runner, model_info: RerankModelInfo +) -> None: mteb_test_rerank_models(hf_runner, vllm_runner, model_info) @@ -76,32 +80,32 @@ def test_matryoshka( example_prompts = [str(s).strip() for s in example_prompts] with hf_runner( - model_info.name, - dtype=dtype, - is_sentence_transformer=True, + model_info.name, + dtype=dtype, + is_sentence_transformer=True, ) as hf_model: hf_outputs = hf_model.encode(example_prompts, task="text-matching") hf_outputs = matryoshka_fy(hf_outputs, dimensions) - with vllm_runner(model_info.name, - task="embed", - dtype=dtype, - max_model_len=None) as vllm_model: + with vllm_runner( + model_info.name, task="embed", dtype=dtype, max_model_len=None + ) as vllm_model: assert vllm_model.model.llm_engine.model_config.is_matryoshka matryoshka_dimensions = ( - vllm_model.model.llm_engine.model_config.matryoshka_dimensions) + vllm_model.model.llm_engine.model_config.matryoshka_dimensions + ) assert matryoshka_dimensions is not None if dimensions not in matryoshka_dimensions: with pytest.raises(ValueError): vllm_model.embed( - example_prompts, - pooling_params=PoolingParams(dimensions=dimensions)) + example_prompts, pooling_params=PoolingParams(dimensions=dimensions) + ) else: vllm_outputs = vllm_model.embed( - example_prompts, - pooling_params=PoolingParams(dimensions=dimensions)) + example_prompts, pooling_params=PoolingParams(dimensions=dimensions) + ) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling/test_mxbai_rerank.py index e74c58744dd2..6bd848699b21 100644 --- a/tests/models/language/pooling/test_mxbai_rerank.py +++ b/tests/models/language/pooling/test_mxbai_rerank.py @@ -10,43 +10,42 @@ from .mteb_utils import RerankModelInfo, mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", - architecture="Qwen2ForSequenceClassification", - enable_test=True), - RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", - architecture="Qwen2ForSequenceClassification", - enable_test=False) + RerankModelInfo( + "mixedbread-ai/mxbai-rerank-base-v2", + architecture="Qwen2ForSequenceClassification", + enable_test=True, + ), + RerankModelInfo( + "mixedbread-ai/mxbai-rerank-large-v2", + architecture="Qwen2ForSequenceClassification", + enable_test=False, + ), ] class MxbaiRerankerHfRunner(HfRunner): - - def __init__(self, - model_name: str, - dtype: str = "auto", - *args: Any, - **kwargs: Any) -> None: + def __init__( + self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any + ) -> None: from transformers import AutoModelForCausalLM, AutoTokenizer + super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, - padding_side='left') + self.tokenizer = 
AutoTokenizer.from_pretrained(model_name, padding_side="left") self.yes_loc = self.tokenizer.convert_tokens_to_ids("1") self.no_loc = self.tokenizer.convert_tokens_to_ids("0") - def predict(self, prompts: list[list[str]], *args, - **kwargs) -> torch.Tensor: - + def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor: def process_inputs(pairs): - inputs = self.tokenizer(pairs, - padding=False, - truncation='longest_first', - return_attention_mask=False) - for i, ele in enumerate(inputs['input_ids']): - inputs['input_ids'][i] = ele - inputs = self.tokenizer.pad(inputs, - padding=True, - return_tensors="pt") + inputs = self.tokenizer( + pairs, + padding=False, + truncation="longest_first", + return_attention_mask=False, + ) + for i, ele in enumerate(inputs["input_ids"]): + inputs["input_ids"][i] = ele + inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt") for key in inputs: inputs[key] = inputs[key].to(self.model.device) return inputs @@ -78,5 +77,6 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: "method": "from_2_way_softmax", } - mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info, - vllm_extra_kwargs) + mteb_test_rerank_models( + MxbaiRerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs + ) diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index e16ec239a338..e5840b77e606 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -7,30 +7,32 @@ from .mteb_utils import mteb_test_embed_models MODELS = [ - EmbedModelInfo("nomic-ai/nomic-embed-text-v1", - architecture="NomicBertModel", - enable_test=True), - EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", - architecture="NomicBertModel", - enable_test=False), - EmbedModelInfo("nomic-ai/CodeRankEmbed", - architecture="NomicBertModel", - enable_test=False), - EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", - architecture="NomicBertModel", - enable_test=True) + EmbedModelInfo( + "nomic-ai/nomic-embed-text-v1", architecture="NomicBertModel", enable_test=True + ), + EmbedModelInfo( + "nomic-ai/nomic-embed-text-v1.5", + architecture="NomicBertModel", + enable_test=False, + ), + EmbedModelInfo( + "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False + ), + EmbedModelInfo( + "nomic-ai/nomic-embed-text-v2-moe", + architecture="NomicBertModel", + enable_test=True, + ), ] @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: +def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_correctness(hf_runner, vllm_runner, - model_info: EmbedModelInfo, - example_prompts) -> None: - correctness_test_embed_models(hf_runner, vllm_runner, model_info, - example_prompts) +def test_embed_models_correctness( + hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts +) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 250b3a52835a..ce348785ec15 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -7,10 +7,10 @@ MODELS = [ 
EmbedModelInfo("nomic-ai/nomic-embed-text-v1"), - #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), - #EmbedModelInfo("nomic-ai/CodeRankEmbed"), + # EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), + # EmbedModelInfo("nomic-ai/CodeRankEmbed"), EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"), - #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), + # EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), ] rope_theta = 1000 @@ -21,23 +21,20 @@ @pytest.mark.parametrize("model_info", MODELS) def test_default(model_info, vllm_runner): - with vllm_runner(model_info.name, task="embed", - max_model_len=None) as vllm_model: + with vllm_runner(model_info.name, task="embed", max_model_len=None) as vllm_model: model_config = vllm_model.model.llm_engine.model_config if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. assert model_config.max_model_len == 512 else: - assert ( - model_config.max_model_len == original_max_position_embeddings) + assert model_config.max_model_len == original_max_position_embeddings @pytest.mark.parametrize("model_info", MODELS) def test_set_max_model_len_legal(model_info, vllm_runner): # set max_model_len <= 512 - with vllm_runner(model_info.name, task="embed", - max_model_len=256) as vllm_model: + with vllm_runner(model_info.name, task="embed", max_model_len=256) as vllm_model: model_config = vllm_model.model.llm_engine.model_config assert model_config.max_model_len == 256 @@ -46,12 +43,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner): # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. with pytest.raises(ValueError): - with vllm_runner(model_info.name, task="embed", - max_model_len=1024): + with vllm_runner(model_info.name, task="embed", max_model_len=1024): pass else: - with vllm_runner(model_info.name, task="embed", - max_model_len=1024) as vllm_model: + with vllm_runner( + model_info.name, task="embed", max_model_len=1024 + ) as vllm_model: model_config = vllm_model.model.llm_engine.model_config assert model_config.max_model_len == 1024 @@ -66,10 +63,9 @@ def test_set_max_model_len_illegal(model_info, vllm_runner): # set max_model_len > 2048 by hf_overrides hf_overrides = {"max_model_len": 4096} with pytest.raises(ValueError): - with vllm_runner(model_info.name, - task="embed", - max_model_len=None, - hf_overrides=hf_overrides): + with vllm_runner( + model_info.name, task="embed", max_model_len=None, hf_overrides=hf_overrides + ): pass @@ -80,16 +76,14 @@ def test_use_rope_scaling_legal(model_info, vllm_runner): "rope_scaling": { "rope_type": "yarn", "factor": factor, - "original_max_position_embeddings": - original_max_position_embeddings + "original_max_position_embeddings": original_max_position_embeddings, }, - "max_model_len": max_model_len + "max_model_len": max_model_len, } - with vllm_runner(model_info.name, - task="embed", - max_model_len=None, - hf_overrides=hf_overrides): + with vllm_runner( + model_info.name, task="embed", max_model_len=None, hf_overrides=hf_overrides + ): pass @@ -100,16 +94,17 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): "rope_scaling": { "rope_type": "yarn", "factor": factor, - "original_max_position_embeddings": - original_max_position_embeddings - } + "original_max_position_embeddings": original_max_position_embeddings, + }, } # illegal max_model_len with pytest.raises(ValueError): - with vllm_runner(model_info.name, - task="embed", - max_model_len=max_model_len 
+ 1, - hf_overrides=hf_overrides): + with vllm_runner( + model_info.name, + task="embed", + max_model_len=max_model_len + 1, + hf_overrides=hf_overrides, + ): pass hf_overrides = { @@ -117,15 +112,13 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): "rope_scaling": { "rope_type": "yarn", "factor": factor, - "original_max_position_embeddings": - original_max_position_embeddings + "original_max_position_embeddings": original_max_position_embeddings, }, - "max_model_len": max_model_len + 1 + "max_model_len": max_model_len + 1, } # illegal max_model_len by hf_overrides with pytest.raises(ValueError): - with vllm_runner(model_info.name, - task="embed", - max_model_len=None, - hf_overrides=hf_overrides): + with vllm_runner( + model_info.name, task="embed", max_model_len=None, hf_overrides=hf_overrides + ): pass diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py index 9c6a833b4138..36ef11b9b043 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -11,43 +11,42 @@ from .mteb_utils import RerankModelInfo, mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("Qwen/Qwen3-Reranker-0.6B", - architecture="Qwen3ForSequenceClassification", - enable_test=True), - RerankModelInfo("Qwen/Qwen3-Reranker-4B", - architecture="Qwen3ForSequenceClassification", - enable_test=False) + RerankModelInfo( + "Qwen/Qwen3-Reranker-0.6B", + architecture="Qwen3ForSequenceClassification", + enable_test=True, + ), + RerankModelInfo( + "Qwen/Qwen3-Reranker-4B", + architecture="Qwen3ForSequenceClassification", + enable_test=False, + ), ] class Qwen3RerankerHfRunner(HfRunner): - - def __init__(self, - model_name: str, - dtype: str = "auto", - *args: Any, - **kwargs: Any) -> None: + def __init__( + self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any + ) -> None: from transformers import AutoModelForCausalLM, AutoTokenizer + super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, - padding_side='left') + self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") self.token_false_id = self.tokenizer.convert_tokens_to_ids("no") self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes") - def predict(self, prompts: list[list[str]], *args, - **kwargs) -> torch.Tensor: - + def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor: def process_inputs(pairs): - inputs = self.tokenizer(pairs, - padding=False, - truncation='longest_first', - return_attention_mask=False) - for i, ele in enumerate(inputs['input_ids']): - inputs['input_ids'][i] = ele - inputs = self.tokenizer.pad(inputs, - padding=True, - return_tensors="pt") + inputs = self.tokenizer( + pairs, + padding=False, + truncation="longest_first", + return_attention_mask=False, + ) + for i, ele in enumerate(inputs["input_ids"]): + inputs["input_ids"][i] = ele + inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt") for key in inputs: inputs[key] = inputs[key].to(self.model.device) return inputs @@ -72,7 +71,6 @@ def compute_logits(inputs): @pytest.mark.parametrize("model_info", RERANK_MODELS) def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: - assert model_info.architecture == "Qwen3ForSequenceClassification" vllm_extra_kwargs: dict[str, Any] = { @@ -86,15 +84,14 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: 
if model_info.name == "Qwen/Qwen3-Reranker-4B": vllm_extra_kwargs["max_num_seqs"] = 1 - mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info, - vllm_extra_kwargs) + mteb_test_rerank_models( + Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs + ) @pytest.mark.parametrize("model_info", RERANK_MODELS) @multi_gpu_test(num_gpus=2) -def test_rerank_models_mteb_tp(vllm_runner, - model_info: RerankModelInfo) -> None: - +def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None: assert model_info.architecture == "Qwen3ForSequenceClassification" vllm_extra_kwargs: dict[str, Any] = { @@ -109,8 +106,6 @@ def test_rerank_models_mteb_tp(vllm_runner, if model_info.name == "Qwen/Qwen3-Reranker-4B": vllm_extra_kwargs["max_num_seqs"] = 1 - mteb_test_rerank_models(Qwen3RerankerHfRunner, - vllm_runner, - model_info, - vllm_extra_kwargs, - atol=1.2e-2) + mteb_test_rerank_models( + Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs, atol=1.2e-2 + ) diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index 3b7fab3ba5c9..9eeac29f9ab6 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -24,10 +24,8 @@ def v1(run_with_both_engines): def math_step_prompts(): # ruff: noqa: E501 data = { - "system": - "Please reason step by step, and put your final answer within \\boxed{}. ", - "query": - "Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?", + "system": "Please reason step by step, and put your final answer within \\boxed{}. ", + "query": "Sue lives in a fun neighborhood. One weekend, the neighbors decided to play a prank on Sue. On Friday morning, the neighbors placed 18 pink plastic flamingos out on Sue's front yard. On Saturday morning, the neighbors took back one third of the flamingos, painted them white, and put these newly painted white flamingos back out on Sue's front yard. Then, on Sunday morning, they added another 18 pink plastic flamingos to the collection. At noon on Sunday, how many more pink plastic flamingos were out than white plastic flamingos?", "response": [ "To find out how many more pink plastic flamingos were out than white plastic flamingos at noon on Sunday, we can break down the problem into steps. First, on Friday, the neighbors start with 18 pink plastic flamingos.", "On Saturday, they take back one third of the flamingos. Since there were 18 flamingos, (1/3 \\times 18 = 6) flamingos are taken back. So, they have (18 - 6 = 12) flamingos left in their possession. Then, they paint these 6 flamingos white and put them back out on Sue's front yard. Now, Sue has the original 12 pink flamingos plus the 6 new white ones. Thus, by the end of Saturday, Sue has (12 + 6 = 18) pink flamingos and 6 white flamingos.", @@ -35,16 +33,16 @@ def math_step_prompts(): "To find the difference, subtract the number of white flamingos from the number of pink flamingos: (36 - 6 = 30). 
Therefore, at noon on Sunday, there were 30 more pink plastic flamingos out than white plastic flamingos. The answer is (\\boxed{30}).", ], } - answer = "".join(data['response']) + "" + answer = "".join(data["response"]) + "" prompt = f"system\n{data['system']}\nuser\n{data['query']}\nassistant\n{answer}<|endoftext|>" return [prompt] def step_reward_patch_hf_model(hf_model: HfRunner): - # Patch the hf_runner to use the step reward function - def make_step_rewards(logits: torch.Tensor, - token_masks: torch.Tensor) -> list[list[float]]: + def make_step_rewards( + logits: torch.Tensor, token_masks: torch.Tensor + ) -> list[list[float]]: probabilities = F.softmax(logits, dim=-1) probabilities = probabilities * token_masks.unsqueeze(-1) @@ -62,7 +60,7 @@ def reward(prompts: list[str]) -> list[list[float]]: outputs = hf_model.model(input_ids=input_ids) step_sep_id = hf_model.tokenizer.encode("")[0] - token_masks = (input_ids == step_sep_id) + token_masks = input_ids == step_sep_id return make_step_rewards(outputs[0], token_masks) hf_model.reward = reward # type: ignore[attr-defined] @@ -73,8 +71,10 @@ def reward(prompts: list[str]) -> list[list[float]]: @pytest.mark.parametrize( "model", [ - pytest.param("Qwen/Qwen2.5-Math-PRM-7B", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param( + "Qwen/Qwen2.5-Math-PRM-7B", + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), ], ) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index c75ff1445616..d220f0ec2597 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -37,8 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name): with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict([text_pair]).tolist() - with vllm_runner(model_name, task="score", dtype=DTYPE, - max_model_len=None) as vllm_model: + with vllm_runner( + model_name, task="score", dtype=DTYPE, max_model_len=None + ) as vllm_model: vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) assert len(vllm_outputs) == 1 @@ -56,8 +57,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, task="score", dtype=DTYPE, - max_model_len=None) as vllm_model: + with vllm_runner( + model_name, task="score", dtype=DTYPE, max_model_len=None + ) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) assert len(vllm_outputs) == 2 @@ -76,8 +78,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, task="score", dtype=DTYPE, - max_model_len=None) as vllm_model: + with vllm_runner( + model_name, task="score", dtype=DTYPE, max_model_len=None + ) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) assert len(vllm_outputs) == 2 @@ -95,17 +98,15 @@ def emb_model_name(request): def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name): text_pair = [TEXTS_1[0], TEXTS_2[0]] - with hf_runner(emb_model_name, dtype=DTYPE, - is_sentence_transformer=True) as hf_model: + with hf_runner( + emb_model_name, dtype=DTYPE, is_sentence_transformer=True + ) as hf_model: hf_embeddings = 
hf_model.encode(text_pair) - hf_outputs = [ - F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0) - ] + hf_outputs = [F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)] - with vllm_runner(emb_model_name, - task="embed", - dtype=DTYPE, - max_model_len=None) as vllm_model: + with vllm_runner( + emb_model_name, task="embed", dtype=DTYPE, max_model_len=None + ) as vllm_model: vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) assert len(vllm_outputs) == 1 @@ -120,20 +121,18 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): [TEXTS_1[0], TEXTS_2[1]], ] - with hf_runner(emb_model_name, dtype=DTYPE, - is_sentence_transformer=True) as hf_model: - hf_embeddings = [ - hf_model.encode(text_pair) for text_pair in text_pairs - ] + with hf_runner( + emb_model_name, dtype=DTYPE, is_sentence_transformer=True + ) as hf_model: + hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs] hf_outputs = [ F.cosine_similarity(*map(torch.tensor, pair), dim=0) for pair in hf_embeddings ] - with vllm_runner(emb_model_name, - task="embed", - dtype=DTYPE, - max_model_len=None) as vllm_model: + with vllm_runner( + emb_model_name, task="embed", dtype=DTYPE, max_model_len=None + ) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) assert len(vllm_outputs) == 2 @@ -149,20 +148,18 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): [TEXTS_1[1], TEXTS_2[1]], ] - with hf_runner(emb_model_name, dtype=DTYPE, - is_sentence_transformer=True) as hf_model: - hf_embeddings = [ - hf_model.encode(text_pair) for text_pair in text_pairs - ] + with hf_runner( + emb_model_name, dtype=DTYPE, is_sentence_transformer=True + ) as hf_model: + hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs] hf_outputs = [ F.cosine_similarity(*map(torch.tensor, pair), dim=0) for pair in hf_embeddings ] - with vllm_runner(emb_model_name, - task="embed", - dtype=DTYPE, - max_model_len=None) as vllm_model: + with vllm_runner( + emb_model_name, task="embed", dtype=DTYPE, max_model_len=None + ) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) assert len(vllm_outputs) == 2 diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index d6b5dbd08372..5174a481b139 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -7,50 +7,64 @@ from .mteb_utils import mteb_test_embed_models MODELS = [ - EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", - is_matryoshka=False, - architecture="BertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-s", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long", - is_matryoshka=False, - architecture="NomicBertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-l", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", - is_matryoshka=True, - architecture="BertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0", - is_matryoshka=True, - architecture="XLMRobertaModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", - 
is_matryoshka=True, - architecture="GteModel", - enable_test=True), + EmbedModelInfo( + "Snowflake/snowflake-arctic-embed-xs", + is_matryoshka=False, + architecture="BertModel", + enable_test=True, + ), + EmbedModelInfo( + "Snowflake/snowflake-arctic-embed-s", + is_matryoshka=False, + architecture="BertModel", + enable_test=False, + ), + EmbedModelInfo( + "Snowflake/snowflake-arctic-embed-m", + is_matryoshka=False, + architecture="BertModel", + enable_test=False, + ), + EmbedModelInfo( + "Snowflake/snowflake-arctic-embed-m-long", + is_matryoshka=False, + architecture="NomicBertModel", + enable_test=True, + ), + EmbedModelInfo( + "Snowflake/snowflake-arctic-embed-l", + is_matryoshka=False, + architecture="BertModel", + enable_test=False, + ), + EmbedModelInfo( + "Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + architecture="BertModel", + enable_test=True, + ), + EmbedModelInfo( + "Snowflake/snowflake-arctic-embed-l-v2.0", + is_matryoshka=True, + architecture="XLMRobertaModel", + enable_test=True, + ), + EmbedModelInfo( + "Snowflake/snowflake-arctic-embed-m-v2.0", + is_matryoshka=True, + architecture="GteModel", + enable_test=True, + ), ] @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: +def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_correctness(hf_runner, vllm_runner, - model_info: EmbedModelInfo, - example_prompts) -> None: - correctness_test_embed_models(hf_runner, vllm_runner, model_info, - example_prompts) +def test_embed_models_correctness( + hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts +) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts) diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py index 33aff1c873fc..48f92d61ce4e 100644 --- a/tests/models/language/pooling/test_truncation_control.py +++ b/tests/models/language/pooling/test_truncation_control.py @@ -20,51 +20,57 @@ field.""" -def test_smaller_truncation_size(vllm_runner, - model_name=MODEL_NAME, - input_str=input_str): - +def test_smaller_truncation_size( + vllm_runner, model_name=MODEL_NAME, input_str=input_str +): truncate_prompt_tokens = 10 - with vllm_runner(model_name, task="embed", - max_model_len=max_model_len) as vllm_model: + with vllm_runner( + model_name, task="embed", max_model_len=max_model_len + ) as vllm_model: vllm_output = vllm_model.model.encode( - input_str, truncate_prompt_tokens=truncate_prompt_tokens) + input_str, truncate_prompt_tokens=truncate_prompt_tokens + ) prompt_tokens = vllm_output[0].prompt_token_ids assert len(prompt_tokens) == truncate_prompt_tokens -def test_max_truncation_size(vllm_runner, - model_name=MODEL_NAME, - input_str=input_str): +def test_max_truncation_size(vllm_runner, model_name=MODEL_NAME, input_str=input_str): truncate_prompt_tokens = -1 - with vllm_runner(model_name, task="embed", - max_model_len=max_model_len) as vllm_model: + with vllm_runner( + model_name, task="embed", max_model_len=max_model_len + ) as vllm_model: vllm_output = vllm_model.model.encode( - input_str, truncate_prompt_tokens=truncate_prompt_tokens) + input_str, truncate_prompt_tokens=truncate_prompt_tokens + ) prompt_tokens = vllm_output[0].prompt_token_ids assert len(prompt_tokens) == 
max_model_len -def test_bigger_truncation_size(vllm_runner, - model_name=MODEL_NAME, - input_str=input_str): - +def test_bigger_truncation_size( + vllm_runner, model_name=MODEL_NAME, input_str=input_str +): truncate_prompt_tokens = max_model_len + 1 - with pytest.raises(ValueError), vllm_runner( - model_name, task="embed", - max_model_len=max_model_len) as vllm_model: - + with ( + pytest.raises(ValueError), + vllm_runner( + model_name, task="embed", max_model_len=max_model_len + ) as vllm_model, + ): llm_output = vllm_model.model.encode( - input_str, truncate_prompt_tokens=truncate_prompt_tokens) + input_str, truncate_prompt_tokens=truncate_prompt_tokens + ) - assert llm_output == f"""truncate_prompt_tokens value + assert ( + llm_output + == f"""truncate_prompt_tokens value ({truncate_prompt_tokens}) is greater than max_model_len ({max_model_len}). Please, select a smaller truncation size.""" + ) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 98461676aa47..2b98e1530474 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -3,27 +3,41 @@ """Common tests for testing .generate() functionality for single / multiple image, embedding, and video support for different VLMs in vLLM. """ + import math import os from collections import defaultdict from pathlib import PosixPath import pytest -from transformers import (AutoModel, AutoModelForImageTextToText, - AutoModelForTextToWaveform, AutoModelForVision2Seq) +from transformers import ( + AutoModel, + AutoModelForImageTextToText, + AutoModelForTextToWaveform, + AutoModelForVision2Seq, +) from vllm.platforms import current_platform from vllm.utils import identity -from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner, - ImageTestAssets, VideoTestAssets, VllmRunner) -from ....utils import (create_new_process_for_each_test, large_gpu_mark, - multi_gpu_marks) +from ....conftest import ( + IMAGE_ASSETS, + AudioTestAssets, + HfRunner, + ImageTestAssets, + VideoTestAssets, + VllmRunner, +) +from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks from ...utils import check_outputs_equal from .vlm_utils import custom_inputs, model_utils, runners from .vlm_utils.case_filtering import get_parametrized_options -from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs, - VLMTestInfo, VLMTestType) +from .vlm_utils.types import ( + CustomTestOptions, + ExpandableVLMTestArgs, + VLMTestInfo, + VLMTestType, +) # This hack is needed for phi3v & paligemma models # ROCm Triton FA can run into shared memory issues with these models, @@ -736,7 +750,7 @@ def _mark_splits( new_test_settings = dict[str, VLMTestInfo]() for i in range(num_groups): - models_in_group = models[i * split_size:(i + 1) * split_size] + models_in_group = models[i * split_size : (i + 1) * split_size] for model in models_in_group: for info in test_infos_by_model[model]: @@ -767,12 +781,17 @@ def _mark_splits( VLM_TEST_SETTINGS, test_type=VLMTestType.IMAGE, create_new_process_for_each_test=False, - )) -def test_single_image_models(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): + ), +) +def test_single_image_models( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: 
ImageTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -792,12 +811,17 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, VLM_TEST_SETTINGS, test_type=VLMTestType.MULTI_IMAGE, create_new_process_for_each_test=False, - )) -def test_multi_image_models(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): + ), +) +def test_multi_image_models( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -817,12 +841,16 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, VLM_TEST_SETTINGS, test_type=VLMTestType.EMBEDDING, create_new_process_for_each_test=False, - )) -def test_image_embedding_models(model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): + ), +) +def test_image_embedding_models( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -841,10 +869,16 @@ def test_image_embedding_models(model_type: str, VLM_TEST_SETTINGS, test_type=VLMTestType.VIDEO, create_new_process_for_each_test=False, - )) -def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: VideoTestAssets, monkeypatch): + ), +) +def test_video_models( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + video_assets: VideoTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -863,10 +897,16 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, VLM_TEST_SETTINGS, test_type=VLMTestType.AUDIO, create_new_process_for_each_test=False, - )) -def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - audio_assets: AudioTestAssets, monkeypatch): + ), +) +def test_audio_models( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + audio_assets: AudioTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -885,7 +925,8 @@ def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs, VLM_TEST_SETTINGS, test_type=VLMTestType.CUSTOM_INPUTS, create_new_process_for_each_test=False, - )) + ), +) def test_custom_inputs_models( model_type: str, test_case: ExpandableVLMTestArgs, @@ -911,13 +952,18 @@ def test_custom_inputs_models( VLM_TEST_SETTINGS, test_type=VLMTestType.IMAGE, create_new_process_for_each_test=True, - )) + ), +) @create_new_process_for_each_test() -def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, - test_case: 
ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): +def test_single_image_models_heavy( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -937,13 +983,18 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, VLM_TEST_SETTINGS, test_type=VLMTestType.MULTI_IMAGE, create_new_process_for_each_test=True, - )) + ), +) @create_new_process_for_each_test() -def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, monkeypatch): +def test_multi_image_models_heavy( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -963,14 +1014,17 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, VLM_TEST_SETTINGS, test_type=VLMTestType.EMBEDDING, create_new_process_for_each_test=True, - )) + ), +) @create_new_process_for_each_test() -def test_image_embedding_models_heavy(model_type: str, - test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - image_assets: ImageTestAssets, - monkeypatch): +def test_image_embedding_models_heavy( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -989,11 +1043,16 @@ def test_image_embedding_models_heavy(model_type: str, VLM_TEST_SETTINGS, test_type=VLMTestType.VIDEO, create_new_process_for_each_test=True, - )) -def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - video_assets: VideoTestAssets, monkeypatch): + ), +) +def test_video_models_heavy( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + video_assets: VideoTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -1012,11 +1071,16 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, VLM_TEST_SETTINGS, test_type=VLMTestType.AUDIO, create_new_process_for_each_test=True, - )) -def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - audio_assets: AudioTestAssets, monkeypatch): + ), +) +def test_audio_models_heavy( + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + audio_assets: AudioTestAssets, + monkeypatch, +): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -1035,7 +1099,8 @@ def test_audio_models_heavy(model_type: str, test_case: 
ExpandableVLMTestArgs, VLM_TEST_SETTINGS, test_type=VLMTestType.CUSTOM_INPUTS, create_new_process_for_each_test=True, - )) + ), +) @create_new_process_for_each_test() def test_custom_inputs_models_heavy( model_type: str, diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py index a622957f96f6..92a993d157b0 100644 --- a/tests/models/multimodal/generation/test_florence2.py +++ b/tests/models/multimodal/generation/test_florence2.py @@ -17,12 +17,12 @@ # Florence-2 model repo's tokenizer config is missing some special tokens. # Therefore, we use a converted tokenizer from a forked repo TOKENIZER = "Isotr0py/Florence-2-tokenizer" -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "", # special task token which will output special tokens - "cherry_blossom": - "Describe in detail what is shown in the image.", -}) +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts( + { + "stop_sign": "", # special task token which will output special tokens + "cherry_blossom": "Describe in detail what is shown in the image.", + } +) def get_hf_images_prompts( @@ -35,13 +35,13 @@ def get_hf_images_prompts( ExplicitEncoderDecoderPrompt( encoder_prompt=encoder_prompt["prompt"], decoder_prompt=None, - )) + ) + ) images.append(encoder_prompt["multi_modal_data"]["image"]) return prompts, images -def hf_to_vllm_output(hf_output: tuple[list[int], str, - Optional[SampleLogprobs]]): +def hf_to_vllm_output(hf_output: tuple[list[int], str, Optional[SampleLogprobs]]): """Sanitize hf output to be comparable with vllm output.""" output_ids, output_str, out_logprobs = hf_output @@ -62,35 +62,39 @@ def run_test( tensor_parallel_size: int, distributed_executor_backend: Optional[str] = None, ) -> None: - with vllm_runner(model, - max_num_seqs=8, - tokenizer_name=TOKENIZER, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner( + model, + max_num_seqs=8, + tokenizer_name=TOKENIZER, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + ) as vllm_model: vllm_outputs_per_case = [ vllm_model.generate_encoder_decoder_greedy_logprobs( prompts, max_tokens, num_logprobs=num_logprobs, skip_special_tokens=False, - ) for prompts in inputs + ) + for prompts in inputs ] hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs] with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model: - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.lm_head + hf_model.model.get_output_embeddings = ( + lambda: hf_model.model.language_model.lm_head + ) hf_outputs_per_case = [ hf_model.generate_encoder_decoder_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs=num_logprobs, images=images) + prompts, max_tokens, num_logprobs=num_logprobs, images=images + ) for prompts, images in hf_inputs ] - for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, - vllm_outputs_per_case): + for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): check_logprobs_close( outputs_0_lst=[hf_to_vllm_output(output) for output in hf_outputs], outputs_1_lst=vllm_outputs, @@ -120,20 +124,31 @@ def run_test( @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - 
image_assets: ImageTestAssets, model: str, - size_factors: list[int], dtype: str, max_tokens: int, - num_logprobs: int) -> None: +def test_models( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, + model: str, + size_factors: list[int], + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: images = [asset.pil_image for asset in image_assets] - inputs_per_image = [[ - ExplicitEncoderDecoderPrompt( - encoder_prompt=TextPrompt( - prompt=prompt, - multi_modal_data={"image": rescale_image_size(image, factor)}), - decoder_prompt=None, - ) for factor in size_factors - ] for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + inputs_per_image = [ + [ + ExplicitEncoderDecoderPrompt( + encoder_prompt=TextPrompt( + prompt=prompt, + multi_modal_data={"image": rescale_image_size(image, factor)}, + ), + decoder_prompt=None, + ) + for factor in size_factors + ] + for image, prompt in zip(images, HF_IMAGE_PROMPTS) + ] run_test( hf_runner, diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index c5ffa5f3a70a..563626961106 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -10,8 +10,7 @@ from vllm.lora.request import LoRARequest from vllm.sequence import SampleLogprobs -from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput, - VllmRunner) +from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner from ...registry import HF_EXAMPLE_MODELS from ...utils import check_logprobs_close @@ -64,50 +63,49 @@ def run_test( # will hurt multiprocessing backend with fork method (the default method). # max_model_len should be greater than image_feature_size with vllm_runner( - model, - task="generate", - max_model_len=max_model_len, - max_num_seqs=1, - dtype=dtype, - limit_mm_per_prompt={"audio": 1}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enable_lora=True, - max_lora_rank=64, - enforce_eager=True, + model, + task="generate", + max_model_len=max_model_len, + max_num_seqs=1, + dtype=dtype, + limit_mm_per_prompt={"audio": 1}, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enable_lora=True, + max_lora_rank=64, + enforce_eager=True, ) as vllm_model: lora_request = LoRARequest("audio", 1, audio_lora_path) vllm_outputs_per_case = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - audios=audios, - lora_request=lora_request) + vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + audios=audios, + lora_request=lora_request, + ) for prompts, audios in inputs ] - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSpeechSeq2Seq) as hf_model: - + with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model: hf_processor = hf_model.processor eos_token_id = hf_processor.tokenizer.eos_token_id hf_outputs_per_case = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - audios=[audios], - eos_token_id=eos_token_id) + hf_model.generate_greedy_logprobs_limit( + prompts, + max_tokens, + num_logprobs=num_logprobs, + audios=[audios], + eos_token_id=eos_token_id, + ) for prompts, audios in inputs ] - for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, - vllm_outputs_per_case): + for hf_outputs, 
vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): check_logprobs_close( outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(output) for output in vllm_outputs - ], + outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs], name_0="hf", name_1="vllm", ) @@ -118,9 +116,16 @@ def run_test( @pytest.mark.parametrize("max_model_len", [2048]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_models(hf_runner, vllm_runner, model: str, - audio_assets: AudioTestAssets, dtype: str, max_model_len: int, - max_tokens: int, num_logprobs: int) -> None: +def test_models( + hf_runner, + vllm_runner, + model: str, + audio_assets: AudioTestAssets, + dtype: str, + max_model_len: int, + max_tokens: int, + num_logprobs: int, +) -> None: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py index 949c0a80d31b..2aa2cb757e57 100644 --- a/tests/models/multimodal/generation/test_interleaved.py +++ b/tests/models/multimodal/generation/test_interleaved.py @@ -28,8 +28,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None: give the same result. """ - image_cherry = convert_image_mode( - ImageAsset("cherry_blossom").pil_image, "RGB") + image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB") images = [image_cherry, image_stop] video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays @@ -47,29 +46,30 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None: ), ] - with vllm_runner(model, - task="generate", - dtype=dtype, - limit_mm_per_prompt={"image": 2}, - max_model_len=32768, - max_num_seqs=2, - tensor_parallel_size=1, - enforce_eager=True) as vllm_model: + with vllm_runner( + model, + task="generate", + dtype=dtype, + limit_mm_per_prompt={"image": 2}, + max_model_len=32768, + max_num_seqs=2, + tensor_parallel_size=1, + enforce_eager=True, + ) as vllm_model: vllm_outputs_per_case = [ - vllm_model.generate_greedy(prompts, - max_tokens, - images=images, - videos=videos) + vllm_model.generate_greedy( + prompts, max_tokens, images=images, videos=videos + ) for prompts, images, videos in inputs ] all_results = [output[0][1] for output in vllm_outputs_per_case] - outputs = [(total_str, total_str.find("assistant\n") + len("assistant\n")) - for total_str in all_results] - prompt_lengths = [prompt_len for _, prompt_len in outputs] - generated_strs = [ - total_str[prompt_len:] for total_str, prompt_len in outputs + outputs = [ + (total_str, total_str.find("assistant\n") + len("assistant\n")) + for total_str in all_results ] + prompt_lengths = [prompt_len for _, prompt_len in outputs] + generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs] interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths interleaved_output_str, noninterleaved_output_str = generated_strs diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py index 2bb01e494d43..deb9fea82bc9 100644 --- a/tests/models/multimodal/generation/test_mllama.py +++ b/tests/models/multimodal/generation/test_mllama.py @@ -9,17 +9,24 @@ from vllm import LLM, SamplingParams from vllm.attention.backends.flash_attn import 
FlashAttentionMetadata -from vllm.attention.selector import (_Backend, _cached_get_attn_backend, - global_force_attn_backend_context_manager) +from vllm.attention.selector import ( + _Backend, + _cached_get_attn_backend, + global_force_attn_backend_context_manager, +) from vllm.model_executor.models.mllama import MllamaForConditionalGeneration from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs -from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets, - PromptImageInput, VllmRunner) +from ....conftest import ( + IMAGE_ASSETS, + HfRunner, + ImageTestAssets, + PromptImageInput, + VllmRunner, +) from ....quantization.utils import is_quant_method_supported -from ....utils import (create_new_process_for_each_test, large_gpu_test, - multi_gpu_test) +from ....utils import create_new_process_for_each_test, large_gpu_test, multi_gpu_test from ...utils import check_logprobs_close _LIMIT_IMAGE_PER_PROMPT = 3 @@ -27,12 +34,12 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "<|image|><|begin_of_text|>The meaning of the image is", - "cherry_blossom": - "<|image|><|begin_of_text|>The city is", -}) +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts( + { + "stop_sign": "<|image|><|begin_of_text|>The meaning of the image is", + "cherry_blossom": "<|image|><|begin_of_text|>The city is", + } +) text_only_prompts = [ "The color of the sky is blue but sometimes it can also be", @@ -43,32 +50,57 @@ ] # Indices for inputs -TEXT_ONLY = '0' -IMAGE_AT_BEG = '1' -IMAGE_AT_MIDDLE = '2' -TWO_IMAGES = '3' +TEXT_ONLY = "0" +IMAGE_AT_BEG = "1" +IMAGE_AT_MIDDLE = "2" +TWO_IMAGES = "3" # Input tokenized prompt_data = { # Tell me a story TEXT_ONLY: [41551, 757, 264, 3446], # <|image|> What's the content of this image - IMAGE_AT_BEG: - [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220], + IMAGE_AT_BEG: [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220], # Hello <|image|>What' the content of this image - IMAGE_AT_MIDDLE: - [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217], - #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? # noqa: E501 + IMAGE_AT_MIDDLE: [ + 9906, + 220, + MLLAMA_IMAGE_TOKEN_ID, + 3923, + 6, + 279, + 2262, + 315, + 420, + 2217, + ], + # <|image|>Is there a duck in this image?<|image|>What's the animal in this image? 
# noqa: E501 TWO_IMAGES: [ - MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30, - MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30 - ] + MLLAMA_IMAGE_TOKEN_ID, + 3957, + 1070, + 264, + 37085, + 304, + 420, + 2217, + 30, + MLLAMA_IMAGE_TOKEN_ID, + 3923, + 596, + 279, + 10065, + 304, + 420, + 2217, + 30, + ], } -def vllm_to_hf_output(vllm_output: tuple[list[int], str, - Optional[SampleLogprobs]], - model: str): +def vllm_to_hf_output( + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str +): """Sanitize vllm output to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -79,7 +111,8 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str, eos_token_id = tokenizer.eos_token_id hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) + token_id + for idx, token_id in enumerate(output_ids) if token_id != image_token_id or output_ids[idx - 1] != image_token_id ] @@ -99,24 +132,28 @@ def _get_inputs( images = [asset.pil_image for asset in image_assets] if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + inputs_per_image = [ + ( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) + for image, prompt in zip(images, HF_IMAGE_PROMPTS) + ] elif sizes is not None: - inputs_per_image = [( - [ - prompt if size is not None else text_only_prompts[0] - for size in sizes - ], - [ - image.resize(size) if size is not None else None - for size in sizes - ], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + inputs_per_image = [ + ( + [ + prompt if size is not None else text_only_prompts[0] + for size in sizes + ], + [image.resize(size) if size is not None else None for size in sizes], + ) + for image, prompt in zip(images, HF_IMAGE_PROMPTS) + ] if len(sizes) == 0: inputs_per_image.append( - (text_only_prompts, [None] * len(text_only_prompts))) + (text_only_prompts, [None] * len(text_only_prompts)) + ) else: raise ValueError("You must provide either `size_factors` or `sizes`") @@ -136,8 +173,7 @@ def run_test( num_logprobs: int, tensor_parallel_size: int, distributed_executor_backend: Optional[str] = None, -): - ... +): ... @overload @@ -153,8 +189,7 @@ def run_test( num_logprobs: int, tensor_parallel_size: int, distributed_executor_backend: Optional[str] = None, -): - ... +): ... def run_test( @@ -200,7 +235,7 @@ def _run_test( All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects + For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. 
@@ -212,41 +247,39 @@ def _run_test( # max_model_len should be greater than image_feature_size with vllm_runner( - model, - dtype=dtype, - max_model_len=19212, # 3 max size images - max_num_seqs=3, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - limit_mm_per_prompt={"image": - _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: + model, + dtype=dtype, + max_model_len=19212, # 3 max size images + max_num_seqs=3, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT}, + ) as vllm_model: vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) + vllm_model.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs=num_logprobs, images=images + ) for prompts, images in inputs ] - with hf_runner(model, - dtype=dtype, - model_kwargs={"device_map": "auto"}, - auto_cls=AutoModelForImageTextToText) as hf_model: + with hf_runner( + model, + dtype=dtype, + model_kwargs={"device_map": "auto"}, + auto_cls=AutoModelForImageTextToText, + ) as hf_model: hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) + hf_model.generate_greedy_logprobs_limit( + prompts, max_tokens, num_logprobs=num_logprobs, images=images + ) for prompts, images in inputs ] - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs + vllm_to_hf_output(vllm_output, model) for vllm_output in vllm_outputs ], name_0="hf", name_1="vllm", @@ -273,26 +306,51 @@ def clear_cache(): # Single-size, batched [(512, 512), (512, 512), (512, 512)], # Multi-size, batched - [(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024), - (1024, 1024), (512, 1536), (512, 2028)], + [ + (512, 512), + (1024, 512), + (1536, 512), + (2048, 512), + (512, 1024), + (1024, 1024), + (512, 1536), + (512, 2028), + ], # Multi-size, batched, including text only - [(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024), - (1024, 1024), (512, 1536), (512, 2028), None], + [ + (512, 512), + (1024, 512), + (1536, 512), + (2048, 512), + (512, 1024), + (1024, 1024), + (512, 1536), + (512, 2028), + None, + ], # mllama has 8 possible aspect ratios, carefully set the sizes # to cover all of them - ]) + ], +) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, - model, sizes, dtype, max_tokens, - num_logprobs, - attn_backend: _Backend) -> None: +def test_models_single_leading_image( + hf_runner, + vllm_runner, + image_assets, + model, + sizes, + dtype, + max_tokens, + num_logprobs, + attn_backend: _Backend, +) -> None: with global_force_attn_backend_context_manager(attn_backend): if attn_backend == _Backend.FLASH_ATTN: # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' + dtype = "bfloat16" run_test( hf_runner, vllm_runner, @@ -313,36 +371,45 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, 
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, num_logprobs, - attn_backend: _Backend) -> None: - +def test_models_multi_leading_images( + hf_runner, + vllm_runner, + image_assets, + model, + dtype, + max_tokens, + num_logprobs, + attn_backend: _Backend, +) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image - inputs = [( - [ - "<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501 - "<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501 - "<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.", # noqa: E501 - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes. + inputs = [ + ( [ - stop_sign.resize((512, 512)), - stop_sign, + "<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501 + "<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501 + "<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.", # noqa: E501 ], [ - stop_sign, - stop_sign.resize((512, 1536)), - cherry_blossom.resize((512, 1024)), + [stop_sign, cherry_blossom], + # Images with different sizes. + [ + stop_sign.resize((512, 512)), + stop_sign, + ], + [ + stop_sign, + stop_sign.resize((512, 1536)), + cherry_blossom.resize((512, 1024)), + ], ], - ])] + ) + ] with global_force_attn_backend_context_manager(attn_backend): if attn_backend == _Backend.FLASH_ATTN: # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' + dtype = "bfloat16" _run_test( hf_runner, vllm_runner, @@ -362,27 +429,36 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, - dtype, max_tokens, num_logprobs, - attn_backend: _Backend) -> None: - +def test_models_interleaved_images( + hf_runner, + vllm_runner, + image_assets, + model, + dtype, + max_tokens, + num_logprobs, + attn_backend: _Backend, +) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image - inputs = [( - [ - "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501 - "<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, " # noqa: E501 - "which is a stop sign and which is a cherry blossom?", # noqa: E501 - ], - [ - [stop_sign], - [stop_sign, cherry_blossom], - ])] + inputs = [ + ( + [ + "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501 + "<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, " # noqa: E501 + "which is a stop sign and which is a cherry blossom?", # noqa: E501 + ], + [ + [stop_sign], + [stop_sign, cherry_blossom], + ], + ) + ] with global_force_attn_backend_context_manager(attn_backend): if attn_backend == _Backend.FLASH_ATTN: # Flash Attention works only with bfloat16 data-type - dtype = 'bfloat16' + dtype = "bfloat16" _run_test( hf_runner, vllm_runner, @@ -431,8 +507,10 @@ def test_models_distributed( @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["float16"]) @pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), - reason='bitsandbytes is 
not supported on this GPU type.') +@pytest.mark.skipif( + not is_quant_method_supported("bitsandbytes"), + reason="bitsandbytes is not supported on this GPU type.", +) def test_bnb_regression( image_assets: ImageTestAssets, model: str, @@ -443,13 +521,10 @@ def test_bnb_regression( prompts = [ { "prompt": "<|begin_of_text|>The content of the image <|image|> is", - "multi_modal_data": { - "image": stop_sign - }, + "multi_modal_data": {"image": stop_sign}, }, { - "prompt": - "The color of the sky is blue but sometimes it can also be", + "prompt": "The color of the sky is blue but sometimes it can also be", }, ] # Test regression about QKVCrossParallelLinear @@ -519,8 +594,8 @@ def test_explicit_implicit_prompt( ) outputs = llm.generate(prompts, sampling_params) n_prompts = len(prompts) - explicit_outputs = outputs[:n_prompts // 2] - implicit_outputs = outputs[n_prompts // 2:] + explicit_outputs = outputs[: n_prompts // 2] + implicit_outputs = outputs[n_prompts // 2 :] for exp_output, imp_output in zip(explicit_outputs, implicit_outputs): assert exp_output.outputs[0].text == imp_output.outputs[0].text @@ -532,20 +607,28 @@ def test_explicit_implicit_prompt( @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) -def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, - num_logprobs, attn_backend: _Backend) -> None: - +def test_regression( + vllm_runner, + image_assets, + model, + dtype, + max_tokens, + num_logprobs, + attn_backend: _Backend, +) -> None: stop_sign = image_assets[0].pil_image - with global_force_attn_backend_context_manager(attn_backend), vllm_runner( + with ( + global_force_attn_backend_context_manager(attn_backend), + vllm_runner( model, dtype=dtype, max_model_len=8192, max_num_seqs=4, tensor_parallel_size=1, - limit_mm_per_prompt={"image": - _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: - + limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT}, + ) as vllm_model, + ): # Regression tests for https://github.com/vllm-project/vllm/issues/10648 # Number of groups of image tokens is greater than the number of images @@ -553,10 +636,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images" # noqa: E501 image = stop_sign with pytest.raises(ValueError): - vllm_model.generate_greedy_logprobs([prompt], - max_tokens, - num_logprobs, - images=[image]) + vllm_model.generate_greedy_logprobs( + [prompt], max_tokens, num_logprobs, images=[image] + ) # Batch of a text-only and image request that requires cross-attention prompts = [ @@ -567,10 +649,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, None, [stop_sign], ] - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs, - images=images) + vllm_model.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs, images=images + ) # Test the reverse order too for good measure prompts = [ @@ -581,10 +662,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, [stop_sign], None, ] - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs, - images=images) + vllm_model.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs, images=images + ) # Mixed batch with text and images with different numbers of tiles prompts = [ @@ -598,10 +678,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, # smaller image must be 2nd 
for the repro [stop_sign.resize((448, 448))], ] - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs, - images=images) + vllm_model.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs, images=images + ) class DummyModel: @@ -612,22 +691,25 @@ class DummyModel: @pytest.mark.parametrize( "input_indices_and_output", # inputs, (cross_attention_mask, kv_range_for_decode) - [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)), - ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)), - ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])), - ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])), - ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], - ((23, 24), [[0, 6], [6, 12]])), - ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])), - ([TWO_IMAGES], ((18, 12), [[6, 12]])), - ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))]) + [ + ([TEXT_ONLY], (None, None)), + ([IMAGE_AT_BEG], (None, None)), + ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)), + ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])), + ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])), + ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], ((23, 24), [[0, 6], [6, 12]])), + ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])), + ([TWO_IMAGES], ((18, 12), [[6, 12]])), + ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]])), + ], +) def test_get_cross_attention_mask(input_indices_and_output) -> None: - input_indices, expected_output = input_indices_and_output sequences = [torch.tensor(prompt_data[i]) for i in input_indices] - num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices - if i != TEXT_ONLY] + num_tiles = [ + [2, 2] if i != TEXT_ONLY else [] for i in input_indices if i != TEXT_ONLY + ] input = torch.cat(sequences) seq_lens = [len(s) for s in sequences] @@ -651,16 +733,18 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None: dummy = DummyModel() - cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\ - .get_cross_attention_mask(dummy, - input, - attn_data, - num_tiles=num_tiles, - num_tokens_per_tile=3, - dtype=torch.bfloat16) + cross_attention_mask, kv_range_for_decode = ( + MllamaForConditionalGeneration.get_cross_attention_mask( + dummy, + input, + attn_data, + num_tiles=num_tiles, + num_tokens_per_tile=3, + dtype=torch.bfloat16, + ) + ) - expected_cross_attention_mask, expected_kv_range_for_decode = \ - expected_output + expected_cross_attention_mask, expected_kv_range_for_decode = expected_output assert kv_range_for_decode == expected_kv_range_for_decode if expected_cross_attention_mask is not None: @@ -673,11 +757,19 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None: @pytest.mark.core_model @pytest.mark.parametrize( "input_indices", - [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE], - [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], - [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]]) + [ + [TEXT_ONLY], + [IMAGE_AT_BEG], + [TEXT_ONLY, IMAGE_AT_BEG], + [IMAGE_AT_MIDDLE], + [TEXT_ONLY, IMAGE_AT_MIDDLE], + [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], + [IMAGE_AT_MIDDLE, TEXT_ONLY], + [TWO_IMAGES], + [TEXT_ONLY, TWO_IMAGES], + ], +) def test_get_full_text_row_masked_out_mask(input_indices) -> None: - sequences = [torch.tensor(prompt_data[i]) for i in input_indices] seq_lens = [len(s) for s in sequences] @@ -708,10 +800,11 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None: dummy = DummyModel() - full_text_row_masked_out_mask = MllamaForConditionalGeneration\ - 
.get_full_text_row_masked_out_mask(dummy, - attn_data, - torch.get_default_device()) + full_text_row_masked_out_mask = ( + MllamaForConditionalGeneration.get_full_text_row_masked_out_mask( + dummy, attn_data, torch.get_default_device() + ) + ) full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze() full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist() @@ -721,30 +814,33 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None: for i, seq_len in enumerate(seq_lens): must_be_masked = input_indices[i] != TEXT_ONLY for _ in range(seq_len): - assert full_text_row_masked_out_mask[idx] == must_be_masked, \ - f"full_text_row_masked_out_mask[{idx}] must be " \ - f"'{must_be_masked}' " + assert full_text_row_masked_out_mask[idx] == must_be_masked, ( + f"full_text_row_masked_out_mask[{idx}] must be '{must_be_masked}' " + ) idx += 1 @pytest.mark.core_model -@pytest.mark.parametrize("encoder_seq_lens, num_tiles, expected", [ - ([6404], [[4]], [6404]), - ([0, 6404], [[4]], [6404]), - ([0, 1601, 8005], [[1], [4, 1]], [1601, 8005]), - ([0, 19212, 0, 3202], [[4, 4, 4], [2]], [19212, 3202]), -]) -def test_parse_and_validate_encoder_lens(encoder_seq_lens, num_tiles, - expected) -> None: - +@pytest.mark.parametrize( + "encoder_seq_lens, num_tiles, expected", + [ + ([6404], [[4]], [6404]), + ([0, 6404], [[4]], [6404]), + ([0, 1601, 8005], [[1], [4, 1]], [1601, 8005]), + ([0, 19212, 0, 3202], [[4, 4, 4], [2]], [19212, 3202]), + ], +) +def test_parse_and_validate_encoder_lens(encoder_seq_lens, num_tiles, expected) -> None: dummy = DummyModel() num_tokens_per_tile = 1601 - actual_encoder_seq_lens = MllamaForConditionalGeneration \ - ._get_and_validate_encoder_lens( + actual_encoder_seq_lens = ( + MllamaForConditionalGeneration._get_and_validate_encoder_lens( dummy, encoder_seq_lens, num_tiles, num_tokens_per_tile, ) - assert actual_encoder_seq_lens == expected, \ + ) + assert actual_encoder_seq_lens == expected, ( f"Expected {expected} but got {actual_encoder_seq_lens}" + ) diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index 4e8465778e25..2cf6e347a126 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -17,31 +17,39 @@ from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput, - PromptImageInput, VllmRunner) +from ....conftest import ( + IMAGE_ASSETS, + HfRunner, + PromptAudioInput, + PromptImageInput, + VllmRunner, +) from ....utils import large_gpu_test from ...utils import check_logprobs_close -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 - "cherry_blossom": - "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501 -}) -HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501 +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts( + { + "stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501 + "cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501 + } +) +HF_MULTIIMAGE_IMAGE_PROMPT = ( + "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501 +) 
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct") # Since the vision-lora and speech-lora co-exist with the base model, # we have to manually specify the path of the lora weights. vision_lora_path = os.path.join(model_path, "vision-lora") -speech_question = os.path.join(model_path, "examples", - "what_is_shown_in_this_image.wav") +speech_question = os.path.join( + model_path, "examples", "what_is_shown_in_this_image.wav" +) models = [model_path] -def vllm_to_hf_output(vllm_output: tuple[list[int], str, - Optional[SampleLogprobs]], - model: str): +def vllm_to_hf_output( + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str +): """Sanitize vllm output to be comparable with hf output.""" _, output_str, out_logprobs = vllm_output @@ -71,8 +79,7 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str, def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - inputs: Sequence[tuple[list[str], PromptImageInput, - Optional[PromptAudioInput]]], + inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]], model: str, *, max_model_len: int, @@ -98,27 +105,29 @@ def run_test( # will hurt multiprocessing backend with fork method (the default method). # max_model_len should be greater than image_feature_size with vllm_runner( - model, - task="generate", - max_model_len=max_model_len, - max_num_seqs=2, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enable_lora=True, - max_lora_rank=320, - gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI - enforce_eager=True, + model, + task="generate", + max_model_len=max_model_len, + max_num_seqs=2, + dtype=dtype, + limit_mm_per_prompt={"image": mm_limit}, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enable_lora=True, + max_lora_rank=320, + gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI + enforce_eager=True, ) as vllm_model: lora_request = LoRARequest("vision", 1, vision_lora_path) vllm_outputs_per_case = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - audios=audios, - lora_request=lora_request) + vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + audios=audios, + lora_request=lora_request, + ) for prompts, images, audios in inputs ] @@ -127,42 +136,36 @@ def run_test( pytest.skip("HF impl is not compatible with current transformers") hf_model_kwargs = {"_attn_implementation": "sdpa"} - with hf_runner(model, dtype=dtype, - model_kwargs=hf_model_kwargs) as hf_model: - + with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model: hf_processor = hf_model.processor eos_token_id = hf_processor.tokenizer.eos_token_id - def patch_hf_processor(*args, - text="", - images=None, - audio=None, - sampling_rate=None, - **kwargs): + def patch_hf_processor( + *args, text="", images=None, audio=None, sampling_rate=None, **kwargs + ): audios = None if audio is not None and sampling_rate is not None: audios = [(audio, sampling_rate)] - return hf_processor(*args, - text=text, - images=images, - audios=audios, - **kwargs) + return hf_processor( + *args, text=text, images=images, audios=audios, **kwargs + ) hf_model.processor = patch_hf_processor hf_outputs_per_case = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - 
images=images, - audios=audios, - eos_token_id=eos_token_id, - num_logits_to_keep=0) + hf_model.generate_greedy_logprobs_limit( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + audios=audios, + eos_token_id=eos_token_id, + num_logits_to_keep=0, + ) for prompts, images, audios in inputs ] - for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, - vllm_outputs_per_case): + for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, @@ -189,16 +192,27 @@ def patch_hf_processor(*args, @pytest.mark.parametrize("max_model_len", [12800]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_model_len: int, max_tokens: int, - num_logprobs: int) -> None: +def test_models( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors, + dtype: str, + max_model_len: int, + max_tokens: int, + num_logprobs: int, +) -> None: images = [asset.pil_image for asset in image_assets] - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - None, - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + inputs_per_image = [ + ( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + None, + ) + for image, prompt in zip(images, HF_IMAGE_PROMPTS) + ] run_test( hf_runner, @@ -233,16 +247,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, @pytest.mark.parametrize("max_model_len", [25600]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_model_len: int, - max_tokens: int, num_logprobs: int) -> None: +def test_multi_images_models( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors, + dtype: str, + max_model_len: int, + max_tokens: int, + num_logprobs: int, +) -> None: images = [asset.pil_image for asset in image_assets] inputs_per_case = [ ( [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors], + [ + [rescale_image_size(image, factor) for image in images] + for factor in size_factors + ], None, ), ] @@ -266,10 +290,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, @pytest.mark.parametrize("max_model_len", [12800]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str, - max_model_len: int, max_tokens: int, - num_logprobs: int) -> None: - +def test_vision_speech_models( + hf_runner, + vllm_runner, + model, + dtype: str, + max_model_len: int, + max_tokens: int, + num_logprobs: int, +) -> None: # use the example speech question so that the model outputs are reasonable audio = librosa.load(speech_question, sr=None) image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index 1def825ab087..42fbfb99264b 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -38,33 +38,33 @@ def _create_msg_format(urls: list[str]) -> list[dict[str, 
Any]]: - return [{ - "role": - "user", - "content": [{ - "type": "text", - "text": PROMPT, - }] + [{ - "type": "image_url", - "image_url": { - "url": url - } - } for url in urls], - }] + return [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": PROMPT, + } + ] + + [{"type": "image_url", "image_url": {"url": url}} for url in urls], + } + ] def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]: - return [{ - "role": - "user", - "content": [{ - "type": "text", - "content": PROMPT, - }, *({ - "type": "image", - "image": download_image(url) - } for url in urls)], - }] + return [ + { + "role": "user", + "content": [ + { + "type": "text", + "content": PROMPT, + }, + *({"type": "image", "image": download_image(url)} for url in urls), + ], + } + ] def _create_engine_inputs(urls: list[str]) -> TokensPrompt: @@ -137,11 +137,17 @@ def _dump_outputs_w_logprobs( outputs: OutputsLogprobs, filename: "StrPath", ) -> None: - json_data = [(tokens, text, [{ - k: asdict(v) - for k, v in token_logprobs.items() - } for token_logprobs in (logprobs or [])]) - for tokens, text, logprobs in outputs] + json_data = [ + ( + tokens, + text, + [ + {k: asdict(v) for k, v in token_logprobs.items()} + for token_logprobs in (logprobs or []) + ], + ) + for tokens, text, logprobs in outputs + ] with open(filename, "w") as f: json.dump(json_data, f) @@ -151,10 +157,17 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: with open(filename, "rb") as f: json_data = json.load(f) - return [(tokens, text, [{ - int(k): Logprob(**v) - for k, v in token_logprobs.items() - } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data] + return [ + ( + tokens, + text, + [ + {int(k): Logprob(**v) for k, v in token_logprobs.items()} + for token_logprobs in logprobs + ], + ) + for tokens, text, logprobs in json_data + ] @large_gpu_test(min_gb=80) @@ -167,21 +180,19 @@ def test_chat( model: str, dtype: str, ) -> None: - EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs( - FIXTURE_LOGPROBS_CHAT[model]) + EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model]) with vllm_runner( - model, - dtype=dtype, - tokenizer_mode="mistral", - load_format="mistral", - config_format="mistral", - max_model_len=max_model_len, - limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, + model, + dtype=dtype, + tokenizer_mode="mistral", + load_format="mistral", + config_format="mistral", + max_model_len=max_model_len, + limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, ) as vllm_model: outputs = [] for msg in MSGS: - output = vllm_model.model.chat(msg, - sampling_params=SAMPLING_PARAMS) + output = vllm_model.model.chat(msg, sampling_params=SAMPLING_PARAMS) outputs.extend(output) @@ -190,46 +201,58 @@ def test_chat( for i in range(len(logprobs)): assert logprobs[i][-1] is None logprobs[i] = logprobs[i][:-1] - check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS, - outputs_1_lst=logprobs, - name_0="h100_ref", - name_1="output") + check_logprobs_close( + outputs_0_lst=EXPECTED_CHAT_LOGPROBS, + outputs_1_lst=logprobs, + name_0="h100_ref", + name_1="output", + ) @large_gpu_test(min_gb=48) -@pytest.mark.parametrize("prompt,expected_ranges", - [(_create_engine_inputs_hf(IMG_URLS[:1]), - [PlaceholderRange(offset=11, length=494)]), - (_create_engine_inputs_hf(IMG_URLS[1:4]), [ - PlaceholderRange(offset=11, length=266), - PlaceholderRange(offset=277, length=1056), - PlaceholderRange(offset=1333, length=418) - ])]) -def test_multi_modal_placeholders(vllm_runner, prompt, - expected_ranges: 
list[PlaceholderRange], - monkeypatch) -> None: - +@pytest.mark.parametrize( + "prompt,expected_ranges", + [ + ( + _create_engine_inputs_hf(IMG_URLS[:1]), + [PlaceholderRange(offset=11, length=494)], + ), + ( + _create_engine_inputs_hf(IMG_URLS[1:4]), + [ + PlaceholderRange(offset=11, length=266), + PlaceholderRange(offset=277, length=1056), + PlaceholderRange(offset=1333, length=418), + ], + ), + ], +) +def test_multi_modal_placeholders( + vllm_runner, prompt, expected_ranges: list[PlaceholderRange], monkeypatch +) -> None: # This placeholder checking test only works with V0 engine # where `multi_modal_placeholders` is returned with `RequestOutput` monkeypatch.setenv("VLLM_USE_V1", "0") with vllm_runner( - "mistral-community/pixtral-12b", - max_model_len=8192, - limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, + "mistral-community/pixtral-12b", + max_model_len=8192, + limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, ) as vllm_model: outputs = vllm_model.model.generate(prompt) assert len(outputs) == 1, f"{len(outputs)=}" output: RequestOutput = outputs[0] - assert hasattr(output, - "multi_modal_placeholders"), f"{output.__dict__=}" - assert "image" in output.multi_modal_placeholders, \ + assert hasattr(output, "multi_modal_placeholders"), f"{output.__dict__=}" + assert "image" in output.multi_modal_placeholders, ( f"{output.multi_modal_placeholders.keys()=}" - image_placeholder_ranges: list[ - PlaceholderRange] = output.multi_modal_placeholders["image"] - assert len(image_placeholder_ranges) == len( - expected_ranges), f"{image_placeholder_ranges=}" - for real_range, expected_range in zip(image_placeholder_ranges, - expected_ranges): - assert real_range == expected_range, \ - f"{real_range=} {expected_range=}" + ) + image_placeholder_ranges: list[PlaceholderRange] = ( + output.multi_modal_placeholders["image"] + ) + assert len(image_placeholder_ranges) == len(expected_ranges), ( + f"{image_placeholder_ranges=}" + ) + for real_range, expected_range in zip( + image_placeholder_ranges, expected_ranges + ): + assert real_range == expected_range, f"{real_range=} {expected_range=}" diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index a2793b8c8ddf..46acaabed5b2 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -11,8 +11,13 @@ from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video -from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, - PromptVideoInput, VllmRunner) +from ....conftest import ( + IMAGE_ASSETS, + VIDEO_ASSETS, + PromptImageInput, + PromptVideoInput, + VllmRunner, +) from ...utils import check_logprobs_close @@ -21,7 +26,7 @@ def use_v0_only(monkeypatch): """ V1 Test: batch_make_xxxxx_embeddings calls a V0 internal """ - monkeypatch.setenv('VLLM_USE_V1', '0') + monkeypatch.setenv("VLLM_USE_V1", "0") models = ["Qwen/Qwen2-VL-2B-Instruct"] @@ -36,28 +41,29 @@ def qwen2_vl_chat_template(*query): return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501 -IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - qwen2_vl_chat_template( - IMAGE_PLACEHOLDER, - "What is the biggest text's content in this image?", - ), - "cherry_blossom": - qwen2_vl_chat_template( - IMAGE_PLACEHOLDER, - "What is the season shown in this image? 
", - "Reply with a short sentence (no more than 20 words)", - ), -}) - -VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ - "baby_reading": - qwen2_vl_chat_template( - VIDEO_PLACEHOLDER, - "Describe this video with a short sentence ", - "(no more than 20 words)", - ), -}) +IMAGE_PROMPTS = IMAGE_ASSETS.prompts( + { + "stop_sign": qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the biggest text's content in this image?", + ), + "cherry_blossom": qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the season shown in this image? ", + "Reply with a short sentence (no more than 20 words)", + ), + } +) + +VIDEO_PROMPTS = VIDEO_ASSETS.prompts( + { + "baby_reading": qwen2_vl_chat_template( + VIDEO_PLACEHOLDER, + "Describe this video with a short sentence ", + "(no more than 20 words)", + ), + } +) MULTIIMAGE_PROMPT = qwen2_vl_chat_template( IMAGE_PLACEHOLDER, @@ -79,17 +85,19 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( - image_batches: list[Union[Image.Image, list[Image.Image]]], processor, - llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]: + image_batches: list[Union[Image.Image, list[Image.Image]]], + processor, + llm: VllmRunner, +) -> list[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL - This will infer all images' embeddings in a single batch, + This will infer all images' embeddings in a single batch, and split the result according to input batches. image_batches: - Single-image batches: `list[Image.Image]` - Multiple-image batches: `list[list[Image.Image]]]` - + returns: `list[Qwen2VLPromptImageEmbeddingInput]` """ @@ -110,9 +118,9 @@ def batch_make_image_embeddings( # image to pixel values image_processor = processor.image_processor - preprocess_result = image_processor \ - .preprocess(images=images, return_tensors="pt") \ - .data + preprocess_result = image_processor.preprocess( + images=images, return_tensors="pt" + ).data pixel_values = preprocess_result["pixel_values"] image_grid_thw = preprocess_result["image_grid_thw"] @@ -121,12 +129,11 @@ def get_image_embeds(model): with torch.no_grad(): visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to(visual.device, - dtype=torch.int64) - return visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to( + visual.device, dtype=torch.int64 + ) + return visual(pixel_values_on_device, grid_thw=image_grid_thw_on_device) # V1 Test: this calls a V0 internal. 
image_embeds = torch.concat(llm.apply_model(get_image_embeds)) @@ -140,16 +147,21 @@ def get_image_embeds(model): merge_size = image_processor.merge_size cur_batch_embed_len = sum( grid_thw.prod(-1) // merge_size // merge_size - for grid_thw in image_grid_thw[image_counter:image_counter + - cur_batch_image_count]) + for grid_thw in image_grid_thw[ + image_counter : image_counter + cur_batch_image_count + ] + ) - result.append({ - "image_embeds": - image_embeds[embed_counter:embed_counter + cur_batch_embed_len], - "image_grid_thw": - image_grid_thw[image_counter:image_counter + - cur_batch_image_count], - }) + result.append( + { + "image_embeds": image_embeds[ + embed_counter : embed_counter + cur_batch_embed_len + ], + "image_grid_thw": image_grid_thw[ + image_counter : image_counter + cur_batch_image_count + ], + } + ) embed_counter += cur_batch_embed_len image_counter += cur_batch_image_count @@ -163,13 +175,13 @@ def get_image_embeds(model): def batch_make_video_embeddings( - video_batches: PromptVideoInput, processor, - llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]: + video_batches: PromptVideoInput, processor, llm: VllmRunner +) -> list[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. - This will infer all videos' embeddings in a single batch, + This will infer all videos' embeddings in a single batch, and split the result according to input batches. video_batches: @@ -194,9 +206,9 @@ def batch_make_video_embeddings( # video to pixel values image_processor = processor.image_processor - preprocess_result = image_processor \ - .preprocess(images=None, videos=videos, return_tensors="pt") \ - .data + preprocess_result = image_processor.preprocess( + images=None, videos=videos, return_tensors="pt" + ).data pixel_values = preprocess_result["pixel_values_videos"] video_grid_thw = preprocess_result["video_grid_thw"] @@ -205,12 +217,11 @@ def get_image_embeds(model): with torch.no_grad(): visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to(visual.device, - dtype=torch.int64) - return visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to( + visual.device, dtype=torch.int64 + ) + return visual(pixel_values_on_device, grid_thw=video_grid_thw_on_device) # V1 Test: this calls a V0 internal. 
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) @@ -224,16 +235,21 @@ def get_image_embeds(model): merge_size = image_processor.merge_size cur_batch_embed_len = sum( grid_thw.prod(-1) // merge_size // merge_size - for grid_thw in video_grid_thw[video_counter:video_counter + - cur_batch_video_count]) + for grid_thw in video_grid_thw[ + video_counter : video_counter + cur_batch_video_count + ] + ) - result.append({ - "video_embeds": - video_embeds[embed_counter:embed_counter + cur_batch_embed_len], - "video_grid_thw": - video_grid_thw[video_counter:video_counter + - cur_batch_video_count], - }) + result.append( + { + "video_embeds": video_embeds[ + embed_counter : embed_counter + cur_batch_embed_len + ], + "video_grid_thw": video_grid_thw[ + video_counter : video_counter + cur_batch_video_count + ], + } + ) embed_counter += cur_batch_embed_len video_counter += cur_batch_video_count @@ -266,25 +282,24 @@ def run_embedding_input_test( processor = AutoProcessor.from_pretrained(model) # max_model_len should be greater than image_feature_size - with vllm_runner(model, - task="generate", - max_model_len=4000, - max_num_seqs=3, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: - + with vllm_runner( + model, + task="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={"image": mm_limit, "video": mm_limit}, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: outputs_per_case_for_original_input = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images or None, - videos=videos or None) + vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None, + ) for prompts, images, videos in inputs ] @@ -293,17 +308,19 @@ def run_embedding_input_test( prompts, max_tokens, num_logprobs=num_logprobs, - images=batch_make_image_embeddings( - images, processor, vllm_model) if images else None, - videos=batch_make_video_embeddings( - videos, processor, vllm_model) if videos else None) + images=batch_make_image_embeddings(images, processor, vllm_model) + if images + else None, + videos=batch_make_video_embeddings(videos, processor, vllm_model) + if videos + else None, + ) for prompts, images, videos in inputs ] - for outputs_for_original_input, \ - outputs_for_embeddings_input \ - in zip(outputs_per_case_for_original_input, - outputs_per_case_for_embeddings_input): + for outputs_for_original_input, outputs_for_embeddings_input in zip( + outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input + ): check_logprobs_close( outputs_0_lst=outputs_for_original_input, outputs_1_lst=outputs_for_embeddings_input, @@ -328,18 +345,25 @@ def run_embedding_input_test( @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, - size_factors, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: +def test_qwen2_vl_image_embeddings_input( + vllm_runner, + image_assets, + model, + size_factors, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: images = [asset.pil_image for asset in image_assets] - inputs_per_case: list[tuple[ - 
list[str], PromptImageInput, PromptVideoInput]] = [( + inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [ + ( [prompt for _ in size_factors], [rescale_image_size(image, factor) for factor in size_factors], [], - ) for image, prompt in zip(images, IMAGE_PROMPTS)] + ) + for image, prompt in zip(images, IMAGE_PROMPTS) + ] run_embedding_input_test( vllm_runner, @@ -370,21 +394,27 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, - model, size_factors, - dtype: str, max_tokens: int, - num_logprobs: int) -> None: +def test_qwen2_vl_multiple_image_embeddings_input( + vllm_runner, + image_assets, + model, + size_factors, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: images = [asset.pil_image for asset in image_assets] - inputs_per_case: list[tuple[list[str], PromptImageInput, - PromptVideoInput]] = [( - [MULTIIMAGE_PROMPT for _ in size_factors], - [[ - rescale_image_size(image, factor) - for image in images - ] for factor in size_factors], - [], - )] + inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [ + ( + [MULTIIMAGE_PROMPT for _ in size_factors], + [ + [rescale_image_size(image, factor) for image in images] + for factor in size_factors + ], + [], + ) + ] run_embedding_input_test( vllm_runner, @@ -414,22 +444,29 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, - size_factors, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: +def test_qwen2_vl_video_embeddings_input( + vllm_runner, + video_assets, + model, + size_factors, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: num_frames = 4 sampled_vids = [ sample_frames_from_video(asset.np_ndarrays, num_frames) for asset in video_assets ] - inputs_per_case: list[tuple[ - list[str], PromptImageInput, PromptVideoInput]] = [( + inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [ + ( [prompt for _ in size_factors], [], [rescale_video_size(video, factor) for factor in size_factors], - ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)] + ) + for video, prompt in zip(sampled_vids, VIDEO_PROMPTS) + ] run_embedding_input_test( vllm_runner, diff --git a/tests/models/multimodal/generation/test_ultravox.py b/tests/models/multimodal/generation/test_ultravox.py index e7e7bd3154a1..da1e7c7486fd 100644 --- a/tests/models/multimodal/generation/test_ultravox.py +++ b/tests/models/multimodal/generation/test_ultravox.py @@ -15,12 +15,12 @@ MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" -AUDIO_PROMPTS = AUDIO_ASSETS.prompts({ - "mary_had_lamb": - "Transcribe this into English.", - "winning_call": - "What is happening in this audio clip?", -}) +AUDIO_PROMPTS = AUDIO_ASSETS.prompts( + { + "mary_had_lamb": "Transcribe this into English.", + "winning_call": "What is happening in this audio clip?", + } +) MULTI_AUDIO_PROMPT = "Describe each of the audios above." @@ -33,7 +33,7 @@ "enable_chunked_prefill": True, "max_num_seqs": 2, # Use a very small limit to exercise chunked prefill. 
- "max_num_batched_tokens": 16 + "max_num_batched_tokens": 16, } @@ -43,27 +43,33 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]: for key, value in params_kwargs.items(): if isinstance(value, bool): if value: - args.append(f"--{key.replace('_','-')}") + args.append(f"--{key.replace('_', '-')}") else: - args.append(f"--{key.replace('_','-')}={value}") + args.append(f"--{key.replace('_', '-')}={value}") return args -@pytest.fixture(params=[ - pytest.param({}, marks=pytest.mark.cpu_model), - pytest.param(CHUNKED_PREFILL_KWARGS), -]) +@pytest.fixture( + params=[ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), + ] +) def server(request, audio_assets: AudioTestAssets): args = [ - "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager", + "--dtype", + "bfloat16", + "--max-model-len", + "4096", + "--enforce-eager", "--limit-mm-per-prompt", - json.dumps({"audio": len(audio_assets)}), "--trust-remote-code" + json.dumps({"audio": len(audio_assets)}), + "--trust-remote-code", ] + params_kwargs_to_cli_args(request.param) - with RemoteOpenAIServer(MODEL_NAME, - args, - env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": - "30"}) as remote_server: + with RemoteOpenAIServer( + MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"} + ) as remote_server: yield remote_server @@ -77,12 +83,11 @@ def _get_prompt(audio_count, question, placeholder): tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) placeholder = f"{placeholder}\n" * audio_count - return tokenizer.apply_chat_template([{ - 'role': 'user', - 'content': f"{placeholder}{question}" - }], - tokenize=False, - add_generation_prompt=True) + return tokenizer.apply_chat_template( + [{"role": "user", "content": f"{placeholder}{question}"}], + tokenize=False, + add_generation_prompt=True, + ) def run_multi_audio_test( @@ -99,19 +104,21 @@ def run_multi_audio_test( model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - with vllm_runner(model, - dtype=dtype, - enforce_eager=True, - limit_mm_per_prompt={ - "audio": - max((len(audio) for _, audio in prompts_and_audios)) - }, - **kwargs) as vllm_model: + with vllm_runner( + model, + dtype=dtype, + enforce_eager=True, + limit_mm_per_prompt={ + "audio": max((len(audio) for _, audio in prompts_and_audios)) + }, + **kwargs, + ) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( [prompt for prompt, _ in prompts_and_audios], max_tokens, num_logprobs=num_logprobs, - audios=[audios for _, audios in prompts_and_audios]) + audios=[audios for _, audios in prompts_and_audios], + ) # The HuggingFace model doesn't support multiple audios yet, so # just assert that some tokens were generated. 
@@ -122,21 +129,25 @@ def run_multi_audio_test( @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("vllm_kwargs", [ - pytest.param({}, marks=pytest.mark.cpu_model), - pytest.param(CHUNKED_PREFILL_KWARGS), -]) -def test_models_with_multiple_audios(vllm_runner, - audio_assets: AudioTestAssets, dtype: str, - max_tokens: int, num_logprobs: int, - vllm_kwargs: dict) -> None: - - vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, - VLLM_PLACEHOLDER) +@pytest.mark.parametrize( + "vllm_kwargs", + [ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), + ], +) +def test_models_with_multiple_audios( + vllm_runner, + audio_assets: AudioTestAssets, + dtype: str, + max_tokens: int, + num_logprobs: int, + vllm_kwargs: dict, +) -> None: + vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER) run_multi_audio_test( vllm_runner, - [(vllm_prompt, [audio.audio_and_sample_rate - for audio in audio_assets])], + [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])], MODEL_NAME, dtype=dtype, max_tokens=max_tokens, @@ -149,28 +160,25 @@ def test_models_with_multiple_audios(vllm_runner, async def test_online_serving(client, audio_assets: AudioTestAssets): """Exercises online serving with/without chunked prefill enabled.""" - messages = [{ - "role": - "user", - "content": [ - *[{ - "type": "audio_url", - "audio_url": { - "url": audio.url - } - } for audio in audio_assets], - { - "type": - "text", - "text": - f"What's happening in these {len(audio_assets)} audio clips?" - }, - ], - }] - - chat_completion = await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - max_tokens=10) + messages = [ + { + "role": "user", + "content": [ + *[ + {"type": "audio_url", "audio_url": {"url": audio.url}} + for audio in audio_assets + ], + { + "type": "text", + "text": f"What's happening in these {len(audio_assets)} audio clips?", + }, + ], + } + ] + + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, messages=messages, max_tokens=10 + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] diff --git a/tests/models/multimodal/generation/test_voxtral.py b/tests/models/multimodal/generation/test_voxtral.py index b4439dfe020c..aa9628435e4d 100644 --- a/tests/models/multimodal/generation/test_voxtral.py +++ b/tests/models/multimodal/generation/test_voxtral.py @@ -6,8 +6,12 @@ import pytest import pytest_asyncio from mistral_common.audio import Audio -from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio, - TextChunk, UserMessage) +from mistral_common.protocol.instruct.messages import ( + AudioChunk, + RawAudio, + TextChunk, + UserMessage, +) from vllm.transformers_utils.tokenizer import MistralTokenizer @@ -17,8 +21,12 @@ MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507" MISTRAL_FORMAT_ARGS = [ - "--tokenizer_mode", "mistral", "--config_format", "mistral", - "--load_format", "mistral" + "--tokenizer_mode", + "mistral", + "--config_format", + "mistral", + "--load_format", + "mistral", ] @@ -30,10 +38,9 @@ def server(request, audio_assets: AudioTestAssets): json.dumps({"audio": len(audio_assets)}), ] + MISTRAL_FORMAT_ARGS - with RemoteOpenAIServer(MODEL_NAME, - args, - env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": - "30"}) as remote_server: + with RemoteOpenAIServer( + MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"} + ) as remote_server: 
yield remote_server @@ -64,15 +71,17 @@ def _get_prompt(audio_assets, question): @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models_with_multiple_audios(vllm_runner, - audio_assets: AudioTestAssets, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: +def test_models_with_multiple_audios( + vllm_runner, + audio_assets: AudioTestAssets, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT) run_multi_audio_test( vllm_runner, - [(vllm_prompt, [audio.audio_and_sample_rate - for audio in audio_assets])], + [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])], MODEL_NAME, dtype=dtype, max_tokens=max_tokens, @@ -92,23 +101,22 @@ def asset_to_chunk(asset): return audio_dict audio_chunks = [asset_to_chunk(asset) for asset in audio_assets] - messages = [{ - "role": - "user", - "content": [ - *audio_chunks, - { - "type": - "text", - "text": - f"What's happening in these {len(audio_assets)} audio clips?" - }, - ], - }] - - chat_completion = await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - max_tokens=10) + messages = [ + { + "role": "user", + "content": [ + *audio_chunks, + { + "type": "text", + "text": f"What's happening in these {len(audio_assets)} audio clips?", + }, + ], + } + ] + + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, messages=messages, max_tokens=10 + ) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 363d55153aac..7eac8bb1b47a 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -12,8 +12,7 @@ PROMPTS = [ { - "prompt": - "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", "multi_modal_data": { "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, }, @@ -25,9 +24,8 @@ "audio": AudioAsset("winning_call").audio_and_sample_rate, }, }, - "decoder_prompt": - "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", - } + "decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + }, ] EXPECTED = { @@ -41,7 +39,7 @@ " is June and the third base. They're going to wave him in. The throw" " to the plate will be late. The Mariners are going to play for the" " American League Championship. I don't believe it. It just continues" - " by all five." + " by all five.", ], "openai/whisper-small": [ " The first words I spoke in the original pornograph. A little piece" @@ -51,7 +49,7 @@ " comes joy. Here is Junior to third base. They're gonna wave him" " in. The throw to the plate will be late. The Mariners are going to" " play for the American League Championship. I don't believe it. It" - " just continues. My, oh my." + " just continues. My, oh my.", ], "openai/whisper-medium": [ " The first words I spoke in the original phonograph, a little piece" @@ -62,7 +60,7 @@ " Jorgen at third base. They're going to wave him in. The throw to the" " plate will be late. The Mariners are going to play for the American" " League Championship. I don't believe it. It just continues. My, oh" - " my." 
+ " my.", ], "openai/whisper-large-v3": [ " The first words I spoke in the original phonograph, a little piece" @@ -73,7 +71,7 @@ " Junior to third base. They're going to wave him in. The throw to the" " plate will be late. The Mariners are going to play for the American" " League Championship. I don't believe it. It just continues. My, oh," - " my." + " my.", ], "openai/whisper-large-v3-turbo": [ " The first words I spoke in the original phonograph, a little piece" @@ -84,8 +82,8 @@ " Junior to third base. They're going to wave him in. The throw to the" " plate will be late. The Mariners are going to play for the American" " League Championship. I don't believe it. It just continues. My, oh," - " my." - ] + " my.", + ], } @@ -100,11 +98,11 @@ def run_test( expected_list = EXPECTED[model] * 10 with vllm_runner( - model, - dtype="half", - max_model_len=448, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, + model, + dtype="half", + max_model_len=448, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: llm = vllm_model.model @@ -123,7 +121,8 @@ def run_test( @pytest.mark.core_model @pytest.mark.parametrize( - "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"]) + "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"] +) @create_new_process_for_each_test() def test_models(vllm_runner, model) -> None: run_test( diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index 03c08240d6a8..859c2ffd9df1 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Helpers for building inputs that can be leveraged for different test types. -""" +"""Helpers for building inputs that can be leveraged for different test types.""" + from collections.abc import Iterable from pathlib import PosixPath from typing import Callable, Optional, Union @@ -10,20 +10,30 @@ from vllm.multimodal.audio import AudioResampler from vllm.multimodal.image import rescale_image_size -from vllm.multimodal.video import (rescale_video_size, resize_video, - sample_frames_from_video) +from vllm.multimodal.video import ( + rescale_video_size, + resize_video, + sample_frames_from_video, +) from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets -from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS, - TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER, - TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT, - ImageSizeWrapper, PromptWithMultiModalInput, SizeType, - VLMTestInfo) - - -def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int], - str], - test_placeholder: str) -> str: +from .types import ( + SINGLE_AUDIO_BASE_PROMPT, + SINGLE_IMAGE_BASE_PROMPTS, + TEST_AUDIO_PLACEHOLDER, + TEST_IMG_PLACEHOLDER, + TEST_VIDEO_PLACEHOLDER, + VIDEO_BASE_PROMPT, + ImageSizeWrapper, + PromptWithMultiModalInput, + SizeType, + VLMTestInfo, +) + + +def replace_test_placeholder( + prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str +) -> str: """Given a prompt, replaces each test placeholder with the model-specific tag. 
""" @@ -35,11 +45,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int], return img_prompt -def get_model_prompts(base_prompts: Iterable[str], - img_idx_to_prompt: Optional[Callable[[int], str]], - video_idx_to_prompt: Optional[Callable[[int], str]], - audio_idx_to_prompt: Optional[Callable[[int], str]], - prompt_formatter: Callable[[str], str]) -> list[str]: +def get_model_prompts( + base_prompts: Iterable[str], + img_idx_to_prompt: Optional[Callable[[int], str]], + video_idx_to_prompt: Optional[Callable[[int], str]], + audio_idx_to_prompt: Optional[Callable[[int], str]], + prompt_formatter: Callable[[str], str], +) -> list[str]: """Given a model-agnostic base prompt and test configuration for a model(s) to be tested, update the media placeholders and apply the prompt formatting to get the test prompt string for this model. @@ -56,19 +68,19 @@ def get_model_prompts(base_prompts: Iterable[str], # Replace the multimodal placeholders in the base prompt with # the correct ones for the model that we are testing if img_idx_to_prompt: - base_prompt = replace_test_placeholder(base_prompt, - img_idx_to_prompt, - TEST_IMG_PLACEHOLDER) + base_prompt = replace_test_placeholder( + base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER + ) if video_idx_to_prompt: - base_prompt = replace_test_placeholder(base_prompt, - video_idx_to_prompt, - TEST_VIDEO_PLACEHOLDER) + base_prompt = replace_test_placeholder( + base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER + ) if audio_idx_to_prompt: - base_prompt = replace_test_placeholder(base_prompt, - audio_idx_to_prompt, - TEST_AUDIO_PLACEHOLDER) + base_prompt = replace_test_placeholder( + base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER + ) # Apply the prompt formatter to wrap the base prompt with # the correct media placeholders to get the model test prompt @@ -84,14 +96,15 @@ def build_single_image_inputs_from_test_info( tmp_path: Optional[PosixPath] = None, ) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: - raise ValueError( - "Prompt formatter must be set to build single image inputs") + raise ValueError("Prompt formatter must be set to build single image inputs") - model_prompts = get_model_prompts(test_info.single_image_prompts, - test_info.img_idx_to_prompt, - test_info.video_idx_to_prompt, - test_info.audio_idx_to_prompt, - test_info.prompt_formatter) + model_prompts = get_model_prompts( + test_info.single_image_prompts, + test_info.img_idx_to_prompt, + test_info.video_idx_to_prompt, + test_info.audio_idx_to_prompt, + test_info.prompt_formatter, + ) # For models that require a local path / URL encoded in the image; export # assets and encode into tmp_path for this test. 
This should be avoided @@ -110,8 +123,8 @@ def build_single_image_inputs_from_test_info( def build_single_image_inputs( - images, model_prompts, - size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]: + images, model_prompts, size_wrapper: ImageSizeWrapper +) -> list[PromptWithMultiModalInput]: # For every image / prompt pair, get a pair containing two lists of # length size_factors, where the first contains duplicates of the model # prompt [str], and the second contains copies of the image after being @@ -125,7 +138,8 @@ def build_single_image_inputs( apply_image_size_scaling(image, size, size_wrapper.type) for size in size_wrapper.data ], - ) for image, prompt in zip(images, model_prompts) + ) + for image, prompt in zip(images, model_prompts) ] @@ -136,14 +150,15 @@ def build_multi_image_inputs_from_test_info( tmp_path: Optional[PosixPath] = None, ) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: - raise ValueError( - "Prompt formatter must be set to build multi image inputs") + raise ValueError("Prompt formatter must be set to build multi image inputs") - model_prompts = get_model_prompts([test_info.multi_image_prompt], - test_info.img_idx_to_prompt, - test_info.video_idx_to_prompt, - test_info.audio_idx_to_prompt, - test_info.prompt_formatter) + model_prompts = get_model_prompts( + [test_info.multi_image_prompt], + test_info.img_idx_to_prompt, + test_info.video_idx_to_prompt, + test_info.audio_idx_to_prompt, + test_info.prompt_formatter, + ) if test_info.prompt_path_encoder is not None: if tmp_path is None: @@ -164,16 +179,20 @@ def build_multi_image_inputs_from_test_info( def build_multi_image_inputs( - image_lists, model_prompts, - size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]: + image_lists, model_prompts, size_wrapper: ImageSizeWrapper +) -> list[PromptWithMultiModalInput]: return [ PromptWithMultiModalInput( prompts=[prompt for _ in size_wrapper.data], - image_data=[[ - apply_image_size_scaling(image, size, size_wrapper.type) - for image in images - ] for size in size_wrapper.data], - ) for images, prompt in zip(image_lists, model_prompts) + image_data=[ + [ + apply_image_size_scaling(image, size, size_wrapper.type) + for image in images + ] + for size in size_wrapper.data + ], + ) + for images, prompt in zip(image_lists, model_prompts) ] @@ -185,10 +204,10 @@ def build_embedding_inputs_from_test_info( # These conditions will always be true if invoked through filtering, # but we still check them in case this is ever called directly if test_info.prompt_formatter is None: - raise ValueError( - "Prompt formatter must be set to build image embedding inputs") - if size_wrapper.type != SizeType.SIZE_FACTOR or not \ - all(factor == 1.0 for factor in size_wrapper.data): + raise ValueError("Prompt formatter must be set to build image embedding inputs") + if size_wrapper.type != SizeType.SIZE_FACTOR or not all( + factor == 1.0 for factor in size_wrapper.data + ): raise ValueError("Embedding tests require constant (1.0) size factors") if test_info.convert_assets_to_embeddings is None: raise ValueError("No conversion func for getting embeddings found") @@ -209,8 +228,7 @@ def build_embedding_inputs_from_test_info( assert len(images) == len(model_prompts) inputs = build_single_image_inputs(images, model_prompts, size_wrapper) - vllm_embeddings = build_single_image_inputs(embeds, model_prompts, - size_wrapper) + vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper) return inputs, vllm_embeddings @@ -235,21 
+253,22 @@ def build_video_inputs_from_test_info( for asset in video_assets ] - video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE - else rescale_video_size) + video_scaler = ( + resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size + ) return [ PromptWithMultiModalInput( prompts=[prompt for _ in size_wrapper.data], - video_data=[ - video_scaler(video, size) for size in size_wrapper.data - ], - ) for video, prompt in zip(sampled_vids, model_prompts) + video_data=[video_scaler(video, size) for size in size_wrapper.data], + ) + for video, prompt in zip(sampled_vids, model_prompts) ] -def apply_image_size_scaling(image, size: Union[float, tuple[int, int]], - size_type: SizeType): +def apply_image_size_scaling( + image, size: Union[float, tuple[int, int]], size_type: SizeType +): """Applies a size scaler to one image; this can be a an image size factor, which scales the image while maintaining the aspect ratio""" # Special case for embeddings; if it's a tensor, it's only valid if we @@ -285,13 +304,16 @@ def build_audio_inputs_from_test_info( method="librosa", ) audios = [asset.audio_and_sample_rate for asset in audio_assets] - resampled_audios = [( - resampler.resample( - audio, - orig_sr=sr, - ), - int(resampler.target_sr), - ) for audio, sr in audios] + resampled_audios = [ + ( + resampler.resample( + audio, + orig_sr=sr, + ), + int(resampler.target_sr), + ) + for audio, sr in audios + ] return [ PromptWithMultiModalInput( diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py index 336e2dd2b120..fe36dfbf26f6 100644 --- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py +++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py @@ -4,19 +4,28 @@ modality, getting all combinations (similar to pytest's parametrization), handling multimodal placeholder substitution, and so on. """ + import itertools from collections import OrderedDict from collections.abc import Iterable import pytest -from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs, - ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType) +from .types import ( + EMBEDDING_SIZE_FACTORS, + ExpandableVLMTestArgs, + ImageSizeWrapper, + SizeType, + VLMTestInfo, + VLMTestType, +) def get_filtered_test_settings( - test_settings: dict[str, VLMTestInfo], test_type: VLMTestType, - new_proc_per_test: bool) -> dict[str, VLMTestInfo]: + test_settings: dict[str, VLMTestInfo], + test_type: VLMTestType, + new_proc_per_test: bool, +) -> dict[str, VLMTestInfo]: """Given the dict of potential test settings to run, return a subdict of tests who have the current test type enabled with the matching val for fork_per_test. 
@@ -25,7 +34,8 @@ def get_filtered_test_settings( def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType): return test_info.test_type == test_type or ( isinstance(test_info.test_type, Iterable) - and test_type in test_info.test_type) + and test_type in test_info.test_type + ) matching_tests = {} for test_name, test_info in test_settings.items(): @@ -36,62 +46,69 @@ def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType): assert test_info.convert_assets_to_embeddings is not None # Custom test inputs need to explicitly define the mm limit/inputs if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS): - assert (test_info.custom_test_opts is not None - and isinstance(test_info.custom_test_opts, Iterable)) + assert test_info.custom_test_opts is not None and isinstance( + test_info.custom_test_opts, Iterable + ) # For all types besides custom inputs, we need a prompt formatter else: assert test_info.prompt_formatter is not None # Everything looks okay; keep if this is has correct proc handling - if (test_info.distributed_executor_backend - is not None) == new_proc_per_test: + if ( + test_info.distributed_executor_backend is not None + ) == new_proc_per_test: matching_tests[test_name] = test_info return matching_tests -def get_parametrized_options(test_settings: dict[str, VLMTestInfo], - test_type: VLMTestType, - create_new_process_for_each_test: bool): +def get_parametrized_options( + test_settings: dict[str, VLMTestInfo], + test_type: VLMTestType, + create_new_process_for_each_test: bool, +): """Converts all of our VLMTestInfo into an expanded list of parameters. This is similar to nesting pytest parametrize calls, but done directly through an itertools product so that each test can set things like size factors etc, while still running in isolated test cases. """ matching_tests = get_filtered_test_settings( - test_settings, test_type, create_new_process_for_each_test) + test_settings, test_type, create_new_process_for_each_test + ) # Ensure that something is wrapped as an iterable it's not already - ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, ) + ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,) def get_model_type_cases(model_type: str, test_info: VLMTestInfo): # This is essentially the same as nesting a bunch of mark.parametrize # decorators, but we do it programmatically to allow overrides for on # a per-model basis, while still being able to execute each of these # as individual test cases in pytest. 
- iter_kwargs = OrderedDict([ - ("model", ensure_wrapped(test_info.models)), - ("max_tokens", ensure_wrapped(test_info.max_tokens)), - ("num_logprobs", ensure_wrapped(test_info.num_logprobs)), - ("dtype", ensure_wrapped(test_info.dtype)), - ("distributed_executor_backend", - ensure_wrapped(test_info.distributed_executor_backend)), - ]) + iter_kwargs = OrderedDict( + [ + ("model", ensure_wrapped(test_info.models)), + ("max_tokens", ensure_wrapped(test_info.max_tokens)), + ("num_logprobs", ensure_wrapped(test_info.num_logprobs)), + ("dtype", ensure_wrapped(test_info.dtype)), + ( + "distributed_executor_backend", + ensure_wrapped(test_info.distributed_executor_backend), + ), + ] + ) # num_frames is video only if test_type == VLMTestType.VIDEO: - iter_kwargs["num_video_frames"] = ensure_wrapped( - test_info.num_video_frames) + iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames) # No sizes passed for custom inputs, since inputs are directly provided if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO): wrapped_sizes = get_wrapped_test_sizes(test_info, test_type) if wrapped_sizes is None: - raise ValueError( - f"Sizes must be set for test type {test_type}") + raise ValueError(f"Sizes must be set for test type {test_type}") iter_kwargs["size_wrapper"] = wrapped_sizes - #Otherwise expand the custom test options instead + # Otherwise expand the custom test options instead elif test_type == VLMTestType.CUSTOM_INPUTS: if test_info.custom_test_opts is None: raise ValueError("Test has type CUSTOM_INPUTS, but none given") @@ -121,8 +138,8 @@ def get_model_type_cases(model_type: str, test_info: VLMTestInfo): def get_wrapped_test_sizes( - test_info: VLMTestInfo, - test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]: + test_info: VLMTestInfo, test_type: VLMTestType +) -> tuple[ImageSizeWrapper, ...]: """Given a test info which may have size factors or fixed sizes, wrap them and combine them into an iterable, each of which will be used in parameter expansion. 
@@ -133,18 +150,18 @@ def get_wrapped_test_sizes( """ # If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS if test_type == VLMTestType.EMBEDDING: - return tuple([ - ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor) - for factor in EMBEDDING_SIZE_FACTORS - ]) + return tuple( + [ + ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor) + for factor in EMBEDDING_SIZE_FACTORS + ] + ) # Audio and Custom inputs have preprocessed inputs elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS): return tuple() - size_factors = test_info.image_size_factors \ - if test_info.image_size_factors else [] - fixed_sizes = test_info.image_sizes \ - if test_info.image_sizes else [] + size_factors = test_info.image_size_factors if test_info.image_size_factors else [] + fixed_sizes = test_info.image_sizes if test_info.image_sizes else [] wrapped_factors = [ ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor) @@ -152,8 +169,7 @@ def get_wrapped_test_sizes( ] wrapped_sizes = [ - ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) - for size in fixed_sizes + ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes ] return tuple(wrapped_factors + wrapped_sizes) diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index 8c83d8f8a8a2..cefb45227fed 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Core test implementation to be shared across modalities.""" + from typing import Any, Callable, Optional import torch @@ -75,16 +76,18 @@ def run_test( if vllm_runner_kwargs: vllm_runner_kwargs_.update(vllm_runner_kwargs) - with vllm_runner(model, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs, - dtype=dtype, - limit_mm_per_prompt=limit_mm_per_prompt, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=enforce_eager, - task=task, - **vllm_runner_kwargs_) as vllm_model: + with vllm_runner( + model, + max_model_len=max_model_len, + max_num_seqs=max_num_seqs, + dtype=dtype, + limit_mm_per_prompt=limit_mm_per_prompt, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=enforce_eager, + task=task, + **vllm_runner_kwargs_, + ) as vllm_model: tokenizer = vllm_model.model.get_tokenizer() vllm_kwargs: dict[str, Any] = {} @@ -94,21 +97,19 @@ def run_test( vllm_kwargs["stop"] = stop_str for prompts, image_data, video_data, audio_data in vllm_inputs: - mm_data = dict(images=image_data, - videos=video_data, - audios=audio_data) + mm_data = dict(images=image_data, videos=video_data, audios=audio_data) vllm_kwargs_with_mm_data = vllm_kwargs | mm_data vllm_output = vllm_model.generate_greedy_logprobs( prompts, max_tokens, num_logprobs=num_logprobs, - **vllm_kwargs_with_mm_data) + **vllm_kwargs_with_mm_data, + ) vllm_outputs_per_mm.append(vllm_output) - hf_model = hf_runner(model, - dtype=dtype, - auto_cls=auto_cls, - model_kwargs=hf_model_kwargs) + hf_model = hf_runner( + model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs + ) # Some models need to patch things like the model processor, e.g., internvl if patch_hf_runner is not None: @@ -128,16 +129,15 @@ def run_test( hf_kwargs["stop_strings"] = stop_str for prompts, image_data, video_data, 
audio_data in inputs: - mm_data = dict(images=image_data, - videos=video_data, - audios=audio_data) + mm_data = dict(images=image_data, videos=video_data, audios=audio_data) hf_kwargs_with_mm_data = hf_kwargs | mm_data hf_output = hf_model.generate_greedy_logprobs_limit( prompts, max_tokens, num_logprobs=num_logprobs, tokenizer=tokenizer, - **hf_kwargs_with_mm_data) + **hf_kwargs_with_mm_data, + ) hf_outputs_per_mm.append(hf_output) # Apply output processing / sanitation to the vLLM and HF runner results @@ -149,8 +149,7 @@ def run_test( second_runner_processor=vllm_output_post_proc, ) - for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, - vllm_outputs_per_mm): + for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm): # This is usually check_logprobs_close, but it's passed through to # allow things like check_outputs_equal where needed comparator( @@ -170,15 +169,19 @@ def process_runner_outputs( ): """Applies the runner processor(s) to the runner outputs, if any.""" if first_runner_processor is not None: - first_runner_outputs = process_outputs(first_runner_processor, model, - first_runner_outputs) + first_runner_outputs = process_outputs( + first_runner_processor, model, first_runner_outputs + ) if second_runner_processor is not None: - second_runner_outputs = process_outputs(second_runner_processor, model, - second_runner_outputs) + second_runner_outputs = process_outputs( + second_runner_processor, model, second_runner_outputs + ) return first_runner_outputs, second_runner_outputs def process_outputs(output_processor, model, outputs_per_image): """Applies a model specific post-processor function to a runner's output""" - return [[output_processor(res, model) for res in outputs] - for outputs in outputs_per_image] + return [ + [output_processor(res, model) for res in outputs] + for outputs in outputs_per_image + ] diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index c53243b42e38..3886547b8a8b 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom input builders for edge-cases in different models.""" + from io import BytesIO from typing import Callable @@ -8,8 +9,11 @@ from PIL import Image from vllm.multimodal.image import rescale_image_size -from vllm.multimodal.video import (rescale_video_size, resize_video, - sample_frames_from_video) +from vllm.multimodal.video import ( + rescale_video_size, + resize_video, + sample_frames_from_video, +) from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS from .builders import build_multi_image_inputs, build_single_image_inputs @@ -18,7 +22,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]): """Builds inputs for multi-image (varied sizes/aspect ratio) testing. - + Args: formatter: model-specific prompt formatter. 
""" @@ -44,7 +48,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]): stop_sign, rescale_image_size(stop_sign, 0.25), cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) + cherry_blossom.resize((488, 183)), ], cherry_blossom, ] @@ -57,10 +61,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]): ] -def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str], - num_frames: int = 16): +def multi_video_multi_aspect_ratio_inputs( + formatter: Callable[[str], str], num_frames: int = 16 +): """Builds inputs for multi-video (varied sizes/aspect ratio) testing. - + Args: formatter: model-specific prompt formatter. """ @@ -84,7 +89,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str], video, rescale_video_size(video, 0.25), resize_video(video, (183, 488)), - resize_video(video, (488, 183)) + resize_video(video, (488, 183)), ], video, ] @@ -99,7 +104,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str], def different_patch_input_cases_internvl(): images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS] - formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501 + formatter = ( + lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" + ) # noqa: E501 single_img_prompts = [ "\nWhat's the content in the center of the image?", "\nWhat is the season?", @@ -124,8 +131,9 @@ def windows_attention_image_qwen2_5_vl(): question = "Describe the image." img_prompt = "<|vision_start|><|image_pad|><|vision_end|>" - prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n" - "<|im_start|>assistant\n") + prompt = ( + f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n" + ) wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5]) return build_single_image_inputs([image], [prompt], wrapped_sf) @@ -139,8 +147,9 @@ def video_with_metadata_glm4_1v(): formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n" scales = [0.1, 0.2, 0.25] - video_input = [[(rescale_video_size(video_array, scale), metadata)] - for scale in scales] + video_input = [ + [(rescale_video_size(video_array, scale), metadata)] for scale in scales + ] prompts = [formatted_prompt] * len(video_input) return [ diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index c1a2aa0dcafb..1e005b49d72a 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -4,6 +4,7 @@ for manipulating the input / output of HF & vLLM test runners, which are typically specific to a small subset of models. 
""" + import types from pathlib import PosixPath from typing import Optional, Union @@ -14,8 +15,13 @@ import regex as re import torch from PIL.Image import Image -from transformers import (AutoConfig, AutoTokenizer, BatchFeature, - GenerationConfig, GenerationMixin) +from transformers import ( + AutoConfig, + AutoTokenizer, + BatchFeature, + GenerationConfig, + GenerationMixin, +) from transformers.video_utils import VideoMetadata from vllm.sequence import SampleLogprobs @@ -27,8 +33,7 @@ ####### vLLM output processors functions -def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, - model: str) -> RunnerOutput: +def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: """Sanitize vllm output [blip2 models] to be comparable with hf output.""" _, output_str, out_logprobs = vllm_output @@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, return hf_output_ids, hf_output_str, out_logprobs -def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, - model: str) -> RunnerOutput: +def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: """Sanitize vllm output [fuyu models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, def qwen_vllm_to_hf_output( - vllm_output: RunnerOutput, - model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: + vllm_output: RunnerOutput, model: str +) -> tuple[list[int], str, Optional[SampleLogprobs]]: """Sanitize vllm output [qwen models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output( def qwen2_vllm_to_hf_output( - vllm_output: RunnerOutput, - model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: + vllm_output: RunnerOutput, model: str +) -> tuple[list[int], str, Optional[SampleLogprobs]]: """Sanitize vllm output [qwen2 models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output( def kimiv_vl_vllm_to_hf_output( - vllm_output: RunnerOutput, - model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: + vllm_output: RunnerOutput, model: str +) -> tuple[list[int], str, Optional[SampleLogprobs]]: """Sanitize vllm output [kimi_vl models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs -def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput, - model: str) -> RunnerOutput: +def llava_image_vllm_to_hf_output( + vllm_output: RunnerOutput, model: str +) -> RunnerOutput: config = AutoConfig.from_pretrained(model) mm_token_id = config.image_token_index return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id) def llava_video_vllm_to_hf_output( - vllm_output: RunnerOutput, - model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: + vllm_output: RunnerOutput, model: str +) -> tuple[list[int], str, Optional[SampleLogprobs]]: config = AutoConfig.from_pretrained(model) mm_token_id = config.video_token_index return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id) -def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str, - mm_token_id: int) -> RunnerOutput: +def _llava_vllm_to_hf_output( + vllm_output: RunnerOutput, model: str, mm_token_id: int +) -> RunnerOutput: """Sanitize vllm output [Llava models] to be comparable with hf output.""" output_ids, 
output_str, out_logprobs = vllm_output @@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str, eos_token_id = tokenizer.eos_token_id hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) + token_id + for idx, token_id in enumerate(output_ids) if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id ] @@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict: return config.to_dict() -def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput, - model: str) -> RunnerOutput: +def llava_onevision_vllm_to_hf_output( + vllm_output: RunnerOutput, model: str +) -> RunnerOutput: """Sanitize vllm output [llava-onevision] to compare with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput, eos_token_id = tokenizer.eos_token_id hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) + token_id + for idx, token_id in enumerate(output_ids) if token_id != video_token_id or output_ids[idx - 1] != video_token_id ] @@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput, return hf_output_ids, hf_output_str, out_logprobs -def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, - model: str) -> RunnerOutput: +def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: """Sanitize vllm output [mantis] to compare with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, return output_ids, hf_output_str, out_logprobs -def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, - model: str) -> RunnerOutput: +def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: """Sanitize vllm output [phi3v] to be comparable with hf output.""" _, output_str, out_logprobs = vllm_output @@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, return hf_output_ids, hf_output_str, out_logprobs -def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, - model: str) -> RunnerOutput: +def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: """Sanitize vllm output to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, eos_token_id = tokenizer.eos_token_id hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) + token_id + for idx, token_id in enumerate(output_ids) if token_id != image_token_id or output_ids[idx - 1] != image_token_id ] @@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, ####### Post-processors for HF outputs -def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, - model: str) -> RunnerOutput: +def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output if output_str.endswith("<|end▁of▁sentence|>"): output_str = output_str.split("<|end▁of▁sentence|>")[0] return output_ids, output_str, out_logprobs -def idefics3_trunc_hf_output(hf_output: RunnerOutput, - model: str) -> RunnerOutput: +def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output if output_str.endswith(""): output_str = output_str.split("")[0] return output_ids, output_str, out_logprobs -def smolvlm_trunc_hf_output(hf_output: RunnerOutput, - model: str) 
-> RunnerOutput: +def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: # Based on Idefics3 return idefics3_trunc_hf_output(hf_output, model) -def minicpmv_trunc_hf_output(hf_output: RunnerOutput, - model: str) -> RunnerOutput: +def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output if output_str.endswith("<|eot_id|>"): output_str = output_str.split("<|eot_id|>")[0] return output_ids, output_str, out_logprobs -def minimax_vl_01_hf_output(hf_output: RunnerOutput, - model: str) -> RunnerOutput: +def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output if output_str.endswith(""): output_str = output_str.split("")[0] return output_ids, output_str, out_logprobs -def ultravox_trunc_hf_output(hf_output: RunnerOutput, - model: str) -> RunnerOutput: +def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output tokenizer = AutoTokenizer.from_pretrained(model) @@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets): ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( - tmp_path: PosixPath, prompt: str, - assets: Union[list[ImageAsset], ImageTestAssets]) -> str: + tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets] +) -> str: """Given a temporary dir path, export one or more image assets into the tempdir & replace its contents with the local path to the string so that the HF version of Qwen-VL can resolve the path and load the image in its @@ -313,8 +314,9 @@ def processor(*args, text="", images=None, **kwargs): return BatchFeature(data=inputs, tensor_type="pt") hf_model.processor = processor - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language.model.embed_tokens + hf_model.model.get_output_embeddings = ( + lambda: hf_model.model.language.model.embed_tokens + ) return hf_model @@ -358,11 +360,10 @@ def processor(*args, text="", images=None, **kwargs): assert len(contents) == len(images) return hf_processor.apply_chat_template( - [{ - "role": "user", - "image": image, - "content": content - } for image, content in zip(images, contents)], + [ + {"role": "user", "image": image, "content": content} + for image, content in zip(images, contents) + ], add_generation_prompt=True, tokenize=True, return_dict=True, @@ -370,8 +371,9 @@ def processor(*args, text="", images=None, **kwargs): ) hf_model.processor = processor - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.transformer.output_layer + hf_model.model.get_output_embeddings = ( + lambda: hf_model.model.transformer.output_layer + ) return hf_model @@ -388,10 +390,9 @@ def processor(*args, videos=None, **kwargs): else: video_metadata = None - return hf_processor(*args, - videos=videos, - video_metadata=video_metadata, - **kwargs) + return hf_processor( + *args, videos=videos, video_metadata=video_metadata, **kwargs + ) hf_model.processor = processor return hf_model @@ -407,8 +408,9 @@ def __init__(self, hf_runner: HfRunner): self.num_image_token = hf_runner.model.num_image_token self.tokenizer = hf_runner.tokenizer - self.config = AutoConfig.from_pretrained(hf_runner.model_name, - trust_remote_code=True) + self.config = AutoConfig.from_pretrained( + hf_runner.model_name, trust_remote_code=True + ) self.vision_config = self.config.vision_config self.use_thumbnail = 
self.config.use_thumbnail self.use_msac = self.config.use_msac @@ -416,11 +418,14 @@ def __init__(self, hf_runner: HfRunner): self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, list[Image]], - **kwargs): + def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): # yapf: disable from vllm.model_executor.models.h2ovl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl) + IMG_CONTEXT, + IMG_END, + IMG_START, + image_to_pixel_values_h2ovl, + ) # yapf: enable images = [images] if isinstance(images, Image) else images @@ -432,29 +437,26 @@ def __call__(self, text: str, images: Union[Image, list[Image]], max_num=self.max_num, use_thumbnail=self.use_thumbnail, use_msac=self.use_msac, - ) for image in images - ] - num_patches_list = [ - pixel_value.shape[0] for pixel_value in pixel_values + ) + for image in images ] + num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values] pixel_values = torch.cat(pixel_values, dim=0) for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token \ - * num_patches + context_tokens = IMG_CONTEXT * self.num_image_token * num_patches image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('', image_tokens, 1) + text = text.replace("", image_tokens, 1) prompt = self.tokenizer(text, return_tensors="pt") prompt.update({"pixel_values": pixel_values}) return prompt - img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( - "") + img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("") hf_model.model.img_context_token_id = img_context_token_id hf_model.processor = H2OVLProcessor(hf_model) - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.get_output_embeddings() - hf_model.model.generate = types.MethodType(_internvl_generate, - hf_model.model) + hf_model.model.get_output_embeddings = ( + lambda: hf_model.model.language_model.get_output_embeddings() + ) + hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model) return hf_model @@ -468,19 +470,23 @@ def __init__(self, hf_runner: HfRunner): self.num_image_token = hf_runner.model.num_image_token self.tokenizer = hf_runner.tokenizer - self.config = AutoConfig.from_pretrained(hf_runner.model_name, - trust_remote_code=True) + self.config = AutoConfig.from_pretrained( + hf_runner.model_name, trust_remote_code=True + ) self.vision_config = self.config.vision_config self.use_thumbnail = self.config.use_thumbnail self.min_num = self.config.min_dynamic_patch self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, list[Image]], - **kwargs): + def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): from vllm.model_executor.models.skyworkr1v import ( - IMG_CONTEXT, IMG_END, IMG_START, - image_to_pixel_values_skyworkr1v) + IMG_CONTEXT, + IMG_END, + IMG_START, + image_to_pixel_values_skyworkr1v, + ) + images = [images] if isinstance(images, Image) else images pixel_values = [ image_to_pixel_values_skyworkr1v( @@ -489,29 +495,26 @@ def __call__(self, text: str, images: Union[Image, list[Image]], min_num=self.min_num, max_num=self.max_num, use_thumbnail=self.use_thumbnail, - ) for image in images - ] - num_patches_list = [ - pixel_value.shape[0] for pixel_value in pixel_values + ) + for image in images ] + num_patches_list = [pixel_value.shape[0] for pixel_value in 
pixel_values] pixel_values = torch.cat(pixel_values, dim=0) for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token \ - * num_patches + context_tokens = IMG_CONTEXT * self.num_image_token * num_patches image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('', image_tokens, 1) + text = text.replace("", image_tokens, 1) prompt = self.tokenizer(text, return_tensors="pt") prompt.update({"pixel_values": pixel_values}) return prompt - img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( - "") + img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("") hf_model.model.img_context_token_id = img_context_token_id hf_model.processor = SkyworkR1VProcessor(hf_model) - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.get_output_embeddings() - hf_model.model.generate = types.MethodType(_internvl_generate, - hf_model.model) + hf_model.model.get_output_embeddings = ( + lambda: hf_model.model.language_model.get_output_embeddings() + ) + hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model) return hf_model @@ -525,8 +528,9 @@ def __init__(self, hf_runner: HfRunner): self.num_image_token = hf_runner.model.num_image_token self.tokenizer = hf_runner.tokenizer - self.config = AutoConfig.from_pretrained(hf_runner.model_name, - trust_remote_code=True) + self.config = AutoConfig.from_pretrained( + hf_runner.model_name, trust_remote_code=True + ) self.vision_config = self.config.vision_config self.use_thumbnail = self.config.use_thumbnail self.min_num = self.config.min_dynamic_patch @@ -541,8 +545,13 @@ def __call__( **kwargs, ): from vllm.model_executor.models.internvl import ( - IMG_CONTEXT, IMG_END, IMG_START, - image_to_pixel_values_internvl, video_to_pixel_values_internvl) + IMG_CONTEXT, + IMG_END, + IMG_START, + image_to_pixel_values_internvl, + video_to_pixel_values_internvl, + ) + images = [images] if isinstance(images, Image) else images videos = [videos] if isinstance(videos, np.ndarray) else videos if images is not None: @@ -553,7 +562,8 @@ def __call__( min_num=self.min_num, max_num=self.max_num, use_thumbnail=self.use_thumbnail, - ) for image in images + ) + for image in images ] num_patches_images = [ pixel_value.shape[0] for pixel_value in pixel_values_images @@ -569,7 +579,8 @@ def __call__( min_num=1, max_num=1, use_thumbnail=False, - ) for video in videos + ) + for video in videos ] num_patches_videos = [ pixel_value.shape[0] for pixel_value in pixel_values_videos @@ -581,38 +592,37 @@ def __call__( while ("" in text) or ("