From cb0f02068adda1588a5af3914285b357528fbff6 Mon Sep 17 00:00:00 2001
From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
Date: Wed, 13 Aug 2025 12:03:02 -0700
Subject: [PATCH] [test] Unwaive Gemma3 tests

Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 5 ++++-
 tests/integration/defs/test_e2e.py                      | 2 ++
 tests/integration/test_lists/waives.txt                 | 6 +-----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 2ca55c1dd17..4afa44556b0 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -798,13 +798,16 @@ def test_auto_dtype(self):
         kv_cache_config = KvCacheConfig(
             enable_block_reuse=False,
             enable_partial_reuse=False,
+            free_gpu_memory_fraction=0.5,
         )
         # We use FlashInfer as the attention backend for Gemma3 VLM to support custom mask for images.
         # So, testing with it here.
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
                  attn_backend="FLASHINFER",
-                 cuda_graph_config=None) as llm:
+                 cuda_graph_config=None,
+                 max_batch_size=128,
+                 max_seq_len=4096) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 0123679d477..a46bfd4c912 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -2280,6 +2280,8 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         cmd.append("--image_format=pil")
         cmd.append("--attention_backend=FLASHINFER")
         cmd.append("--disable_kv_cache_reuse")
+        cmd.append("--kv_cache_fraction=0.5")
+        cmd.append("--max_seq_len=1024")
 
     output = llm_venv.run_cmd(cmd, caller=check_output)
 
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index aea1a260946..5b2f667f0de 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -211,7 +211,6 @@ perf/test_perf.py::test_perf[bert_large-bench-float16-maxbs:32-input_len:128+512
 perf/test_perf.py::test_perf[roberta_base-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
 stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/5328495)
 full:B200/examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8] SKIP (https://nvbugs/5292737)
-full:B200/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5295470)
 examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-summarization_long] SKIP (https://nvbugs/5324976)
 examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5333849)
 examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
@@ -232,7 +231,7 @@ accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoder] SKIP (https://nvbugs/5355128)
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (https://nvbugs/5355128)
 stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5375646)
-accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620)
+full:L40S/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8] SKIP (https://nvbugs/5380570)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1] SKIP (https://nvbugs/5380570)
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992)
@@ -240,9 +239,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-floa
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424)
 triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5396437)
 triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5396437)
-accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype SKIP (https://nvbugs/5401114)
-test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True] SKIP (https://nvbugs/5401114)
-test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-False] SKIP (https://nvbugs/5401114)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
 test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5409416)