diff --git a/tests/integration/test_lists/qa/llm_function_l20.txt b/tests/integration/test_lists/qa/llm_function_l20.txt
index a6dcb651b95..3a84d6edbc1 100644
--- a/tests/integration/test_lists/qa/llm_function_l20.txt
+++ b/tests/integration/test_lists/qa/llm_function_l20.txt
@@ -19,7 +19,6 @@ accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 5ded8577e83..fa6d20759fe 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -346,3 +346,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5485102)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5485109)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5485116)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] SKIP (https://nvbugs/5444687)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] SKIP (https://nvbugs/5444687)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5488580)
diff --git a/tests/unittest/llmapi/test_executor.py b/tests/unittest/llmapi/test_executor.py
index cf34aa66f81..2d9509c78aa 100644
--- a/tests/unittest/llmapi/test_executor.py
+++ b/tests/unittest/llmapi/test_executor.py
@@ -78,6 +78,7 @@ def llama_7b_tp2_path(engine_path: Path) -> Path:
     return path
 
 
+@pytest.mark.skip(reason="https://nvbugs/5488280")
 @pytest.mark.skipif(WORLD_SIZE != 1, reason="Must run on single MPI rank")
 def test_generation_bs2(llama_7b_bs2_path: Path):
     tokenizer = TransformersTokenizer.from_pretrained(llama_7b_bs2_path)
@@ -99,6 +100,7 @@ def test_generation_bs2(llama_7b_bs2_path: Path):
                        'E F G H I K L M')
 
 
+@pytest.mark.skip(reason="https://nvbugs/5488280")
 @pytest.mark.skipif(WORLD_SIZE != 1, reason="Must run on single MPI rank")
 def test_sync_generation(llama_7b_path: Path):
     tokenizer = TransformersTokenizer.from_pretrained(llama_7b_path)
diff --git a/tests/unittest/trt/model_api/test_model_level_api.py b/tests/unittest/trt/model_api/test_model_level_api.py
index 4ea753ea1fb..6eb736407d9 100644
--- a/tests/unittest/trt/model_api/test_model_level_api.py
+++ b/tests/unittest/trt/model_api/test_model_level_api.py
@@ -3,6 +3,7 @@
 import tempfile
 from contextlib import contextmanager
 
+import pytest
 from profile_utils import profile
 from transformers import AutoTokenizer
 from utils.llm_data import llm_models_root
@@ -42,6 +43,7 @@ def workspace(suffix, prefix="./trtllm_workspace"):
 # 233s on ipp1-1197: loading weights 37s, network/engine 27s, save engine: 35s, load engine (14GB) about 100s
 @profile("save-and-load")
 @force_ampere
+@pytest.mark.skip(reason="https://nvbugs/5488280")
 def test_save_load():
     '''When the engine_dir parameter of to_trt and generate is not None
         to_trt() saves the engine to disk.
@@ -102,6 +104,7 @@ def test_high_level_fake_weights():
 
 
 @force_ampere
+@pytest.mark.skip(reason="https://nvbugs/5488280")
 def test_async_io():
     max_batch_size, max_isl, max_osl = 8, 256, 256
     hf_model_dir = str(llm_models_root() / "llama-models/llama-7b-hf")
diff --git a/tests/unittest/trt/model_api/test_model_quantization.py b/tests/unittest/trt/model_api/test_model_quantization.py
index a023fff92ec..7a1404bed36 100644
--- a/tests/unittest/trt/model_api/test_model_quantization.py
+++ b/tests/unittest/trt/model_api/test_model_quantization.py
@@ -1,5 +1,6 @@
 import tempfile
 
+import pytest
 from transformers import AutoTokenizer
 from utils.llm_data import llm_models_root
 from utils.util import force_ampere, skip_no_modelopt, skip_pre_ada
@@ -20,6 +21,7 @@
 ]
 
 
+@pytest.mark.skip(reason="https://nvbugs/5488280")
 @force_ampere
 @skip_no_modelopt
 def test_int4_awq_quantization():
@@ -63,6 +65,7 @@ def test_int4_awq_quantization():
 
 
 # TODO: TRTLLM-185, check the score when the test infra is ready, hard coded value is not stable, cause flaky tests in L0
+@pytest.mark.skip(reason="https://nvbugs/5488280")
 @skip_pre_ada
 @skip_no_modelopt
 def test_fp8_quantization():