diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py
index f94f3457c377..510858c2d7ef 100644
--- a/tests/models/quantization/test_nvfp4.py
+++ b/tests/models/quantization/test_nvfp4.py
@@ -41,8 +41,8 @@
     reason=
     "Prevent unstable test based on golden strings from breaking the build "
     " and test input model being too large and hanging the system.")
-@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
-                    reason="nvfp4 is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
+                    reason="modelopt_fp4 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
     model = LLM(
@@ -50,7 +50,7 @@ def test_models(example_prompts, model_name) -> None:
         max_model_len=MAX_MODEL_LEN,
         trust_remote_code=True,
         enforce_eager=True,
-        quantization="nvfp4",
+        quantization="modelopt_fp4",
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
diff --git a/vllm/config.py b/vllm/config.py
index a185a75c6bf3..5382e9a16829 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -824,7 +824,7 @@ def _verify_quantization(self) -> None:
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
             "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
-            "quark", "nvfp4", "bitblas", "gptq_bitblas"
+            "quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
             self.quantization = cast(QuantizationMethods,
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index a713b1e93c2d..15d6b616a80e 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -14,7 +14,7 @@
     "ptpc_fp8",
     "fbgemm_fp8",
     "modelopt",
-    "nvfp4",
+    "modelopt_fp4",
     "marlin",
     "bitblas",
     "gguf",
@@ -118,7 +118,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "fp8": Fp8Config,
         "fbgemm_fp8": FBGEMMFp8Config,
         "modelopt": ModelOptFp8Config,
-        "nvfp4": ModelOptNvFp4Config,
+        "modelopt_fp4": ModelOptNvFp4Config,
         "marlin": MarlinConfig,
         "bitblas": BitBLASConfig,
         "gguf": GGUFConfig,
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 13957a96deca..b108b02a43e2 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -192,7 +192,7 @@ def __init__(
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
-        return "nvfp4"
+        return "modelopt_fp4"
 
     @classmethod
     def get_supported_act_dtypes(cls) -> list[torch.dtype]:
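
For reference, a minimal usage sketch of the renamed quantization method from the caller's side, assuming the public vllm.LLM API used in the test above; the checkpoint path is a placeholder, not something introduced by this patch:

    # After this change, NVFP4 checkpoints produced by NVIDIA ModelOpt are
    # requested with quantization="modelopt_fp4" instead of "nvfp4".
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="path/to/modelopt-nvfp4-checkpoint",  # placeholder checkpoint path
        quantization="modelopt_fp4",
        enforce_eager=True,
    )
    outputs = llm.generate(["Hello"], SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)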