diff --git a/vllm/envs.py b/vllm/envs.py
index 0445447dd9df..c40f7e47097c 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -92,6 +92,10 @@
     V_SCALE_CONSTANT: int = 10
     VLLM_SERVER_DEV_MODE: bool = False
     VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
+    VLLM_MLA_DISABLE: bool = False
+    VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
+    VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
+    VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False


 def get_default_cache_root():
@@ -580,6 +584,49 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
     "VLLM_DISABLE_COMPILE_CACHE":
     lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))),
+
+    # If set, vLLM will run in development mode, which enables
+    # some additional endpoints for developing and debugging,
+    # e.g. `/reset_prefix_cache`
+    "VLLM_SERVER_DEV_MODE":
+    lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
+
+    # Controls the maximum number of requests to handle in a
+    # single asyncio task when processing per-token outputs in the
+    # V1 AsyncLLM interface. This applies when handling a high
+    # concurrency of streaming requests.
+    # Setting this too high can result in a higher variance of
+    # inter-message latencies. Setting it too low can negatively impact
+    # TTFT and overall throughput.
+    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")),
+
+    # If set, vLLM will disable the MLA attention optimizations.
+    "VLLM_MLA_DISABLE":
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
+
+    # Flag that controls whether we perform matrix absorption for MLA
+    # decode, i.e. absorb W_UK into W_Q and W_UV into W_O. Absorbing the
+    # matrices reduces the runtime FLOPs needed to compute MLA, but it
+    # requires storing the larger weights W_Q_UK and W_UV_O, so it can
+    # increase memory usage. This is enabled by default.
+    "VLLM_MLA_PERFORM_MATRIX_ABSORPTION":
+    lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1"))),
+
+    # When running MLA with matrix absorption enabled and fp8-quantized
+    # weights, we perform the matrix absorption in float32 precision. After
+    # the matrices are absorbed, we requantize the weights back to fp8. This
+    # flag can be used to disable the requantization step and instead convert
+    # the absorbed matrices to match the activation type. This can lead to
+    # higher memory and compute usage but better preserves model accuracy.
+    "VLLM_MLA_DISABLE_REQUANTIZATION":
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
+
+    # If set, vLLM will use the Triton implementation of moe_align_block_size,
+    # i.e. moe_align_block_size_triton in fused_moe.py.
+    "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
+    ),
 }
 # end-env-vars-definition

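
For reviewers unfamiliar with the absorption described in the `VLLM_MLA_PERFORM_MATRIX_ABSORPTION` comment, here is a minimal sketch of the FLOPs-vs-weight-size trade-off. This is not the PR's implementation; the single-head layout and all shapes and names (`d_model`, `d_c`, `d_q`) are illustrative assumptions:

```python
import torch

# Illustrative MLA decode shapes (hypothetical, not vLLM's actual config):
# hidden dim d_model, latent (compressed KV) dim d_c, per-head query dim d_q.
d_model, d_c, d_q = 1024, 512, 128

W_Q = torch.randn(d_model, d_q)   # projects the hidden state to query space
W_UK = torch.randn(d_c, d_q)      # up-projects cached latents to key space

h = torch.randn(d_model)          # current token's hidden state
c = torch.randn(4096, d_c)        # compressed KV cache (4096 past tokens)

# Unabsorbed: up-project every cached latent to key space at each step.
scores_unabsorbed = (h @ W_Q) @ (c @ W_UK).T

# Absorbed: precompute W_Q_UK = W_Q @ W_UK^T once (done in fp32 for fp8
# checkpoints, per the requantization comment in the diff), then score the
# query directly against the latent cache -- no per-step cache up-projection.
W_Q_UK = W_Q @ W_UK.T             # (d_model, d_c): larger than W_Q alone
scores_absorbed = (h @ W_Q_UK) @ c.T

# Identical up to fp32 rounding: (h W_Q)(c W_UK)^T == h (W_Q W_UK^T) c^T.
print(torch.max(torch.abs(scores_unabsorbed - scores_absorbed)))
```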
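
Usage matches the existing entries: `vllm.envs` resolves these names lazily through the module-level `__getattr__`, so each lambda runs (and the environment is read) on attribute access. A quick sanity check, assuming a vLLM install:

```python
import os

# Set the flag before accessing the attribute; it is read on access.
os.environ["VLLM_MLA_DISABLE"] = "1"

import vllm.envs as envs

# bool(int("1")) -> True; unset or "0" -> False.
assert envs.VLLM_MLA_DISABLE is True
```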