160 changes: 160 additions & 0 deletions tests/kernels/moe/modular_kernel_tools/cli_args.py
@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
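
"""CLI argument handling for the modular_kernel_tools MoE tests: build the
argparse parser, validate the arguments and convert them into a Config."""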

import argparse

import torch

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig

from .common import Config
from .mk_objects import (MK_ALL_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES,
MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES)


def make_config_arg_parser(description: str):

    def to_pf_class_type(s: str) -> type[mk.FusedMoEPrepareAndFinalize]:
for pf in MK_ALL_PREPARE_FINALIZE_TYPES:
if pf.__name__ == s:
return pf
raise ValueError(
f"Cannot find a PrepareFinalize type that matches {s}")

    def to_experts_class_type(
            s: str) -> type[mk.FusedMoEPermuteExpertsUnpermute]:
for fe in MK_FUSED_EXPERT_TYPES:
if fe.__name__ == s:
return fe
raise ValueError(f"Cannot find a FusedExperts type that matches {s}")

def to_quant_torch_dtype(s: str) -> torch.dtype:
if s == "torch.float8_e4m3fn":
return torch.float8_e4m3fn
raise ValueError(f"Unsupported quant type {s}")

parser = argparse.ArgumentParser(description=description)

parser.add_argument(
"--world-size",
type=int,
default=2,
help="Number of ranks that participate in all2all",
)
parser.add_argument(
"--pf-type",
type=to_pf_class_type,
required=True,
help=("Choose a PrepareFinalize Type : "
f"{[x.__name__ for x in MK_ALL_PREPARE_FINALIZE_TYPES]}"),
)
parser.add_argument(
"--experts-type",
type=to_experts_class_type,
required=True,
help=(f"Choose a FusedExpert type : "
f"{[x.__name__ for x in MK_FUSED_EXPERT_TYPES]}"),
)
parser.add_argument(
"-m",
nargs="+",
type=int,
default=[64],
help="num tokens per rank",
)
parser.add_argument(
"-k",
type=int,
default=7168,
help="hidden-size",
)
parser.add_argument(
"-n",
type=int,
default=1024,
help="N dimension of the first fused-moe matmul",
)
parser.add_argument("--num-experts",
type=int,
default=32,
help="Global num experts")
parser.add_argument("--topk",
nargs="+",
type=int,
default=[4, 1],
help="num topk")
parser.add_argument(
"--fused-moe-chunk-size",
nargs="+",
type=int,
help="Fused moe chunk size used for the non-batched fused experts impl."
)

# Quant args
parser.add_argument("--quant-dtype",
type=to_quant_torch_dtype,
help="Quant datatype")
parser.add_argument("--per-token-quantized-activations",
action='store_true',
help=("The input activations must be per-token "
"quantized"))
parser.add_argument("--per-channel-quantized-weights",
action="store_true",
help="The weights must be per-channel quantized.")
parser.add_argument("--block-shape",
nargs="+",
type=int,
help="Quantization block shape")

# Torch trace profile generation args
parser.add_argument("--torch-trace-dir-path",
type=str,
default=None,
help="Get torch trace for single execution")

return parser


def _validate_args(args: argparse.Namespace):

if args.quant_dtype is not None:
assert args.quant_dtype == torch.float8_e4m3fn
if args.block_shape is not None:
assert len(args.block_shape) == 2, (
f"block shape must have 2 elements. got {args.block_shape}")

    if args.pf_type in MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES:
assert args.world_size == 1, (
"Single GPU objects need world size set to 1")

if args.torch_trace_dir_path is not None:
from pathlib import Path
assert Path(args.torch_trace_dir_path).is_dir(), (
f"Please create {args.torch_trace_dir_path}")


def make_config(args: argparse.Namespace) -> Config:

_validate_args(args)

quant_config = None
if args.quant_dtype is not None:
quant_config = FusedMoEQuantConfig(
quant_dtype=args.quant_dtype,
per_act_token_quant=args.per_token_quantized_activations,
per_out_ch_quant=args.per_channel_quantized_weights,
block_shape=args.block_shape)

return Config(
Ms=args.m,
K=args.k,
N=args.n,
E=args.num_experts,
topks=args.topk,
        dtype=torch.bfloat16,  # hard-coded activation dtype
quant_config=quant_config,
prepare_finalize_type=args.pf_type,
fused_experts_type=args.experts_type,
fused_moe_chunk_size=args.fused_moe_chunk_size,
world_size=args.world_size,
torch_trace_dir_path=args.torch_trace_dir_path)
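
A minimal usage sketch (not part of the diff above). It assumes the repository root is on sys.path and that class names such as MoEPrepareAndFinalizeNoEP and TritonExperts appear in MK_ALL_PREPARE_FINALIZE_TYPES / MK_FUSED_EXPERT_TYPES; substitute whatever names --help actually lists:

# Hypothetical driver; the --pf-type/--experts-type values below are assumptions.
from tests.kernels.moe.modular_kernel_tools.cli_args import (
    make_config_arg_parser, make_config)

parser = make_config_arg_parser("modular kernel MoE test")
args = parser.parse_args([
    "--pf-type", "MoEPrepareAndFinalizeNoEP",  # must name a known PrepareFinalize class
    "--experts-type", "TritonExperts",         # must name a known FusedExperts class
    "--world-size", "1",
    "-m", "64", "-k", "7168", "-n", "1024",
    "--num-experts", "32",
    "--topk", "4",
])
config = make_config(args)  # validated Config (see common.py)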