From c49d51277283180151b9b4c959f3726c2c0ac4c5 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:28:03 -0700 Subject: [PATCH 1/8] Add sparsity support in benchmarking ghstack-source-id: c9e900d84457e7dc527ad3ea78648d61b8004b09 ghstack-comment-id: 2752688588 Pull Request resolved: https://github.com/pytorch/ao/pull/1955 --- .../microbenchmarks/benchmark_inference.py | 15 ++++++--- .../microbenchmarks/benchmark_runner.py | 32 +++++++++++++++++-- .../microbenchmarks/test/benchmark_config.yml | 1 + .../test/test_benchmark_inference.py | 1 + benchmarks/microbenchmarks/utils.py | 30 ++++++++++++++--- 5 files changed, 69 insertions(+), 10 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index e96ec9c59e..3b04aebb3b 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -23,7 +23,10 @@ model_inference_time_in_ms, string_to_config, ) +from torchao import quantization +from torchao.core.config import AOBaseConfig from torchao.quantization import quantize_ +from torchao.sparsity.sparse_api import sparsify_ def run(config: BenchmarkConfig) -> BenchmarkResult: @@ -44,11 +47,15 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Use quantize_ to apply each quantization function to the model m_copy = deepcopy(base_model).eval().to(config.device) - quantization_config = string_to_config( - config.quantization, high_precision_dtype=config.high_precision_dtype + aoBaseConfig = string_to_config( + config.quantization, config.sparsity, high_precision_dtype=config.high_precision_dtype ) - if quantization_config is not None: - quantize_(m_copy, quantization_config) + if aoBaseConfig is not None and config.quantization is not None: + quantize_(m_copy, aoBaseConfig) + elif config.sparsity is not None and aoBaseConfig is not None: + sparsify_(m_copy, aoBaseConfig) + else: + pass # No quantization or sparsity specified, do nothing if config.use_torch_compile: print("Compiling model....") m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 69a6ced025..32dd9ca02b 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -30,6 +30,7 @@ generate_results_csv, print_results, ) +from torchao import sparsity def get_shapes_for_config( @@ -67,6 +68,28 @@ def get_param_combinations(model_param): return shapes, base_params +def get_quantization_sparsity_recipes( + quantization_recipes: str, sparsity_recipes: str +) -> List[Tuple[str, str]]: + """Generate valid quantization and sparsity recipes.""" + + config_recipes = [] + for quant_config, sparse_config in product(quantization_recipes, sparsity_recipes): + if sparse_config != "None" and quant_config != "baseline": + if "semi" in sparse_config or "2:4" in sparse_config: + if "marlin" in quant_config or "int8dq" in quant_config or "float8dq" in quant_config: + pass + else: + continue + elif sparse_config == "block": + config_recipes.append(("baseline", sparse_config)) + else: + raise ValueError(f"Invalid sparsity recipe: {sparse_config}") + config_recipes.append((quant_config, sparse_config)) + print('Generated config recipes: ', config_recipes) + return config_recipes + + def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig]: """Load benchmark configurations from CLI arguments and YAML 
file.""" @@ -82,12 +105,17 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig shapes, params = get_param_combinations(model_param) # Create configs for all combinations - for quant_config, (shape_name, shape) in product( - config.get("quantization_config_recipe_names", ["baseline"]), shapes + for (quant_config, sparse_config), (shape_name, shape) in product( + get_quantization_sparsity_recipes( + config.get("quantization_config_recipe_names", ["baseline"]), + config.get("sparsity_config_recipe_names", ["None"]), + ), + shapes ): configs.append( BenchmarkConfig( quantization=quant_config, + sparsity=sparse_config, params=params, shape_name=shape_name, shape=shape, diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index ee2de56ed9..cfc3d41e7e 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -4,6 +4,7 @@ quantization_config_recipe_names: - "baseline" - "int4wo-32" - "int4wo-128" +sparsity_config_recipe_name: "semi-sparse" output_dir: "benchmarks/microbenchmarks/results" model_params: - name: "small_bf16_linear" diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index b67ed6a0cc..bc7bd7dbbf 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -17,6 +17,7 @@ def setUp(self): self.config = BenchmarkConfig( quantization="baseline", + sparsity="semi-sparse", params={ "high_precision_dtype": "torch.float32", "use_torch_compile": False, diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 34d4443255..a007e7a364 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -5,7 +5,8 @@ # LICENSE file in the root directory of this source tree. import csv import os -from typing import Any, Dict, List +from pickle import NONE +from typing import Any, Dict, List, Optional import torch from tabulate import tabulate @@ -24,7 +25,9 @@ PerRow, PerTensor, UIntXWeightOnlyConfig, + Float8DynamicActivationFloat8SemiSparseWeightConfig, ) +from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig try: import triton # noqa: F401 @@ -51,7 +54,8 @@ def get_default_device(device: str = "cuda") -> str: class BenchmarkConfig: def __init__( self, - quantization: str, # Quantization string format is similar to the format being used for llama/generate.py + quantization: Optional[str], # Quantization string format is similar to the format being used for llama/generate.py + sparsity: Optional[str], # Specify the type of sparsity to be used params: Dict[str, Any], shape_name: str, shape: List[int], @@ -60,6 +64,7 @@ def __init__( ): self.benchmark_mode = benchmark_mode self.quantization = quantization + self.sparsity = sparsity self.m, self.k, self.n = shape self.shape_name = shape_name self.high_precision_dtype = self._parse_precision( @@ -142,16 +147,24 @@ def forward(self, x): return x -def string_to_config(quantization: str, **kwargs) -> AOBaseConfig: +def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwargs) -> AOBaseConfig: """Get quantization config based on quantization string. Args: quantization (str): Quantization method to be used. The quantiation string format is similar to the format being used for llama/generate.py. 
+ sparsity (str): Sparsity method to be used. The sparsity string format is similar to the format being used for llama/generate.py. **kwargs: Additional arguments to be passed to the quantization method Returns: AOBaseConfig: Quantization configuration object """ + if quantization is None and sparsity is not None: + if "semi" in sparsity or "2:4" in sparsity: + return SemiSparseWeightConfig() + if sparsity == "block": + return BlockSparseWeightConfig() + if quantization is None and sparsity is None: + return None high_precision_dtype = kwargs.get("high_precision_dtype", torch.bfloat16) if "int4wo" in quantization and not HAS_TRITON: print("Warning: Triton not available, falling back to baseline") @@ -163,7 +176,9 @@ def string_to_config(quantization: str, **kwargs) -> AOBaseConfig: if "int8wo" in quantization: return Int8WeightOnlyConfig() if "int8dq" in quantization: - if "int8dq_prefill_wo_decode" in quantization: + if sparsity is not None and ("semi" in sparsity or "2:4" in sparsity): + return Int8DynamicActivationInt8WeightConfig(layout=SemiSparseLayout()) + elif "int8dq_prefill_wo_decode" in quantization: return Int8DynamicActivationInt8WeightConfig(weight_only_decode=True) else: return Int8DynamicActivationInt8WeightConfig() @@ -198,6 +213,9 @@ def string_to_config(quantization: str, **kwargs) -> AOBaseConfig: act_mapping_type=MappingType.SYMMETRIC, layout=MarlinQQQLayout(), ) + elif sparsity is not None and ("semi" in sparsity or "2:4" in sparsity): + from torchao.dtypes import MarlinSparseLayout + return Int4WeightOnlyConfig(layout=MarlinSparseLayout()) if "fp6" in quantization: return FPXWeightOnlyConfig(3, 2) elif "uintx" in quantization: @@ -246,6 +264,8 @@ def string_to_config(quantization: str, **kwargs) -> AOBaseConfig: elif "float8wo" in quantization: return Float8WeightOnlyConfig() elif "float8dq" in quantization: + if sparsity and "semi" in sparsity: + return Float8DynamicActivationFloat8SemiSparseWeightConfig() granularity = str(quantization.split("-")[-1]) if granularity == "tensor": granularity = PerTensor() @@ -369,6 +389,7 @@ def print_results(results: List[BenchmarkResult]): # Extract relevant columns for display display_columns = [ "quantization", + "sparsity", "model_type", "m", "k", @@ -380,6 +401,7 @@ def print_results(results: List[BenchmarkResult]): # Format data for tabulate headers = { "quantization": "Quantization", + "sparsity": "Sparsity", "model_type": "Model Type", "m": "M", "k": "K", From 407ea1d0720fc42916b11d12b73201fcef2d4ae2 Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Tue, 25 Mar 2025 15:28:06 -0700 Subject: [PATCH 2/8] Sparsity tests ghstack-source-id: ac08cd26ac7d672bbca67c628e305c9771536944 ghstack-comment-id: 2752688653 Pull Request resolved: https://github.com/pytorch/ao/pull/1956 --- .../microbenchmarks/benchmark_inference.py | 8 +-- .../microbenchmarks/benchmark_runner.py | 17 ++--- .../microbenchmarks/test/benchmark_config.yml | 7 ++- .../test/test_benchmark_inference.py | 40 ++++++++++++ .../test/test_benchmark_runner.py | 63 +++++++++++++++++++ benchmarks/microbenchmarks/test/test_utils.py | 43 ++++++++++++- benchmarks/microbenchmarks/utils.py | 17 +++-- 7 files changed, 174 insertions(+), 21 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 3b04aebb3b..56fd314f36 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -23,8 +23,6 @@ model_inference_time_in_ms, string_to_config, ) -from 
torchao import quantization -from torchao.core.config import AOBaseConfig from torchao.quantization import quantize_ from torchao.sparsity.sparse_api import sparsify_ @@ -48,14 +46,16 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Use quantize_ to apply each quantization function to the model m_copy = deepcopy(base_model).eval().to(config.device) aoBaseConfig = string_to_config( - config.quantization, config.sparsity, high_precision_dtype=config.high_precision_dtype + config.quantization, + config.sparsity, + high_precision_dtype=config.high_precision_dtype, ) if aoBaseConfig is not None and config.quantization is not None: quantize_(m_copy, aoBaseConfig) elif config.sparsity is not None and aoBaseConfig is not None: sparsify_(m_copy, aoBaseConfig) else: - pass # No quantization or sparsity specified, do nothing + pass # No quantization or sparsity specified, do nothing if config.use_torch_compile: print("Compiling model....") m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 32dd9ca02b..7ba6d6b6dc 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -30,7 +30,6 @@ generate_results_csv, print_results, ) -from torchao import sparsity def get_shapes_for_config( @@ -68,6 +67,7 @@ def get_param_combinations(model_param): return shapes, base_params + def get_quantization_sparsity_recipes( quantization_recipes: str, sparsity_recipes: str ) -> List[Tuple[str, str]]: @@ -75,9 +75,14 @@ def get_quantization_sparsity_recipes( config_recipes = [] for quant_config, sparse_config in product(quantization_recipes, sparsity_recipes): - if sparse_config != "None" and quant_config != "baseline": + if sparse_config != "None": if "semi" in sparse_config or "2:4" in sparse_config: - if "marlin" in quant_config or "int8dq" in quant_config or "float8dq" in quant_config: + if ( + "marlin" in quant_config + or "int8dq" in quant_config + or "float8dq" in quant_config + or quant_config == "baseline" + ): pass else: continue @@ -86,10 +91,8 @@ def get_quantization_sparsity_recipes( else: raise ValueError(f"Invalid sparsity recipe: {sparse_config}") config_recipes.append((quant_config, sparse_config)) - print('Generated config recipes: ', config_recipes) return config_recipes - def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig]: """Load benchmark configurations from CLI arguments and YAML file.""" @@ -109,8 +112,8 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig get_quantization_sparsity_recipes( config.get("quantization_config_recipe_names", ["baseline"]), config.get("sparsity_config_recipe_names", ["None"]), - ), - shapes + ), + shapes, ): configs.append( BenchmarkConfig( diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index cfc3d41e7e..074dfeabf4 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -3,8 +3,11 @@ benchmark_mode: "inference" quantization_config_recipe_names: - "baseline" - "int4wo-32" - - "int4wo-128" -sparsity_config_recipe_name: "semi-sparse" + - "marlin" +sparsity_config_recipe_names: + - "None" + - "semi-sparse" + - "block" output_dir: "benchmarks/microbenchmarks/results" model_params: - name: "small_bf16_linear" diff --git 
a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index bc7bd7dbbf..55052acdaa 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -41,6 +41,46 @@ def test_run_inference(self): self.assertIsInstance(result, BenchmarkResult) self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + def test_run_inference_with_sparsity(self): + """Test running inference with sparsity configurations""" + # Test with semi-sparse config + config = BenchmarkConfig( + quantization="marlin", + sparsity="semi-sparse", + params={ + "high_precision_dtype": "torch.float32", + "use_torch_compile": False, + "device": "cpu", + "model_type": "linear", + }, + shape_name="custom", + shape=[16, 32, 8], + output_dir=self.temp_dir, + benchmark_mode="inference", + ) + result = run(config) + self.assertIsInstance(result, BenchmarkResult) + self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + + # Test with block sparsity + config = BenchmarkConfig( + quantization="baseline", + sparsity="block", + params={ + "high_precision_dtype": "torch.float32", + "use_torch_compile": False, + "device": "cpu", + "model_type": "linear", + }, + shape_name="custom", + shape=[16, 32, 8], + output_dir=self.temp_dir, + benchmark_mode="inference", + ) + result = run(config) + self.assertIsInstance(result, BenchmarkResult) + self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + if __name__ == "__main__": unittest.main() diff --git a/benchmarks/microbenchmarks/test/test_benchmark_runner.py b/benchmarks/microbenchmarks/test/test_benchmark_runner.py index 27e6304513..7e9c6d024e 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_runner.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_runner.py @@ -13,6 +13,7 @@ from benchmarks.microbenchmarks.benchmark_runner import ( get_param_combinations, + get_quantization_sparsity_recipes, get_shapes_for_config, load_benchmark_configs, run_inference_benchmarks_from_config, @@ -88,6 +89,68 @@ def test_run_inference_benchmarks_from_config(self): results_file = Path(self.temp_dir) / "results.csv" self.assertTrue(results_file.exists()) + def test_get_quantization_sparsity_recipes(self): + """Test generation of valid quantization and sparsity recipe combinations""" + # Test basic combinations + quant_recipes = ["baseline", "int8wo"] + sparse_recipes = ["None", "semi-sparse"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + self.assertEqual( + len(recipes), 3 + ) # Should only get baseline+None and int8wo+None + self.assertIn(("baseline", "None"), recipes) + self.assertIn(("int8wo", "None"), recipes) + self.assertIn(("baseline", "semi-sparse"), recipes) + + # Test marlin with semi-sparse + quant_recipes = ["marlin", "baseline"] + sparse_recipes = ["None", "semi-sparse"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + self.assertIn(("marlin", "semi-sparse"), recipes) + self.assertIn(("baseline", "None"), recipes) + + # Test block sparsity + quant_recipes = ["baseline"] + sparse_recipes = ["None", "block"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + self.assertIn(("baseline", "block"), recipes) + + def test_load_benchmark_configs_with_sparsity(self): + """Test loading benchmark configs with sparsity options""" + test_config = { + "benchmark_mode": "inference", + "quantization_config_recipe_names": ["baseline", "marlin"], + 
"sparsity_config_recipe_names": ["None", "semi-sparse"], + "output_dir": self.temp_dir, + "model_params": [ + { + "matrix_shapes": [ + {"name": "custom", "shapes": [[1024, 1024, 1024]]} + ], + "high_precision_dtype": "torch.bfloat16", + "device": "cpu", + "model_type": "linear", + } + ], + } + + config_path = Path(self.temp_dir) / "test_sparsity_config.yml" + with open(config_path, "w") as f: + yaml.dump(test_config, f) + + configs = load_benchmark_configs(argparse.Namespace(config=str(config_path))) + + # Check that we get configs for baseline and marlin with appropriate sparsity + self.assertTrue( + any(c.quantization == "baseline" and c.sparsity == "None" for c in configs) + ) + self.assertTrue( + any( + c.quantization == "marlin" and c.sparsity == "semi-sparse" + for c in configs + ) + ) + if __name__ == "__main__": unittest.main() diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 7864467f9f..ae7e88c9ae 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -13,7 +13,11 @@ from benchmarks.microbenchmarks.utils import ( BenchmarkConfig, BenchmarkResult, + BlockSparseWeightConfig, + Float8DynamicActivationFloat8SemiSparseWeightConfig, + Int4WeightOnlyConfig, LNLinearSigmoid, + SemiSparseWeightConfig, ToyLinearModel, clean_caches, create_model_and_input, @@ -39,6 +43,7 @@ def setUp(self): def test_benchmark_config(self): config = BenchmarkConfig( quantization="baseline", + sparsity="None", params=self.test_params, shape_name="custom", shape=self.test_shape, @@ -60,6 +65,7 @@ def test_benchmark_config(self): def test_benchmark_result(self): config = BenchmarkConfig( quantization="baseline", + sparsity="None", params=self.test_params, shape_name="custom", shape=self.test_shape, @@ -82,17 +88,46 @@ def test_get_default_device(self): def test_string_to_config(self): # Test baseline - config = string_to_config("baseline") + config = string_to_config("baseline", "None") self.assertIsNone(config) # Test int8wo - config = string_to_config("int8wo") + config = string_to_config("int8wo", "None") self.assertIsNotNone(config) # Test invalid config - config = string_to_config("not_a_real_config") + config = string_to_config("not_a_real_config", "None") self.assertIsNone(config) + def test_string_to_config_sparsity(self): + """Test sparsity config generation""" + # Test semi-sparse config + config = string_to_config(None, "semi-sparse") + self.assertIsInstance(config, SemiSparseWeightConfig) + + # Test block sparse config + config = string_to_config(None, "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + + # Test combined sparsity and quantization + config = string_to_config("marlin", "semi-sparse") + self.assertIsInstance(config, Int4WeightOnlyConfig) + + # Test float8 with semi-sparse + config = string_to_config("float8dq", "semi-sparse") + self.assertIsInstance( + config, Float8DynamicActivationFloat8SemiSparseWeightConfig + ) + + def test_invalid_sparsity(self): + """Test invalid sparsity config generation""" + from benchmarks.microbenchmarks.benchmark_runner import ( + get_quantization_sparsity_recipes, + ) + + with self.assertRaises(ValueError): + get_quantization_sparsity_recipes(["baseline"], ["invalid_sparsity"]) + def test_toy_linear_model(self): model = ToyLinearModel(k=64, n=32, dtype=torch.float32) x = torch.randn(16, 64) @@ -139,6 +174,7 @@ def test_generate_results_csv(self): BenchmarkResult( BenchmarkConfig( quantization="int8wo", + sparsity="None", 
params={}, shape_name="custom", shape=[1024, 1024, 1024], @@ -149,6 +185,7 @@ def test_generate_results_csv(self): BenchmarkResult( BenchmarkConfig( quantization="int4wo", + sparsity="None", params={}, shape_name="custom", shape=[1024, 1024, 1024], diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index a007e7a364..076d9aa180 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. import csv import os -from pickle import NONE from typing import Any, Dict, List, Optional import torch @@ -14,6 +13,7 @@ from torchao.core.config import AOBaseConfig from torchao.quantization import ( + Float8DynamicActivationFloat8SemiSparseWeightConfig, Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig, FPXWeightOnlyConfig, @@ -25,7 +25,6 @@ PerRow, PerTensor, UIntXWeightOnlyConfig, - Float8DynamicActivationFloat8SemiSparseWeightConfig, ) from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig @@ -54,7 +53,9 @@ def get_default_device(device: str = "cuda") -> str: class BenchmarkConfig: def __init__( self, - quantization: Optional[str], # Quantization string format is similar to the format being used for llama/generate.py + quantization: Optional[ + str + ], # Quantization string format is similar to the format being used for llama/generate.py sparsity: Optional[str], # Specify the type of sparsity to be used params: Dict[str, Any], shape_name: str, @@ -147,7 +148,9 @@ def forward(self, x): return x -def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwargs) -> AOBaseConfig: +def string_to_config( + quantization: Optional[str], sparsity: Optional[str], **kwargs +) -> AOBaseConfig: """Get quantization config based on quantization string. 
Args: @@ -166,6 +169,7 @@ def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwa if quantization is None and sparsity is None: return None high_precision_dtype = kwargs.get("high_precision_dtype", torch.bfloat16) + if "int4wo" in quantization and not HAS_TRITON: print("Warning: Triton not available, falling back to baseline") return None @@ -177,6 +181,8 @@ def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwa return Int8WeightOnlyConfig() if "int8dq" in quantization: if sparsity is not None and ("semi" in sparsity or "2:4" in sparsity): + from torchao.dtypes import SemiSparseLayout + return Int8DynamicActivationInt8WeightConfig(layout=SemiSparseLayout()) elif "int8dq_prefill_wo_decode" in quantization: return Int8DynamicActivationInt8WeightConfig(weight_only_decode=True) @@ -215,6 +221,7 @@ def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwa ) elif sparsity is not None and ("semi" in sparsity or "2:4" in sparsity): from torchao.dtypes import MarlinSparseLayout + return Int4WeightOnlyConfig(layout=MarlinSparseLayout()) if "fp6" in quantization: return FPXWeightOnlyConfig(3, 2) @@ -265,7 +272,7 @@ def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwa return Float8WeightOnlyConfig() elif "float8dq" in quantization: if sparsity and "semi" in sparsity: - return Float8DynamicActivationFloat8SemiSparseWeightConfig() + return Float8DynamicActivationFloat8SemiSparseWeightConfig() granularity = str(quantization.split("-")[-1]) if granularity == "tensor": granularity = PerTensor() From 65d406503dc6796f39c76690c8d7ec088ab2e4f3 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:28:09 -0700 Subject: [PATCH 3/8] Add sparsify_ to benchmarking ghstack-source-id: 4cf77f85be18d1df2b1827ce11afbd02fccd9188 ghstack-comment-id: 2752688734 Pull Request resolved: https://github.com/pytorch/ao/pull/1957 --- benchmarks/microbenchmarks/benchmark_inference.py | 11 +++++++---- benchmarks/microbenchmarks/results/results.csv | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) create mode 100644 benchmarks/microbenchmarks/results/results.csv diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 56fd314f36..ea91deca12 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -50,12 +50,15 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: config.sparsity, high_precision_dtype=config.high_precision_dtype, ) - if aoBaseConfig is not None and config.quantization is not None: - quantize_(m_copy, aoBaseConfig) - elif config.sparsity is not None and aoBaseConfig is not None: + if config.sparsity != "None" and config.quantization == "baseline": + print(f"Sparsifying model for sparsity: {config.sparsity}") sparsify_(m_copy, aoBaseConfig) - else: + elif config.sparsity == "None" and config.quantization == "baseline": pass # No quantization or sparsity specified, do nothing + else: + print(f"Quantizing model with quantization: {config.quantization}, sparsity: {config.sparsity}") + quantize_(m_copy, aoBaseConfig) + if config.use_torch_compile: print("Compiling model....") m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) diff --git a/benchmarks/microbenchmarks/results/results.csv b/benchmarks/microbenchmarks/results/results.csv new file mode 100644 index 0000000000..d1e2fe188e --- /dev/null +++ 
b/benchmarks/microbenchmarks/results/results.csv @@ -0,0 +1,14 @@ +name,quantization,m,k,n,high_precision_dtype,use_torch_compile,torch_compile_mode,device,model_type,output_dir,model_inference_time_in_ms +small_bf16_linear,baseline,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,47.870889538899064 +small_bf16_linear,int4wo-32,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,92.79820020310581 +small_bf16_linear,int4wo-32,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,92.79379970394075 +small_bf16_linear,marlin,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,2827.8696595225483 +large_bf16_ln_linear,baseline,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,69.42827021703124 +large_bf16_ln_linear,baseline,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,110.21242011338472 +large_bf16_ln_linear,int4wo-32,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,640.171259874478 +large_bf16_ln_linear,int4wo-32,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,1230.195889947936 +large_bf16_ln_linear,int4wo-32,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,638.9500107616186 +large_bf16_ln_linear,int4wo-32,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,1224.7836799360812 +large_bf16_ln_linear,marlin,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,10065.855470020324 +large_bf16_ln_linear,marlin,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,11008.49203998223 +cpu_fp32_linear,baseline,4096,4096,1024,torch.float32,False,,cpu,linear,benchmarks/microbenchmarks/results,369783.37747976184 From cea30b1fbf1ec128d56028c0a118de35ccac36db Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:28:12 -0700 Subject: [PATCH 4/8] Support sparsify_ run ghstack-source-id: 8b459e288351460492c91d476791f3d47c7b627d ghstack-comment-id: 2752688801 Pull Request resolved: https://github.com/pytorch/ao/pull/1958 --- .../microbenchmarks/benchmark_inference.py | 23 +++-- .../microbenchmarks/benchmark_runner.py | 84 +++++++++++++------ .../microbenchmarks/results/results.csv | 14 ---- .../microbenchmarks/test/benchmark_config.yml | 4 +- .../test/test_benchmark_inference.py | 24 +++++- .../test/test_benchmark_runner.py | 65 +++++++++++--- benchmarks/microbenchmarks/test/test_utils.py | 17 ++++ benchmarks/microbenchmarks/utils.py | 12 ++- 8 files changed, 178 insertions(+), 65 deletions(-) delete mode 100644 benchmarks/microbenchmarks/results/results.csv diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index ea91deca12..d84e21f382 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -50,13 +50,26 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: config.sparsity, high_precision_dtype=config.high_precision_dtype, ) - if config.sparsity != "None" and config.quantization == "baseline": - print(f"Sparsifying model for sparsity: {config.sparsity}") - 
sparsify_(m_copy, aoBaseConfig) - elif config.sparsity == "None" and config.quantization == "baseline": + + # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) + is_cuda = config.device == "cuda" and torch.cuda.is_available() + + if config.sparsity is not None and ( + config.quantization is None or "baseline" in config.quantization + ): + if is_cuda: + print(f"Applying {config.sparsity} sparsity to model") + sparsify_(m_copy, aoBaseConfig) + else: + print( + f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + ) + elif config.sparsity is None and ( + config.quantization is None or "baseline" in config.quantization + ): pass # No quantization or sparsity specified, do nothing else: - print(f"Quantizing model with quantization: {config.quantization}, sparsity: {config.sparsity}") + print("Quantizing model....") quantize_(m_copy, aoBaseConfig) if config.use_torch_compile: diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 7ba6d6b6dc..85a8ef2f53 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -21,7 +21,7 @@ import argparse from itertools import product -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple import yaml @@ -69,28 +69,57 @@ def get_param_combinations(model_param): def get_quantization_sparsity_recipes( - quantization_recipes: str, sparsity_recipes: str -) -> List[Tuple[str, str]]: - """Generate valid quantization and sparsity recipes.""" - - config_recipes = [] - for quant_config, sparse_config in product(quantization_recipes, sparsity_recipes): - if sparse_config != "None": - if "semi" in sparse_config or "2:4" in sparse_config: + quantization_recipes: List[str], sparsity_recipes: List[str] +) -> Set[Tuple[str, Optional[str]]]: + """Generate valid quantization and sparsity recipes. 
+ + Args: + quantization_recipes: List of quantization recipes + sparsity_recipes: List of sparsity recipes + + Returns: + Set of tuples containing (quantization_recipe, sparsity_recipe) + For block sparsity, quantization is always "baseline" + All quantization techniques are also run without sparsity + """ + config_recipes = set() + + # Handle edge cases + if sparsity_recipes is None and quantization_recipes is None: + return {("baseline", None)} + if sparsity_recipes is None: + return {(quant, None) for quant in quantization_recipes} + if quantization_recipes is None: + return {("baseline", sparse) for sparse in sparsity_recipes} + + # Always include baseline without sparsity + config_recipes.add(("baseline", None)) + + # Add all quantization techniques without sparsity + for quant_config in quantization_recipes: + config_recipes.add((quant_config, None)) + + # Process combinations of quantization and sparsity + for sparse_config in sparsity_recipes: + if sparse_config is None: + # Skip None sparsity as we've already added all quantization techniques without sparsity + continue + elif "block" in sparse_config: + # For block sparsity, only pair with baseline quantization + config_recipes.add(("baseline", sparse_config)) + elif "semi" in sparse_config or "2:4" in sparse_config: + # For semi-sparse, only pair with compatible quantization methods + for quant_config in quantization_recipes: if ( "marlin" in quant_config or "int8dq" in quant_config or "float8dq" in quant_config or quant_config == "baseline" ): - pass - else: - continue - elif sparse_config == "block": - config_recipes.append(("baseline", sparse_config)) - else: - raise ValueError(f"Invalid sparsity recipe: {sparse_config}") - config_recipes.append((quant_config, sparse_config)) + config_recipes.add((quant_config, sparse_config)) + else: + raise ValueError(f"Invalid sparsity recipe: {sparse_config}") + return config_recipes @@ -104,15 +133,16 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig # Create all possible combinations configs = [] + quantization_sparsity_recipes = get_quantization_sparsity_recipes( + config.get("quantization_config_recipe_names", None), + config.get("sparsity_config_recipe_names", None), + ) for model_param in config["model_params"]: shapes, params = get_param_combinations(model_param) # Create configs for all combinations for (quant_config, sparse_config), (shape_name, shape) in product( - get_quantization_sparsity_recipes( - config.get("quantization_config_recipe_names", ["baseline"]), - config.get("sparsity_config_recipe_names", ["None"]), - ), + quantization_sparsity_recipes, shapes, ): configs.append( @@ -126,7 +156,6 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig benchmark_mode=benchmark_mode, ) ) - return configs @@ -135,14 +164,17 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None from benchmarks.microbenchmarks.benchmark_inference import run as run_inference results = [] - print("Benchmarking Inference ......") + print("----------------- RUNNING BENCHMARKS FOR INFERENCE -----------------------") for config in configs: + print("----------------------------------------") try: - print(f"Running: {config.name}") + print( + f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}" + ) result = run_inference(config) # Pass the config object directly results.append(result) - except Exception as e: - print(f"Error running benchmark {config.name}: {e}") + except 
Exception: + print(f"Error running benchmark {config.name}") continue # Add results to csv diff --git a/benchmarks/microbenchmarks/results/results.csv b/benchmarks/microbenchmarks/results/results.csv deleted file mode 100644 index d1e2fe188e..0000000000 --- a/benchmarks/microbenchmarks/results/results.csv +++ /dev/null @@ -1,14 +0,0 @@ -name,quantization,m,k,n,high_precision_dtype,use_torch_compile,torch_compile_mode,device,model_type,output_dir,model_inference_time_in_ms -small_bf16_linear,baseline,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,47.870889538899064 -small_bf16_linear,int4wo-32,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,92.79820020310581 -small_bf16_linear,int4wo-32,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,92.79379970394075 -small_bf16_linear,marlin,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,2827.8696595225483 -large_bf16_ln_linear,baseline,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,69.42827021703124 -large_bf16_ln_linear,baseline,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,110.21242011338472 -large_bf16_ln_linear,int4wo-32,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,640.171259874478 -large_bf16_ln_linear,int4wo-32,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,1230.195889947936 -large_bf16_ln_linear,int4wo-32,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,638.9500107616186 -large_bf16_ln_linear,int4wo-32,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,1224.7836799360812 -large_bf16_ln_linear,marlin,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,10065.855470020324 -large_bf16_ln_linear,marlin,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,11008.49203998223 -cpu_fp32_linear,baseline,4096,4096,1024,torch.float32,False,,cpu,linear,benchmarks/microbenchmarks/results,369783.37747976184 diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 074dfeabf4..17cd666bfa 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -1,11 +1,11 @@ # Sample configuration for inference benchmarks benchmark_mode: "inference" quantization_config_recipe_names: - - "baseline" + # - "baseline" Will always run a baseline instatance - "int4wo-32" - "marlin" sparsity_config_recipe_names: - - "None" + # - "none" Will always run a without sparsity instance - "semi-sparse" - "block" output_dir: "benchmarks/microbenchmarks/results" diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index 55052acdaa..0216297bc7 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. 
import tempfile import unittest +from unittest.mock import patch from benchmarks.microbenchmarks.benchmark_inference import run from benchmarks.microbenchmarks.utils import BenchmarkConfig, BenchmarkResult @@ -36,14 +37,28 @@ def tearDown(self): shutil.rmtree(self.temp_dir) - def test_run_inference(self): + @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") + def test_run_inference(self, mock_string_to_config): + # Mock string_to_config to return a valid config + from torchao.sparsity.sparse_api import SemiSparseWeightConfig + + mock_string_to_config.return_value = SemiSparseWeightConfig() + result = run(self.config) self.assertIsInstance(result, BenchmarkResult) self.assertTrue(hasattr(result, "model_inference_time_in_ms")) - def test_run_inference_with_sparsity(self): + @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") + def test_run_inference_with_sparsity(self, mock_string_to_config): """Test running inference with sparsity configurations""" + # Mock string_to_config to return valid configs + from torchao.quantization import Int4WeightOnlyConfig + from torchao.sparsity.sparse_api import ( + BlockSparseWeightConfig, + ) + # Test with semi-sparse config + mock_string_to_config.return_value = Int4WeightOnlyConfig() config = BenchmarkConfig( quantization="marlin", sparsity="semi-sparse", @@ -54,7 +69,7 @@ def test_run_inference_with_sparsity(self): "model_type": "linear", }, shape_name="custom", - shape=[16, 32, 8], + shape=[64, 64, 64], # Use dimensions divisible by 64 output_dir=self.temp_dir, benchmark_mode="inference", ) @@ -63,6 +78,7 @@ def test_run_inference_with_sparsity(self): self.assertTrue(hasattr(result, "model_inference_time_in_ms")) # Test with block sparsity + mock_string_to_config.return_value = BlockSparseWeightConfig() config = BenchmarkConfig( quantization="baseline", sparsity="block", @@ -73,7 +89,7 @@ def test_run_inference_with_sparsity(self): "model_type": "linear", }, shape_name="custom", - shape=[16, 32, 8], + shape=[64, 64, 64], # Use dimensions divisible by 64 output_dir=self.temp_dir, benchmark_mode="inference", ) diff --git a/benchmarks/microbenchmarks/test/test_benchmark_runner.py b/benchmarks/microbenchmarks/test/test_benchmark_runner.py index 7e9c6d024e..a8683a1de8 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_runner.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_runner.py @@ -93,34 +93,75 @@ def test_get_quantization_sparsity_recipes(self): """Test generation of valid quantization and sparsity recipe combinations""" # Test basic combinations quant_recipes = ["baseline", "int8wo"] - sparse_recipes = ["None", "semi-sparse"] + sparse_recipes = [None, "semi-sparse"] recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) - self.assertEqual( - len(recipes), 3 - ) # Should only get baseline+None and int8wo+None - self.assertIn(("baseline", "None"), recipes) - self.assertIn(("int8wo", "None"), recipes) + self.assertIn(("baseline", None), recipes) + self.assertIn(("int8wo", None), recipes) self.assertIn(("baseline", "semi-sparse"), recipes) # Test marlin with semi-sparse quant_recipes = ["marlin", "baseline"] - sparse_recipes = ["None", "semi-sparse"] + sparse_recipes = [None, "semi-sparse"] recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) self.assertIn(("marlin", "semi-sparse"), recipes) - self.assertIn(("baseline", "None"), recipes) + self.assertIn(("baseline", None), recipes) # Test block sparsity quant_recipes = ["baseline"] - sparse_recipes = 
["None", "block"] + sparse_recipes = [None, "block"] recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) self.assertIn(("baseline", "block"), recipes) - def test_load_benchmark_configs_with_sparsity(self): + def test_none_string_raises_error(self): + """Test that passing 'None' as a string raises an error""" + quant_recipes = ["baseline"] + sparse_recipes = ["None"] # "None" as a string should raise an error + with self.assertRaises(ValueError): + get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + + def test_block_sparsity_with_quantization(self): + """Test that block sparsity is only paired with baseline quantization""" + quant_recipes = ["baseline", "int8wo", "int4wo", "marlin"] + sparse_recipes = ["block"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + + # Block sparsity should only be paired with baseline + self.assertIn(("baseline", "block"), recipes) + self.assertNotIn(("int8wo", "block"), recipes) + self.assertNotIn(("int4wo", "block"), recipes) + self.assertNotIn(("marlin", "block"), recipes) + + # All quantization techniques should be run without sparsity + self.assertIn(("baseline", None), recipes) + self.assertIn(("int8wo", None), recipes) + self.assertIn(("int4wo", None), recipes) + self.assertIn(("marlin", None), recipes) + + def test_all_quantization_without_sparsity(self): + """Test that all quantization techniques are run without sparsity""" + quant_recipes = ["baseline", "int8wo", "int4wo", "marlin"] + sparse_recipes = [None, "semi-sparse", "block"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + + # All quantization techniques should be run without sparsity + for quant in quant_recipes: + self.assertIn((quant, None), recipes) + + @patch( + "benchmarks.microbenchmarks.benchmark_runner.get_quantization_sparsity_recipes" + ) + def test_load_benchmark_configs_with_sparsity(self, mock_get_recipes): """Test loading benchmark configs with sparsity options""" + # Mock get_quantization_sparsity_recipes to return a valid set of recipes + mock_get_recipes.return_value = {("baseline", None), ("marlin", "semi-sparse")} + test_config = { "benchmark_mode": "inference", "quantization_config_recipe_names": ["baseline", "marlin"], - "sparsity_config_recipe_names": ["None", "semi-sparse"], + "sparsity_config_recipe_names": [ + None, + "semi-sparse", + ], # Use None instead of "None" "output_dir": self.temp_dir, "model_params": [ { @@ -142,7 +183,7 @@ def test_load_benchmark_configs_with_sparsity(self): # Check that we get configs for baseline and marlin with appropriate sparsity self.assertTrue( - any(c.quantization == "baseline" and c.sparsity == "None" for c in configs) + any(c.quantization == "baseline" and c.sparsity is None for c in configs) ) self.assertTrue( any( diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index ae7e88c9ae..83e88c5b11 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -119,6 +119,23 @@ def test_string_to_config_sparsity(self): config, Float8DynamicActivationFloat8SemiSparseWeightConfig ) + def test_block_sparsity_with_baseline_quantization(self): + """Test that block sparsity with baseline quantization returns BlockSparseWeightConfig""" + config = string_to_config("baseline", "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + + def test_block_sparsity_with_non_baseline_quantization(self): + """Test that block sparsity with 
non-baseline quantization still returns BlockSparseWeightConfig""" + # Block sparsity should take precedence over any quantization method + config = string_to_config("int8wo", "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + + config = string_to_config("int4wo", "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + + config = string_to_config("marlin", "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + def test_invalid_sparsity(self): """Test invalid sparsity config generation""" from benchmarks.microbenchmarks.benchmark_runner import ( diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 076d9aa180..fd3db11591 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -95,6 +95,7 @@ def to_dict(self) -> Dict[str, Any]: return { "name": self.name, "quantization": self.quantization, + "sparsity": self.sparsity, "m": self.m, "k": self.k, "n": self.n, @@ -161,11 +162,16 @@ def string_to_config( Returns: AOBaseConfig: Quantization configuration object """ + # Handle block sparsity case - with block sparsity, quantization should always be "none" or "baseline" + if sparsity is not None and sparsity == "block": + return BlockSparseWeightConfig() + + # Handle other sparsity cases if quantization is None and sparsity is not None: if "semi" in sparsity or "2:4" in sparsity: return SemiSparseWeightConfig() - if sparsity == "block": - return BlockSparseWeightConfig() + else: + raise ValueError(f"Unknown sparsity type: {sparsity}") if quantization is None and sparsity is None: return None high_precision_dtype = kwargs.get("high_precision_dtype", torch.bfloat16) @@ -424,6 +430,8 @@ def print_results(results: List[BenchmarkResult]): row = [] for col in display_columns: value = result_dict.get(col, "N/A") + if value is None: + value = "N/A" if col == "model_inference_time_in_ms": value = f"{value:.2f}" if isinstance(value, (int, float)) else value elif col == "use_torch_compile": From ae70fece7bad9ebe3027867d422ffca55216589d Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Tue, 25 Mar 2025 15:28:15 -0700 Subject: [PATCH 5/8] Review updates ghstack-source-id: ded017942498ba330ed1e99cb784ead3a049bb6e ghstack-comment-id: 2752688871 Pull Request resolved: https://github.com/pytorch/ao/pull/1959 --- .../microbenchmarks/benchmark_inference.py | 6 +++--- benchmarks/microbenchmarks/benchmark_runner.py | 12 ++---------- .../microbenchmarks/test/benchmark_config.yml | 4 ++-- .../test/test_benchmark_inference.py | 18 +++++++++++++----- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index d84e21f382..c084d18d3a 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -45,7 +45,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Use quantize_ to apply each quantization function to the model m_copy = deepcopy(base_model).eval().to(config.device) - aoBaseConfig = string_to_config( + ao_base_config = string_to_config( config.quantization, config.sparsity, high_precision_dtype=config.high_precision_dtype, @@ -59,7 +59,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: ): if is_cuda: print(f"Applying {config.sparsity} sparsity to model") - sparsify_(m_copy, aoBaseConfig) + sparsify_(m_copy, ao_base_config) else: print( f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is 
{config.device}" @@ -70,7 +70,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: pass # No quantization or sparsity specified, do nothing else: print("Quantizing model....") - quantize_(m_copy, aoBaseConfig) + quantize_(m_copy, ao_base_config) if config.use_torch_compile: print("Compiling model....") diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 85a8ef2f53..7152542eec 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -84,14 +84,6 @@ def get_quantization_sparsity_recipes( """ config_recipes = set() - # Handle edge cases - if sparsity_recipes is None and quantization_recipes is None: - return {("baseline", None)} - if sparsity_recipes is None: - return {(quant, None) for quant in quantization_recipes} - if quantization_recipes is None: - return {("baseline", sparse) for sparse in sparsity_recipes} - # Always include baseline without sparsity config_recipes.add(("baseline", None)) @@ -134,8 +126,8 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig # Create all possible combinations configs = [] quantization_sparsity_recipes = get_quantization_sparsity_recipes( - config.get("quantization_config_recipe_names", None), - config.get("sparsity_config_recipe_names", None), + config.get("quantization_config_recipe_names", []), + config.get("sparsity_config_recipe_names", []), ) for model_param in config["model_params"]: shapes, params = get_param_combinations(model_param) diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 17cd666bfa..97a38469de 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -1,11 +1,11 @@ # Sample configuration for inference benchmarks benchmark_mode: "inference" quantization_config_recipe_names: - # - "baseline" Will always run a baseline instatance + # Will run a baseline inference for model by default, without quantization for comparison - "int4wo-32" - "marlin" sparsity_config_recipe_names: - # - "none" Will always run a without sparsity instance + # Will run a baseline inference for model by default, without sparsity for comparison - "semi-sparse" - "block" output_dir: "benchmarks/microbenchmarks/results" diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index 0216297bc7..22863dcbcf 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -49,16 +49,16 @@ def test_run_inference(self, mock_string_to_config): self.assertTrue(hasattr(result, "model_inference_time_in_ms")) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") - def test_run_inference_with_sparsity(self, mock_string_to_config): + def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): """Test running inference with sparsity configurations""" # Mock string_to_config to return valid configs + from torchao.dtypes import MarlinSparseLayout from torchao.quantization import Int4WeightOnlyConfig - from torchao.sparsity.sparse_api import ( - BlockSparseWeightConfig, - ) # Test with semi-sparse config - mock_string_to_config.return_value = Int4WeightOnlyConfig() + mock_string_to_config.return_value = Int4WeightOnlyConfig( + layout=MarlinSparseLayout() + ) config = BenchmarkConfig( 
quantization="marlin", sparsity="semi-sparse", @@ -77,6 +77,14 @@ def test_run_inference_with_sparsity(self, mock_string_to_config): self.assertIsInstance(result, BenchmarkResult) self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") + def test_run_inference_with_block_sparsity(self, mock_string_to_config): + """Test running inference with sparsity configurations""" + # Mock string_to_config to return valid configs + from torchao.sparsity.sparse_api import ( + BlockSparseWeightConfig, + ) + # Test with block sparsity mock_string_to_config.return_value = BlockSparseWeightConfig() config = BenchmarkConfig( From d71baa35e23e929e04f2e48ae5cc987199fb18d7 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:28:18 -0700 Subject: [PATCH 6/8] Add profile support ghstack-source-id: df824669905a8f0fbc1de6744d7a4bc7aa1747d5 ghstack-comment-id: 2752688926 Pull Request resolved: https://github.com/pytorch/ao/pull/1960 --- .../microbenchmarks/benchmark_inference.py | 6 +- .../microbenchmarks/test/benchmark_config.yml | 3 +- benchmarks/microbenchmarks/utils.py | 64 +++++++++++++++++++ 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index c084d18d3a..9dec543c11 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -20,6 +20,7 @@ BenchmarkResult, clean_caches, create_model_and_input, + generate_model_profile, model_inference_time_in_ms, string_to_config, ) @@ -84,10 +85,9 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: model=m_copy, input_data=input_data ) - # TODO: Benchmark time using profiler # Profile dtype model evaluation - # prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype) - # prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details + prof = generate_model_profile(m_copy, input_data) + prof.export_chrome_trace(f"{config.profile_path}.json") # Save profiling details # TODO: Benchmark gemm time using cuda graph # gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs) diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 97a38469de..13f67bbaeb 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -8,7 +8,7 @@ sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison - "semi-sparse" - "block" -output_dir: "benchmarks/microbenchmarks/results" +output_dir: "benchmarks/microbenchmarks/results/" model_params: - name: "small_bf16_linear" matrix_shapes: @@ -21,6 +21,7 @@ model_params: torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" + profile: True - name: "large_bf16_ln_linear" matrix_shapes: diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index fd3db11591..111f9072f3 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -27,6 +27,12 @@ UIntXWeightOnlyConfig, ) from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig +from torch.profiler import profile, record_function, ProfilerActivity +import os +import subprocess +import sys +import uuid + try: import triton # 
noqa: F401 @@ -84,6 +90,8 @@ def __init__( "name", f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}", ) + self.profile = params.get("profile", False) + self.profile_file_name = os.path.join(self.output_dir, f"/profile/{self.name}_{self.m}_{self.k}_{self.n}_profile.json") @staticmethod def _parse_precision(precision_str: str) -> torch.dtype: @@ -105,6 +113,7 @@ def to_dict(self) -> Dict[str, Any]: "device": self.device, "model_type": self.model_type, "output_dir": self.output_dir, + "profile": self.profile, } @@ -319,6 +328,61 @@ def model_inference_time_in_ms(model, input_data): return res * 1e6 +def upload_trace_file(local_path: str, overwrite: bool = False) -> Optional[str]: + file_name = os.path.basename(local_path) + manifold_path = os.path.join( + "perfetto_internal_traces/tree/shared_trace", f"{os.getlogin()}_{str(uuid.uuid4())}_{file_name}" + ) + cmd = [ + "manifold", + "put", + local_path, + manifold_path, + "--ttl", + str(28 * 24 * 60 * 60), + "--userData", + "false", + ] + ret = subprocess.run( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True + ) + if ret.returncode == 0: + print("Upload trace successfully.") + return manifold_path + else: + print("[ERROR] Upload failed, maybe the trace file exists.") + return None + + +def print_perfetto_ui_url(manifold_path: str) -> None: + url = ( + "https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html" + + "#!/?url=https://interncache-all.fbcdn.net/manifold/" + + manifold_path + ) + print(f"The trace is accessible at:\n{url}") + + +def generate_model_profile(model, input_data, profile_file): + # Function to benchmark model evaluation with profiling + torch.profiler._utils._init_for_cuda_graphs() + prof = torch.profiler.profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) + with prof: + # with record_function("model_inference"): + for _ in range(1): # Run the model multiple times to warm up the cache + with torch.no_grad(): + _ = model(*input_data) + torch.cuda.synchronize() + prof.export_chrome_trace(profile_file) # Save profiling details + + manifold_path = upload_trace_file(profile_file) + if manifold_path: + print_perfetto_ui_url(manifold_path) + + # Return the profiler output + return prof + + def create_model_and_input( model_type: str, m: int, From f2d24cd69a13b951978109f287ca660176f4c140 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:31:03 -0700 Subject: [PATCH 7/8] Revert "Add profile support" This reverts commit d71baa35e23e929e04f2e48ae5cc987199fb18d7. 
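For context, the profiling helper that this revert removes follows the standard torch.profiler chrome-trace workflow; the Manifold upload and Perfetto URL printing were internal-only additions on top of it. A minimal sketch of that pattern, assuming a CUDA device and a (model, input_data) pair such as the one returned by create_model_and_input; the function name profile_model and the trace_path argument are illustrative names, not part of this patch set:

    import torch
    from torch.profiler import ProfilerActivity

    def profile_model(model, input_data, trace_path):
        # Capture CPU and CUDA activity for a single no-grad forward pass,
        # then export a Chrome trace viewable in chrome://tracing or Perfetto.
        with torch.profiler.profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
        ) as prof:
            with torch.no_grad():
                _ = model(*input_data)
            torch.cuda.synchronize()
        prof.export_chrome_trace(trace_path)
        return prof
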
--- .../microbenchmarks/benchmark_inference.py | 6 +- .../microbenchmarks/test/benchmark_config.yml | 3 +- benchmarks/microbenchmarks/utils.py | 64 ------------------- 3 files changed, 4 insertions(+), 69 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 9dec543c11..c084d18d3a 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -20,7 +20,6 @@ BenchmarkResult, clean_caches, create_model_and_input, - generate_model_profile, model_inference_time_in_ms, string_to_config, ) @@ -85,9 +84,10 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: model=m_copy, input_data=input_data ) + # TODO: Benchmark time using profiler # Profile dtype model evaluation - prof = generate_model_profile(m_copy, input_data) - prof.export_chrome_trace(f"{config.profile_path}.json") # Save profiling details + # prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype) + # prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details # TODO: Benchmark gemm time using cuda graph # gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs) diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 13f67bbaeb..97a38469de 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -8,7 +8,7 @@ sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison - "semi-sparse" - "block" -output_dir: "benchmarks/microbenchmarks/results/" +output_dir: "benchmarks/microbenchmarks/results" model_params: - name: "small_bf16_linear" matrix_shapes: @@ -21,7 +21,6 @@ model_params: torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" - profile: True - name: "large_bf16_ln_linear" matrix_shapes: diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 111f9072f3..fd3db11591 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -27,12 +27,6 @@ UIntXWeightOnlyConfig, ) from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig -from torch.profiler import profile, record_function, ProfilerActivity -import os -import subprocess -import sys -import uuid - try: import triton # noqa: F401 @@ -90,8 +84,6 @@ def __init__( "name", f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}", ) - self.profile = params.get("profile", False) - self.profile_file_name = os.path.join(self.output_dir, f"/profile/{self.name}_{self.m}_{self.k}_{self.n}_profile.json") @staticmethod def _parse_precision(precision_str: str) -> torch.dtype: @@ -113,7 +105,6 @@ def to_dict(self) -> Dict[str, Any]: "device": self.device, "model_type": self.model_type, "output_dir": self.output_dir, - "profile": self.profile, } @@ -328,61 +319,6 @@ def model_inference_time_in_ms(model, input_data): return res * 1e6 -def upload_trace_file(local_path: str, overwrite: bool = False) -> Optional[str]: - file_name = os.path.basename(local_path) - manifold_path = os.path.join( - "perfetto_internal_traces/tree/shared_trace", f"{os.getlogin()}_{str(uuid.uuid4())}_{file_name}" - ) - cmd = [ - "manifold", - "put", - local_path, - manifold_path, - "--ttl", - 
str(28 * 24 * 60 * 60), - "--userData", - "false", - ] - ret = subprocess.run( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True - ) - if ret.returncode == 0: - print("Upload trace successfully.") - return manifold_path - else: - print("[ERROR] Upload failed, maybe the trace file exists.") - return None - - -def print_perfetto_ui_url(manifold_path: str) -> None: - url = ( - "https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html" - + "#!/?url=https://interncache-all.fbcdn.net/manifold/" - + manifold_path - ) - print(f"The trace is accessible at:\n{url}") - - -def generate_model_profile(model, input_data, profile_file): - # Function to benchmark model evaluation with profiling - torch.profiler._utils._init_for_cuda_graphs() - prof = torch.profiler.profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) - with prof: - # with record_function("model_inference"): - for _ in range(1): # Run the model multiple times to warm up the cache - with torch.no_grad(): - _ = model(*input_data) - torch.cuda.synchronize() - prof.export_chrome_trace(profile_file) # Save profiling details - - manifold_path = upload_trace_file(profile_file) - if manifold_path: - print_perfetto_ui_url(manifold_path) - - # Return the profiler output - return prof - - def create_model_and_input( model_type: str, m: int, From 285038951e379f67109ff577e42f0cf9f63585ae Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 27 Mar 2025 11:02:06 -0700 Subject: [PATCH 8/8] nit updates --- benchmarks/microbenchmarks/test/test_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 83e88c5b11..14f226bd7e 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -43,7 +43,7 @@ def setUp(self): def test_benchmark_config(self): config = BenchmarkConfig( quantization="baseline", - sparsity="None", + sparsity=None, params=self.test_params, shape_name="custom", shape=self.test_shape, @@ -65,7 +65,7 @@ def test_benchmark_config(self): def test_benchmark_result(self): config = BenchmarkConfig( quantization="baseline", - sparsity="None", + sparsity=None, params=self.test_params, shape_name="custom", shape=self.test_shape, @@ -88,15 +88,15 @@ def test_get_default_device(self): def test_string_to_config(self): # Test baseline - config = string_to_config("baseline", "None") + config = string_to_config("baseline", None) self.assertIsNone(config) # Test int8wo - config = string_to_config("int8wo", "None") + config = string_to_config("int8wo", None) self.assertIsNotNone(config) # Test invalid config - config = string_to_config("not_a_real_config", "None") + config = string_to_config("not_a_real_config", None) self.assertIsNone(config) def test_string_to_config_sparsity(self): @@ -191,7 +191,7 @@ def test_generate_results_csv(self): BenchmarkResult( BenchmarkConfig( quantization="int8wo", - sparsity="None", + sparsity=None, params={}, shape_name="custom", shape=[1024, 1024, 1024], @@ -202,7 +202,7 @@ def test_generate_results_csv(self): BenchmarkResult( BenchmarkConfig( quantization="int4wo", - sparsity="None", + sparsity=None, params={}, shape_name="custom", shape=[1024, 1024, 1024],
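
The nit updates above standardize on passing the Python value None (rather than the string "None") when no sparsity recipe is requested, matching how string_to_config and the inference runner treat an absent recipe. A minimal sketch of the resulting dispatch, mirroring benchmark_inference.run() from the earlier patches; base_model and config stand in for the objects built by create_model_and_input and BenchmarkConfig, and the import path follows the file layout shown in the diffs:

    from copy import deepcopy

    from benchmarks.microbenchmarks.utils import string_to_config
    from torchao.quantization import quantize_
    from torchao.sparsity.sparse_api import sparsify_

    m_copy = deepcopy(base_model).eval().to(config.device)
    ao_base_config = string_to_config(
        config.quantization,
        config.sparsity,
        high_precision_dtype=config.high_precision_dtype,
    )
    if config.quantization is not None and ao_base_config is not None:
        quantize_(m_copy, ao_base_config)   # e.g. "int8wo", "int4wo-32", "marlin"
    elif config.sparsity is not None and ao_base_config is not None:
        sparsify_(m_copy, ao_base_config)   # e.g. "semi-sparse", "block"
    # With both fields None, string_to_config returns None and the
    # high-precision baseline model is benchmarked unchanged.

With None as the sentinel, a missing recipe and an unrecognized recipe both fall through to the baseline path, which is what the test_string_to_config cases above assert.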