From c49d51277283180151b9b4c959f3726c2c0ac4c5 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:28:03 -0700 Subject: [PATCH 1/8] Add sparsity support in benchmarking ghstack-source-id: c9e900d84457e7dc527ad3ea78648d61b8004b09 ghstack-comment-id: 2752688588 Pull Request resolved: https://github.com/pytorch/ao/pull/1955 --- .../microbenchmarks/benchmark_inference.py | 15 ++++++--- .../microbenchmarks/benchmark_runner.py | 32 +++++++++++++++++-- .../microbenchmarks/test/benchmark_config.yml | 1 + .../test/test_benchmark_inference.py | 1 + benchmarks/microbenchmarks/utils.py | 30 ++++++++++++++--- 5 files changed, 69 insertions(+), 10 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index e96ec9c59e..3b04aebb3b 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -23,7 +23,10 @@ model_inference_time_in_ms, string_to_config, ) +from torchao import quantization +from torchao.core.config import AOBaseConfig from torchao.quantization import quantize_ +from torchao.sparsity.sparse_api import sparsify_ def run(config: BenchmarkConfig) -> BenchmarkResult: @@ -44,11 +47,15 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Use quantize_ to apply each quantization function to the model m_copy = deepcopy(base_model).eval().to(config.device) - quantization_config = string_to_config( - config.quantization, high_precision_dtype=config.high_precision_dtype + aoBaseConfig = string_to_config( + config.quantization, config.sparsity, high_precision_dtype=config.high_precision_dtype ) - if quantization_config is not None: - quantize_(m_copy, quantization_config) + if aoBaseConfig is not None and config.quantization is not None: + quantize_(m_copy, aoBaseConfig) + elif config.sparsity is not None and aoBaseConfig is not None: + sparsify_(m_copy, aoBaseConfig) + else: + pass # No quantization or sparsity specified, do nothing if config.use_torch_compile: print("Compiling model....") m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 69a6ced025..32dd9ca02b 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -30,6 +30,7 @@ generate_results_csv, print_results, ) +from torchao import sparsity def get_shapes_for_config( @@ -67,6 +68,28 @@ def get_param_combinations(model_param): return shapes, base_params +def get_quantization_sparsity_recipes( + quantization_recipes: str, sparsity_recipes: str +) -> List[Tuple[str, str]]: + """Generate valid quantization and sparsity recipes.""" + + config_recipes = [] + for quant_config, sparse_config in product(quantization_recipes, sparsity_recipes): + if sparse_config != "None" and quant_config != "baseline": + if "semi" in sparse_config or "2:4" in sparse_config: + if "marlin" in quant_config or "int8dq" in quant_config or "float8dq" in quant_config: + pass + else: + continue + elif sparse_config == "block": + config_recipes.append(("baseline", sparse_config)) + else: + raise ValueError(f"Invalid sparsity recipe: {sparse_config}") + config_recipes.append((quant_config, sparse_config)) + print('Generated config recipes: ', config_recipes) + return config_recipes + + def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig]: """Load benchmark configurations from CLI arguments and YAML 
file.""" @@ -82,12 +105,17 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig shapes, params = get_param_combinations(model_param) # Create configs for all combinations - for quant_config, (shape_name, shape) in product( - config.get("quantization_config_recipe_names", ["baseline"]), shapes + for (quant_config, sparse_config), (shape_name, shape) in product( + get_quantization_sparsity_recipes( + config.get("quantization_config_recipe_names", ["baseline"]), + config.get("sparsity_config_recipe_names", ["None"]), + ), + shapes ): configs.append( BenchmarkConfig( quantization=quant_config, + sparsity=sparse_config, params=params, shape_name=shape_name, shape=shape, diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index ee2de56ed9..cfc3d41e7e 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -4,6 +4,7 @@ quantization_config_recipe_names: - "baseline" - "int4wo-32" - "int4wo-128" +sparsity_config_recipe_name: "semi-sparse" output_dir: "benchmarks/microbenchmarks/results" model_params: - name: "small_bf16_linear" diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index b67ed6a0cc..bc7bd7dbbf 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -17,6 +17,7 @@ def setUp(self): self.config = BenchmarkConfig( quantization="baseline", + sparsity="semi-sparse", params={ "high_precision_dtype": "torch.float32", "use_torch_compile": False, diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 34d4443255..a007e7a364 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -5,7 +5,8 @@ # LICENSE file in the root directory of this source tree. import csv import os -from typing import Any, Dict, List +from pickle import NONE +from typing import Any, Dict, List, Optional import torch from tabulate import tabulate @@ -24,7 +25,9 @@ PerRow, PerTensor, UIntXWeightOnlyConfig, + Float8DynamicActivationFloat8SemiSparseWeightConfig, ) +from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig try: import triton # noqa: F401 @@ -51,7 +54,8 @@ def get_default_device(device: str = "cuda") -> str: class BenchmarkConfig: def __init__( self, - quantization: str, # Quantization string format is similar to the format being used for llama/generate.py + quantization: Optional[str], # Quantization string format is similar to the format being used for llama/generate.py + sparsity: Optional[str], # Specify the type of sparsity to be used params: Dict[str, Any], shape_name: str, shape: List[int], @@ -60,6 +64,7 @@ def __init__( ): self.benchmark_mode = benchmark_mode self.quantization = quantization + self.sparsity = sparsity self.m, self.k, self.n = shape self.shape_name = shape_name self.high_precision_dtype = self._parse_precision( @@ -142,16 +147,24 @@ def forward(self, x): return x -def string_to_config(quantization: str, **kwargs) -> AOBaseConfig: +def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwargs) -> AOBaseConfig: """Get quantization config based on quantization string. Args: quantization (str): Quantization method to be used. The quantiation string format is similar to the format being used for llama/generate.py. 
+ sparsity (str): Sparsity method to be used. The sparsity string format is similar to the format being used for llama/generate.py. **kwargs: Additional arguments to be passed to the quantization method Returns: AOBaseConfig: Quantization configuration object """ + if quantization is None and sparsity is not None: + if "semi" in sparsity or "2:4" in sparsity: + return SemiSparseWeightConfig() + if sparsity == "block": + return BlockSparseWeightConfig() + if quantization is None and sparsity is None: + return None high_precision_dtype = kwargs.get("high_precision_dtype", torch.bfloat16) if "int4wo" in quantization and not HAS_TRITON: print("Warning: Triton not available, falling back to baseline") @@ -163,7 +176,9 @@ def string_to_config(quantization: str, **kwargs) -> AOBaseConfig: if "int8wo" in quantization: return Int8WeightOnlyConfig() if "int8dq" in quantization: - if "int8dq_prefill_wo_decode" in quantization: + if sparsity is not None and ("semi" in sparsity or "2:4" in sparsity): + return Int8DynamicActivationInt8WeightConfig(layout=SemiSparseLayout()) + elif "int8dq_prefill_wo_decode" in quantization: return Int8DynamicActivationInt8WeightConfig(weight_only_decode=True) else: return Int8DynamicActivationInt8WeightConfig() @@ -198,6 +213,9 @@ def string_to_config(quantization: str, **kwargs) -> AOBaseConfig: act_mapping_type=MappingType.SYMMETRIC, layout=MarlinQQQLayout(), ) + elif sparsity is not None and ("semi" in sparsity or "2:4" in sparsity): + from torchao.dtypes import MarlinSparseLayout + return Int4WeightOnlyConfig(layout=MarlinSparseLayout()) if "fp6" in quantization: return FPXWeightOnlyConfig(3, 2) elif "uintx" in quantization: @@ -246,6 +264,8 @@ def string_to_config(quantization: str, **kwargs) -> AOBaseConfig: elif "float8wo" in quantization: return Float8WeightOnlyConfig() elif "float8dq" in quantization: + if sparsity and "semi" in sparsity: + return Float8DynamicActivationFloat8SemiSparseWeightConfig() granularity = str(quantization.split("-")[-1]) if granularity == "tensor": granularity = PerTensor() @@ -369,6 +389,7 @@ def print_results(results: List[BenchmarkResult]): # Extract relevant columns for display display_columns = [ "quantization", + "sparsity", "model_type", "m", "k", @@ -380,6 +401,7 @@ def print_results(results: List[BenchmarkResult]): # Format data for tabulate headers = { "quantization": "Quantization", + "sparsity": "Sparsity", "model_type": "Model Type", "m": "M", "k": "K", From 407ea1d0720fc42916b11d12b73201fcef2d4ae2 Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Tue, 25 Mar 2025 15:28:06 -0700 Subject: [PATCH 2/8] Sparsity tests ghstack-source-id: ac08cd26ac7d672bbca67c628e305c9771536944 ghstack-comment-id: 2752688653 Pull Request resolved: https://github.com/pytorch/ao/pull/1956 --- .../microbenchmarks/benchmark_inference.py | 8 +-- .../microbenchmarks/benchmark_runner.py | 17 ++--- .../microbenchmarks/test/benchmark_config.yml | 7 ++- .../test/test_benchmark_inference.py | 40 ++++++++++++ .../test/test_benchmark_runner.py | 63 +++++++++++++++++++ benchmarks/microbenchmarks/test/test_utils.py | 43 ++++++++++++- benchmarks/microbenchmarks/utils.py | 17 +++-- 7 files changed, 174 insertions(+), 21 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 3b04aebb3b..56fd314f36 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -23,8 +23,6 @@ model_inference_time_in_ms, string_to_config, ) -from 
torchao import quantization -from torchao.core.config import AOBaseConfig from torchao.quantization import quantize_ from torchao.sparsity.sparse_api import sparsify_ @@ -48,14 +46,16 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Use quantize_ to apply each quantization function to the model m_copy = deepcopy(base_model).eval().to(config.device) aoBaseConfig = string_to_config( - config.quantization, config.sparsity, high_precision_dtype=config.high_precision_dtype + config.quantization, + config.sparsity, + high_precision_dtype=config.high_precision_dtype, ) if aoBaseConfig is not None and config.quantization is not None: quantize_(m_copy, aoBaseConfig) elif config.sparsity is not None and aoBaseConfig is not None: sparsify_(m_copy, aoBaseConfig) else: - pass # No quantization or sparsity specified, do nothing + pass # No quantization or sparsity specified, do nothing if config.use_torch_compile: print("Compiling model....") m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 32dd9ca02b..7ba6d6b6dc 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -30,7 +30,6 @@ generate_results_csv, print_results, ) -from torchao import sparsity def get_shapes_for_config( @@ -68,6 +67,7 @@ def get_param_combinations(model_param): return shapes, base_params + def get_quantization_sparsity_recipes( quantization_recipes: str, sparsity_recipes: str ) -> List[Tuple[str, str]]: @@ -75,9 +75,14 @@ def get_quantization_sparsity_recipes( config_recipes = [] for quant_config, sparse_config in product(quantization_recipes, sparsity_recipes): - if sparse_config != "None" and quant_config != "baseline": + if sparse_config != "None": if "semi" in sparse_config or "2:4" in sparse_config: - if "marlin" in quant_config or "int8dq" in quant_config or "float8dq" in quant_config: + if ( + "marlin" in quant_config + or "int8dq" in quant_config + or "float8dq" in quant_config + or quant_config == "baseline" + ): pass else: continue @@ -86,10 +91,8 @@ def get_quantization_sparsity_recipes( else: raise ValueError(f"Invalid sparsity recipe: {sparse_config}") config_recipes.append((quant_config, sparse_config)) - print('Generated config recipes: ', config_recipes) return config_recipes - def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig]: """Load benchmark configurations from CLI arguments and YAML file.""" @@ -109,8 +112,8 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig get_quantization_sparsity_recipes( config.get("quantization_config_recipe_names", ["baseline"]), config.get("sparsity_config_recipe_names", ["None"]), - ), - shapes + ), + shapes, ): configs.append( BenchmarkConfig( diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index cfc3d41e7e..074dfeabf4 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -3,8 +3,11 @@ benchmark_mode: "inference" quantization_config_recipe_names: - "baseline" - "int4wo-32" - - "int4wo-128" -sparsity_config_recipe_name: "semi-sparse" + - "marlin" +sparsity_config_recipe_names: + - "None" + - "semi-sparse" + - "block" output_dir: "benchmarks/microbenchmarks/results" model_params: - name: "small_bf16_linear" diff --git 
a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index bc7bd7dbbf..55052acdaa 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -41,6 +41,46 @@ def test_run_inference(self): self.assertIsInstance(result, BenchmarkResult) self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + def test_run_inference_with_sparsity(self): + """Test running inference with sparsity configurations""" + # Test with semi-sparse config + config = BenchmarkConfig( + quantization="marlin", + sparsity="semi-sparse", + params={ + "high_precision_dtype": "torch.float32", + "use_torch_compile": False, + "device": "cpu", + "model_type": "linear", + }, + shape_name="custom", + shape=[16, 32, 8], + output_dir=self.temp_dir, + benchmark_mode="inference", + ) + result = run(config) + self.assertIsInstance(result, BenchmarkResult) + self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + + # Test with block sparsity + config = BenchmarkConfig( + quantization="baseline", + sparsity="block", + params={ + "high_precision_dtype": "torch.float32", + "use_torch_compile": False, + "device": "cpu", + "model_type": "linear", + }, + shape_name="custom", + shape=[16, 32, 8], + output_dir=self.temp_dir, + benchmark_mode="inference", + ) + result = run(config) + self.assertIsInstance(result, BenchmarkResult) + self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + if __name__ == "__main__": unittest.main() diff --git a/benchmarks/microbenchmarks/test/test_benchmark_runner.py b/benchmarks/microbenchmarks/test/test_benchmark_runner.py index 27e6304513..7e9c6d024e 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_runner.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_runner.py @@ -13,6 +13,7 @@ from benchmarks.microbenchmarks.benchmark_runner import ( get_param_combinations, + get_quantization_sparsity_recipes, get_shapes_for_config, load_benchmark_configs, run_inference_benchmarks_from_config, @@ -88,6 +89,68 @@ def test_run_inference_benchmarks_from_config(self): results_file = Path(self.temp_dir) / "results.csv" self.assertTrue(results_file.exists()) + def test_get_quantization_sparsity_recipes(self): + """Test generation of valid quantization and sparsity recipe combinations""" + # Test basic combinations + quant_recipes = ["baseline", "int8wo"] + sparse_recipes = ["None", "semi-sparse"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + self.assertEqual( + len(recipes), 3 + ) # Should only get baseline+None and int8wo+None + self.assertIn(("baseline", "None"), recipes) + self.assertIn(("int8wo", "None"), recipes) + self.assertIn(("baseline", "semi-sparse"), recipes) + + # Test marlin with semi-sparse + quant_recipes = ["marlin", "baseline"] + sparse_recipes = ["None", "semi-sparse"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + self.assertIn(("marlin", "semi-sparse"), recipes) + self.assertIn(("baseline", "None"), recipes) + + # Test block sparsity + quant_recipes = ["baseline"] + sparse_recipes = ["None", "block"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + self.assertIn(("baseline", "block"), recipes) + + def test_load_benchmark_configs_with_sparsity(self): + """Test loading benchmark configs with sparsity options""" + test_config = { + "benchmark_mode": "inference", + "quantization_config_recipe_names": ["baseline", "marlin"], + 
"sparsity_config_recipe_names": ["None", "semi-sparse"], + "output_dir": self.temp_dir, + "model_params": [ + { + "matrix_shapes": [ + {"name": "custom", "shapes": [[1024, 1024, 1024]]} + ], + "high_precision_dtype": "torch.bfloat16", + "device": "cpu", + "model_type": "linear", + } + ], + } + + config_path = Path(self.temp_dir) / "test_sparsity_config.yml" + with open(config_path, "w") as f: + yaml.dump(test_config, f) + + configs = load_benchmark_configs(argparse.Namespace(config=str(config_path))) + + # Check that we get configs for baseline and marlin with appropriate sparsity + self.assertTrue( + any(c.quantization == "baseline" and c.sparsity == "None" for c in configs) + ) + self.assertTrue( + any( + c.quantization == "marlin" and c.sparsity == "semi-sparse" + for c in configs + ) + ) + if __name__ == "__main__": unittest.main() diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 7864467f9f..ae7e88c9ae 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -13,7 +13,11 @@ from benchmarks.microbenchmarks.utils import ( BenchmarkConfig, BenchmarkResult, + BlockSparseWeightConfig, + Float8DynamicActivationFloat8SemiSparseWeightConfig, + Int4WeightOnlyConfig, LNLinearSigmoid, + SemiSparseWeightConfig, ToyLinearModel, clean_caches, create_model_and_input, @@ -39,6 +43,7 @@ def setUp(self): def test_benchmark_config(self): config = BenchmarkConfig( quantization="baseline", + sparsity="None", params=self.test_params, shape_name="custom", shape=self.test_shape, @@ -60,6 +65,7 @@ def test_benchmark_config(self): def test_benchmark_result(self): config = BenchmarkConfig( quantization="baseline", + sparsity="None", params=self.test_params, shape_name="custom", shape=self.test_shape, @@ -82,17 +88,46 @@ def test_get_default_device(self): def test_string_to_config(self): # Test baseline - config = string_to_config("baseline") + config = string_to_config("baseline", "None") self.assertIsNone(config) # Test int8wo - config = string_to_config("int8wo") + config = string_to_config("int8wo", "None") self.assertIsNotNone(config) # Test invalid config - config = string_to_config("not_a_real_config") + config = string_to_config("not_a_real_config", "None") self.assertIsNone(config) + def test_string_to_config_sparsity(self): + """Test sparsity config generation""" + # Test semi-sparse config + config = string_to_config(None, "semi-sparse") + self.assertIsInstance(config, SemiSparseWeightConfig) + + # Test block sparse config + config = string_to_config(None, "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + + # Test combined sparsity and quantization + config = string_to_config("marlin", "semi-sparse") + self.assertIsInstance(config, Int4WeightOnlyConfig) + + # Test float8 with semi-sparse + config = string_to_config("float8dq", "semi-sparse") + self.assertIsInstance( + config, Float8DynamicActivationFloat8SemiSparseWeightConfig + ) + + def test_invalid_sparsity(self): + """Test invalid sparsity config generation""" + from benchmarks.microbenchmarks.benchmark_runner import ( + get_quantization_sparsity_recipes, + ) + + with self.assertRaises(ValueError): + get_quantization_sparsity_recipes(["baseline"], ["invalid_sparsity"]) + def test_toy_linear_model(self): model = ToyLinearModel(k=64, n=32, dtype=torch.float32) x = torch.randn(16, 64) @@ -139,6 +174,7 @@ def test_generate_results_csv(self): BenchmarkResult( BenchmarkConfig( quantization="int8wo", + sparsity="None", 
params={}, shape_name="custom", shape=[1024, 1024, 1024], @@ -149,6 +185,7 @@ def test_generate_results_csv(self): BenchmarkResult( BenchmarkConfig( quantization="int4wo", + sparsity="None", params={}, shape_name="custom", shape=[1024, 1024, 1024], diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index a007e7a364..076d9aa180 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. import csv import os -from pickle import NONE from typing import Any, Dict, List, Optional import torch @@ -14,6 +13,7 @@ from torchao.core.config import AOBaseConfig from torchao.quantization import ( + Float8DynamicActivationFloat8SemiSparseWeightConfig, Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig, FPXWeightOnlyConfig, @@ -25,7 +25,6 @@ PerRow, PerTensor, UIntXWeightOnlyConfig, - Float8DynamicActivationFloat8SemiSparseWeightConfig, ) from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig @@ -54,7 +53,9 @@ def get_default_device(device: str = "cuda") -> str: class BenchmarkConfig: def __init__( self, - quantization: Optional[str], # Quantization string format is similar to the format being used for llama/generate.py + quantization: Optional[ + str + ], # Quantization string format is similar to the format being used for llama/generate.py sparsity: Optional[str], # Specify the type of sparsity to be used params: Dict[str, Any], shape_name: str, @@ -147,7 +148,9 @@ def forward(self, x): return x -def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwargs) -> AOBaseConfig: +def string_to_config( + quantization: Optional[str], sparsity: Optional[str], **kwargs +) -> AOBaseConfig: """Get quantization config based on quantization string. 
Args: @@ -166,6 +169,7 @@ def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwa if quantization is None and sparsity is None: return None high_precision_dtype = kwargs.get("high_precision_dtype", torch.bfloat16) + if "int4wo" in quantization and not HAS_TRITON: print("Warning: Triton not available, falling back to baseline") return None @@ -177,6 +181,8 @@ def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwa return Int8WeightOnlyConfig() if "int8dq" in quantization: if sparsity is not None and ("semi" in sparsity or "2:4" in sparsity): + from torchao.dtypes import SemiSparseLayout + return Int8DynamicActivationInt8WeightConfig(layout=SemiSparseLayout()) elif "int8dq_prefill_wo_decode" in quantization: return Int8DynamicActivationInt8WeightConfig(weight_only_decode=True) @@ -215,6 +221,7 @@ def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwa ) elif sparsity is not None and ("semi" in sparsity or "2:4" in sparsity): from torchao.dtypes import MarlinSparseLayout + return Int4WeightOnlyConfig(layout=MarlinSparseLayout()) if "fp6" in quantization: return FPXWeightOnlyConfig(3, 2) @@ -265,7 +272,7 @@ def string_to_config(quantization: Optional[str], sparsity: Optional[str], **kwa return Float8WeightOnlyConfig() elif "float8dq" in quantization: if sparsity and "semi" in sparsity: - return Float8DynamicActivationFloat8SemiSparseWeightConfig() + return Float8DynamicActivationFloat8SemiSparseWeightConfig() granularity = str(quantization.split("-")[-1]) if granularity == "tensor": granularity = PerTensor() From 65d406503dc6796f39c76690c8d7ec088ab2e4f3 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:28:09 -0700 Subject: [PATCH 3/8] Add sparsify_ to benchmarking ghstack-source-id: 4cf77f85be18d1df2b1827ce11afbd02fccd9188 ghstack-comment-id: 2752688734 Pull Request resolved: https://github.com/pytorch/ao/pull/1957 --- benchmarks/microbenchmarks/benchmark_inference.py | 11 +++++++---- benchmarks/microbenchmarks/results/results.csv | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) create mode 100644 benchmarks/microbenchmarks/results/results.csv diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 56fd314f36..ea91deca12 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -50,12 +50,15 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: config.sparsity, high_precision_dtype=config.high_precision_dtype, ) - if aoBaseConfig is not None and config.quantization is not None: - quantize_(m_copy, aoBaseConfig) - elif config.sparsity is not None and aoBaseConfig is not None: + if config.sparsity != "None" and config.quantization == "baseline": + print(f"Sparsifying model for sparsity: {config.sparsity}") sparsify_(m_copy, aoBaseConfig) - else: + elif config.sparsity == "None" and config.quantization == "baseline": pass # No quantization or sparsity specified, do nothing + else: + print(f"Quantizing model with quantization: {config.quantization}, sparsity: {config.sparsity}") + quantize_(m_copy, aoBaseConfig) + if config.use_torch_compile: print("Compiling model....") m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True) diff --git a/benchmarks/microbenchmarks/results/results.csv b/benchmarks/microbenchmarks/results/results.csv new file mode 100644 index 0000000000..d1e2fe188e --- /dev/null +++ 
b/benchmarks/microbenchmarks/results/results.csv @@ -0,0 +1,14 @@ +name,quantization,m,k,n,high_precision_dtype,use_torch_compile,torch_compile_mode,device,model_type,output_dir,model_inference_time_in_ms +small_bf16_linear,baseline,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,47.870889538899064 +small_bf16_linear,int4wo-32,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,92.79820020310581 +small_bf16_linear,int4wo-32,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,92.79379970394075 +small_bf16_linear,marlin,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,2827.8696595225483 +large_bf16_ln_linear,baseline,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,69.42827021703124 +large_bf16_ln_linear,baseline,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,110.21242011338472 +large_bf16_ln_linear,int4wo-32,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,640.171259874478 +large_bf16_ln_linear,int4wo-32,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,1230.195889947936 +large_bf16_ln_linear,int4wo-32,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,638.9500107616186 +large_bf16_ln_linear,int4wo-32,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,1224.7836799360812 +large_bf16_ln_linear,marlin,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,10065.855470020324 +large_bf16_ln_linear,marlin,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,11008.49203998223 +cpu_fp32_linear,baseline,4096,4096,1024,torch.float32,False,,cpu,linear,benchmarks/microbenchmarks/results,369783.37747976184 From cea30b1fbf1ec128d56028c0a118de35ccac36db Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:28:12 -0700 Subject: [PATCH 4/8] Support sparsify_ run ghstack-source-id: 8b459e288351460492c91d476791f3d47c7b627d ghstack-comment-id: 2752688801 Pull Request resolved: https://github.com/pytorch/ao/pull/1958 --- .../microbenchmarks/benchmark_inference.py | 23 +++-- .../microbenchmarks/benchmark_runner.py | 84 +++++++++++++------ .../microbenchmarks/results/results.csv | 14 ---- .../microbenchmarks/test/benchmark_config.yml | 4 +- .../test/test_benchmark_inference.py | 24 +++++- .../test/test_benchmark_runner.py | 65 +++++++++++--- benchmarks/microbenchmarks/test/test_utils.py | 17 ++++ benchmarks/microbenchmarks/utils.py | 12 ++- 8 files changed, 178 insertions(+), 65 deletions(-) delete mode 100644 benchmarks/microbenchmarks/results/results.csv diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index ea91deca12..d84e21f382 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -50,13 +50,26 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: config.sparsity, high_precision_dtype=config.high_precision_dtype, ) - if config.sparsity != "None" and config.quantization == "baseline": - print(f"Sparsifying model for sparsity: {config.sparsity}") - 
sparsify_(m_copy, aoBaseConfig) - elif config.sparsity == "None" and config.quantization == "baseline": + + # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA) + is_cuda = config.device == "cuda" and torch.cuda.is_available() + + if config.sparsity is not None and ( + config.quantization is None or "baseline" in config.quantization + ): + if is_cuda: + print(f"Applying {config.sparsity} sparsity to model") + sparsify_(m_copy, aoBaseConfig) + else: + print( + f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}" + ) + elif config.sparsity is None and ( + config.quantization is None or "baseline" in config.quantization + ): pass # No quantization or sparsity specified, do nothing else: - print(f"Quantizing model with quantization: {config.quantization}, sparsity: {config.sparsity}") + print("Quantizing model....") quantize_(m_copy, aoBaseConfig) if config.use_torch_compile: diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 7ba6d6b6dc..85a8ef2f53 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -21,7 +21,7 @@ import argparse from itertools import product -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple import yaml @@ -69,28 +69,57 @@ def get_param_combinations(model_param): def get_quantization_sparsity_recipes( - quantization_recipes: str, sparsity_recipes: str -) -> List[Tuple[str, str]]: - """Generate valid quantization and sparsity recipes.""" - - config_recipes = [] - for quant_config, sparse_config in product(quantization_recipes, sparsity_recipes): - if sparse_config != "None": - if "semi" in sparse_config or "2:4" in sparse_config: + quantization_recipes: List[str], sparsity_recipes: List[str] +) -> Set[Tuple[str, Optional[str]]]: + """Generate valid quantization and sparsity recipes. 
+ + Args: + quantization_recipes: List of quantization recipes + sparsity_recipes: List of sparsity recipes + + Returns: + Set of tuples containing (quantization_recipe, sparsity_recipe) + For block sparsity, quantization is always "baseline" + All quantization techniques are also run without sparsity + """ + config_recipes = set() + + # Handle edge cases + if sparsity_recipes is None and quantization_recipes is None: + return {("baseline", None)} + if sparsity_recipes is None: + return {(quant, None) for quant in quantization_recipes} + if quantization_recipes is None: + return {("baseline", sparse) for sparse in sparsity_recipes} + + # Always include baseline without sparsity + config_recipes.add(("baseline", None)) + + # Add all quantization techniques without sparsity + for quant_config in quantization_recipes: + config_recipes.add((quant_config, None)) + + # Process combinations of quantization and sparsity + for sparse_config in sparsity_recipes: + if sparse_config is None: + # Skip None sparsity as we've already added all quantization techniques without sparsity + continue + elif "block" in sparse_config: + # For block sparsity, only pair with baseline quantization + config_recipes.add(("baseline", sparse_config)) + elif "semi" in sparse_config or "2:4" in sparse_config: + # For semi-sparse, only pair with compatible quantization methods + for quant_config in quantization_recipes: if ( "marlin" in quant_config or "int8dq" in quant_config or "float8dq" in quant_config or quant_config == "baseline" ): - pass - else: - continue - elif sparse_config == "block": - config_recipes.append(("baseline", sparse_config)) - else: - raise ValueError(f"Invalid sparsity recipe: {sparse_config}") - config_recipes.append((quant_config, sparse_config)) + config_recipes.add((quant_config, sparse_config)) + else: + raise ValueError(f"Invalid sparsity recipe: {sparse_config}") + return config_recipes @@ -104,15 +133,16 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig # Create all possible combinations configs = [] + quantization_sparsity_recipes = get_quantization_sparsity_recipes( + config.get("quantization_config_recipe_names", None), + config.get("sparsity_config_recipe_names", None), + ) for model_param in config["model_params"]: shapes, params = get_param_combinations(model_param) # Create configs for all combinations for (quant_config, sparse_config), (shape_name, shape) in product( - get_quantization_sparsity_recipes( - config.get("quantization_config_recipe_names", ["baseline"]), - config.get("sparsity_config_recipe_names", ["None"]), - ), + quantization_sparsity_recipes, shapes, ): configs.append( @@ -126,7 +156,6 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig benchmark_mode=benchmark_mode, ) ) - return configs @@ -135,14 +164,17 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None from benchmarks.microbenchmarks.benchmark_inference import run as run_inference results = [] - print("Benchmarking Inference ......") + print("----------------- RUNNING BENCHMARKS FOR INFERENCE -----------------------") for config in configs: + print("----------------------------------------") try: - print(f"Running: {config.name}") + print( + f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}" + ) result = run_inference(config) # Pass the config object directly results.append(result) - except Exception as e: - print(f"Error running benchmark {config.name}: {e}") + except 
Exception: + print(f"Error running benchmark {config.name}") continue # Add results to csv diff --git a/benchmarks/microbenchmarks/results/results.csv b/benchmarks/microbenchmarks/results/results.csv deleted file mode 100644 index d1e2fe188e..0000000000 --- a/benchmarks/microbenchmarks/results/results.csv +++ /dev/null @@ -1,14 +0,0 @@ -name,quantization,m,k,n,high_precision_dtype,use_torch_compile,torch_compile_mode,device,model_type,output_dir,model_inference_time_in_ms -small_bf16_linear,baseline,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,47.870889538899064 -small_bf16_linear,int4wo-32,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,92.79820020310581 -small_bf16_linear,int4wo-32,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,92.79379970394075 -small_bf16_linear,marlin,1024,1024,1024,torch.bfloat16,True,max-autotune,cuda,linear,benchmarks/microbenchmarks/results,2827.8696595225483 -large_bf16_ln_linear,baseline,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,69.42827021703124 -large_bf16_ln_linear,baseline,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,110.21242011338472 -large_bf16_ln_linear,int4wo-32,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,640.171259874478 -large_bf16_ln_linear,int4wo-32,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,1230.195889947936 -large_bf16_ln_linear,int4wo-32,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,638.9500107616186 -large_bf16_ln_linear,int4wo-32,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,1224.7836799360812 -large_bf16_ln_linear,marlin,2048,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,10065.855470020324 -large_bf16_ln_linear,marlin,4096,4096,1024,torch.bfloat16,True,max-autotune,cuda,ln_linear_sigmoid,benchmarks/microbenchmarks/results,11008.49203998223 -cpu_fp32_linear,baseline,4096,4096,1024,torch.float32,False,,cpu,linear,benchmarks/microbenchmarks/results,369783.37747976184 diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 074dfeabf4..17cd666bfa 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -1,11 +1,11 @@ # Sample configuration for inference benchmarks benchmark_mode: "inference" quantization_config_recipe_names: - - "baseline" + # - "baseline" Will always run a baseline instatance - "int4wo-32" - "marlin" sparsity_config_recipe_names: - - "None" + # - "none" Will always run a without sparsity instance - "semi-sparse" - "block" output_dir: "benchmarks/microbenchmarks/results" diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index 55052acdaa..0216297bc7 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. 
import tempfile import unittest +from unittest.mock import patch from benchmarks.microbenchmarks.benchmark_inference import run from benchmarks.microbenchmarks.utils import BenchmarkConfig, BenchmarkResult @@ -36,14 +37,28 @@ def tearDown(self): shutil.rmtree(self.temp_dir) - def test_run_inference(self): + @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") + def test_run_inference(self, mock_string_to_config): + # Mock string_to_config to return a valid config + from torchao.sparsity.sparse_api import SemiSparseWeightConfig + + mock_string_to_config.return_value = SemiSparseWeightConfig() + result = run(self.config) self.assertIsInstance(result, BenchmarkResult) self.assertTrue(hasattr(result, "model_inference_time_in_ms")) - def test_run_inference_with_sparsity(self): + @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") + def test_run_inference_with_sparsity(self, mock_string_to_config): """Test running inference with sparsity configurations""" + # Mock string_to_config to return valid configs + from torchao.quantization import Int4WeightOnlyConfig + from torchao.sparsity.sparse_api import ( + BlockSparseWeightConfig, + ) + # Test with semi-sparse config + mock_string_to_config.return_value = Int4WeightOnlyConfig() config = BenchmarkConfig( quantization="marlin", sparsity="semi-sparse", @@ -54,7 +69,7 @@ def test_run_inference_with_sparsity(self): "model_type": "linear", }, shape_name="custom", - shape=[16, 32, 8], + shape=[64, 64, 64], # Use dimensions divisible by 64 output_dir=self.temp_dir, benchmark_mode="inference", ) @@ -63,6 +78,7 @@ def test_run_inference_with_sparsity(self): self.assertTrue(hasattr(result, "model_inference_time_in_ms")) # Test with block sparsity + mock_string_to_config.return_value = BlockSparseWeightConfig() config = BenchmarkConfig( quantization="baseline", sparsity="block", @@ -73,7 +89,7 @@ def test_run_inference_with_sparsity(self): "model_type": "linear", }, shape_name="custom", - shape=[16, 32, 8], + shape=[64, 64, 64], # Use dimensions divisible by 64 output_dir=self.temp_dir, benchmark_mode="inference", ) diff --git a/benchmarks/microbenchmarks/test/test_benchmark_runner.py b/benchmarks/microbenchmarks/test/test_benchmark_runner.py index 7e9c6d024e..a8683a1de8 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_runner.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_runner.py @@ -93,34 +93,75 @@ def test_get_quantization_sparsity_recipes(self): """Test generation of valid quantization and sparsity recipe combinations""" # Test basic combinations quant_recipes = ["baseline", "int8wo"] - sparse_recipes = ["None", "semi-sparse"] + sparse_recipes = [None, "semi-sparse"] recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) - self.assertEqual( - len(recipes), 3 - ) # Should only get baseline+None and int8wo+None - self.assertIn(("baseline", "None"), recipes) - self.assertIn(("int8wo", "None"), recipes) + self.assertIn(("baseline", None), recipes) + self.assertIn(("int8wo", None), recipes) self.assertIn(("baseline", "semi-sparse"), recipes) # Test marlin with semi-sparse quant_recipes = ["marlin", "baseline"] - sparse_recipes = ["None", "semi-sparse"] + sparse_recipes = [None, "semi-sparse"] recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) self.assertIn(("marlin", "semi-sparse"), recipes) - self.assertIn(("baseline", "None"), recipes) + self.assertIn(("baseline", None), recipes) # Test block sparsity quant_recipes = ["baseline"] - sparse_recipes = 
["None", "block"] + sparse_recipes = [None, "block"] recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) self.assertIn(("baseline", "block"), recipes) - def test_load_benchmark_configs_with_sparsity(self): + def test_none_string_raises_error(self): + """Test that passing 'None' as a string raises an error""" + quant_recipes = ["baseline"] + sparse_recipes = ["None"] # "None" as a string should raise an error + with self.assertRaises(ValueError): + get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + + def test_block_sparsity_with_quantization(self): + """Test that block sparsity is only paired with baseline quantization""" + quant_recipes = ["baseline", "int8wo", "int4wo", "marlin"] + sparse_recipes = ["block"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + + # Block sparsity should only be paired with baseline + self.assertIn(("baseline", "block"), recipes) + self.assertNotIn(("int8wo", "block"), recipes) + self.assertNotIn(("int4wo", "block"), recipes) + self.assertNotIn(("marlin", "block"), recipes) + + # All quantization techniques should be run without sparsity + self.assertIn(("baseline", None), recipes) + self.assertIn(("int8wo", None), recipes) + self.assertIn(("int4wo", None), recipes) + self.assertIn(("marlin", None), recipes) + + def test_all_quantization_without_sparsity(self): + """Test that all quantization techniques are run without sparsity""" + quant_recipes = ["baseline", "int8wo", "int4wo", "marlin"] + sparse_recipes = [None, "semi-sparse", "block"] + recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes) + + # All quantization techniques should be run without sparsity + for quant in quant_recipes: + self.assertIn((quant, None), recipes) + + @patch( + "benchmarks.microbenchmarks.benchmark_runner.get_quantization_sparsity_recipes" + ) + def test_load_benchmark_configs_with_sparsity(self, mock_get_recipes): """Test loading benchmark configs with sparsity options""" + # Mock get_quantization_sparsity_recipes to return a valid set of recipes + mock_get_recipes.return_value = {("baseline", None), ("marlin", "semi-sparse")} + test_config = { "benchmark_mode": "inference", "quantization_config_recipe_names": ["baseline", "marlin"], - "sparsity_config_recipe_names": ["None", "semi-sparse"], + "sparsity_config_recipe_names": [ + None, + "semi-sparse", + ], # Use None instead of "None" "output_dir": self.temp_dir, "model_params": [ { @@ -142,7 +183,7 @@ def test_load_benchmark_configs_with_sparsity(self): # Check that we get configs for baseline and marlin with appropriate sparsity self.assertTrue( - any(c.quantization == "baseline" and c.sparsity == "None" for c in configs) + any(c.quantization == "baseline" and c.sparsity is None for c in configs) ) self.assertTrue( any( diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index ae7e88c9ae..83e88c5b11 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -119,6 +119,23 @@ def test_string_to_config_sparsity(self): config, Float8DynamicActivationFloat8SemiSparseWeightConfig ) + def test_block_sparsity_with_baseline_quantization(self): + """Test that block sparsity with baseline quantization returns BlockSparseWeightConfig""" + config = string_to_config("baseline", "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + + def test_block_sparsity_with_non_baseline_quantization(self): + """Test that block sparsity with 
non-baseline quantization still returns BlockSparseWeightConfig""" + # Block sparsity should take precedence over any quantization method + config = string_to_config("int8wo", "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + + config = string_to_config("int4wo", "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + + config = string_to_config("marlin", "block") + self.assertIsInstance(config, BlockSparseWeightConfig) + def test_invalid_sparsity(self): """Test invalid sparsity config generation""" from benchmarks.microbenchmarks.benchmark_runner import ( diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 076d9aa180..fd3db11591 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -95,6 +95,7 @@ def to_dict(self) -> Dict[str, Any]: return { "name": self.name, "quantization": self.quantization, + "sparsity": self.sparsity, "m": self.m, "k": self.k, "n": self.n, @@ -161,11 +162,16 @@ def string_to_config( Returns: AOBaseConfig: Quantization configuration object """ + # Handle block sparsity case - with block sparsity, quantization should always be "none" or "baseline" + if sparsity is not None and sparsity == "block": + return BlockSparseWeightConfig() + + # Handle other sparsity cases if quantization is None and sparsity is not None: if "semi" in sparsity or "2:4" in sparsity: return SemiSparseWeightConfig() - if sparsity == "block": - return BlockSparseWeightConfig() + else: + raise ValueError(f"Unknown sparsity type: {sparsity}") if quantization is None and sparsity is None: return None high_precision_dtype = kwargs.get("high_precision_dtype", torch.bfloat16) @@ -424,6 +430,8 @@ def print_results(results: List[BenchmarkResult]): row = [] for col in display_columns: value = result_dict.get(col, "N/A") + if value is None: + value = "N/A" if col == "model_inference_time_in_ms": value = f"{value:.2f}" if isinstance(value, (int, float)) else value elif col == "use_torch_compile": From ae70fece7bad9ebe3027867d422ffca55216589d Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Tue, 25 Mar 2025 15:28:15 -0700 Subject: [PATCH 5/8] Review updates ghstack-source-id: ded017942498ba330ed1e99cb784ead3a049bb6e ghstack-comment-id: 2752688871 Pull Request resolved: https://github.com/pytorch/ao/pull/1959 --- .../microbenchmarks/benchmark_inference.py | 6 +++--- benchmarks/microbenchmarks/benchmark_runner.py | 12 ++---------- .../microbenchmarks/test/benchmark_config.yml | 4 ++-- .../test/test_benchmark_inference.py | 18 +++++++++++++----- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index d84e21f382..c084d18d3a 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -45,7 +45,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: # Use quantize_ to apply each quantization function to the model m_copy = deepcopy(base_model).eval().to(config.device) - aoBaseConfig = string_to_config( + ao_base_config = string_to_config( config.quantization, config.sparsity, high_precision_dtype=config.high_precision_dtype, @@ -59,7 +59,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: ): if is_cuda: print(f"Applying {config.sparsity} sparsity to model") - sparsify_(m_copy, aoBaseConfig) + sparsify_(m_copy, ao_base_config) else: print( f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is 
{config.device}" @@ -70,7 +70,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: pass # No quantization or sparsity specified, do nothing else: print("Quantizing model....") - quantize_(m_copy, aoBaseConfig) + quantize_(m_copy, ao_base_config) if config.use_torch_compile: print("Compiling model....") diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py index 85a8ef2f53..7152542eec 100644 --- a/benchmarks/microbenchmarks/benchmark_runner.py +++ b/benchmarks/microbenchmarks/benchmark_runner.py @@ -84,14 +84,6 @@ def get_quantization_sparsity_recipes( """ config_recipes = set() - # Handle edge cases - if sparsity_recipes is None and quantization_recipes is None: - return {("baseline", None)} - if sparsity_recipes is None: - return {(quant, None) for quant in quantization_recipes} - if quantization_recipes is None: - return {("baseline", sparse) for sparse in sparsity_recipes} - # Always include baseline without sparsity config_recipes.add(("baseline", None)) @@ -134,8 +126,8 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig # Create all possible combinations configs = [] quantization_sparsity_recipes = get_quantization_sparsity_recipes( - config.get("quantization_config_recipe_names", None), - config.get("sparsity_config_recipe_names", None), + config.get("quantization_config_recipe_names", []), + config.get("sparsity_config_recipe_names", []), ) for model_param in config["model_params"]: shapes, params = get_param_combinations(model_param) diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 17cd666bfa..97a38469de 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -1,11 +1,11 @@ # Sample configuration for inference benchmarks benchmark_mode: "inference" quantization_config_recipe_names: - # - "baseline" Will always run a baseline instatance + # Will run a baseline inference for model by default, without quantization for comparison - "int4wo-32" - "marlin" sparsity_config_recipe_names: - # - "none" Will always run a without sparsity instance + # Will run a baseline inference for model by default, without sparsity for comparison - "semi-sparse" - "block" output_dir: "benchmarks/microbenchmarks/results" diff --git a/benchmarks/microbenchmarks/test/test_benchmark_inference.py b/benchmarks/microbenchmarks/test/test_benchmark_inference.py index 0216297bc7..22863dcbcf 100644 --- a/benchmarks/microbenchmarks/test/test_benchmark_inference.py +++ b/benchmarks/microbenchmarks/test/test_benchmark_inference.py @@ -49,16 +49,16 @@ def test_run_inference(self, mock_string_to_config): self.assertTrue(hasattr(result, "model_inference_time_in_ms")) @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") - def test_run_inference_with_sparsity(self, mock_string_to_config): + def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config): """Test running inference with sparsity configurations""" # Mock string_to_config to return valid configs + from torchao.dtypes import MarlinSparseLayout from torchao.quantization import Int4WeightOnlyConfig - from torchao.sparsity.sparse_api import ( - BlockSparseWeightConfig, - ) # Test with semi-sparse config - mock_string_to_config.return_value = Int4WeightOnlyConfig() + mock_string_to_config.return_value = Int4WeightOnlyConfig( + layout=MarlinSparseLayout() + ) config = BenchmarkConfig( 
quantization="marlin", sparsity="semi-sparse", @@ -77,6 +77,14 @@ def test_run_inference_with_sparsity(self, mock_string_to_config): self.assertIsInstance(result, BenchmarkResult) self.assertTrue(hasattr(result, "model_inference_time_in_ms")) + @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config") + def test_run_inference_with_block_sparsity(self, mock_string_to_config): + """Test running inference with sparsity configurations""" + # Mock string_to_config to return valid configs + from torchao.sparsity.sparse_api import ( + BlockSparseWeightConfig, + ) + # Test with block sparsity mock_string_to_config.return_value = BlockSparseWeightConfig() config = BenchmarkConfig( From d71baa35e23e929e04f2e48ae5cc987199fb18d7 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:28:18 -0700 Subject: [PATCH 6/8] Add profile support ghstack-source-id: df824669905a8f0fbc1de6744d7a4bc7aa1747d5 ghstack-comment-id: 2752688926 Pull Request resolved: https://github.com/pytorch/ao/pull/1960 --- .../microbenchmarks/benchmark_inference.py | 6 +- .../microbenchmarks/test/benchmark_config.yml | 3 +- benchmarks/microbenchmarks/utils.py | 64 +++++++++++++++++++ 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index c084d18d3a..9dec543c11 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -20,6 +20,7 @@ BenchmarkResult, clean_caches, create_model_and_input, + generate_model_profile, model_inference_time_in_ms, string_to_config, ) @@ -84,10 +85,9 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: model=m_copy, input_data=input_data ) - # TODO: Benchmark time using profiler # Profile dtype model evaluation - # prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype) - # prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details + prof = generate_model_profile(m_copy, input_data) + prof.export_chrome_trace(f"{config.profile_path}.json") # Save profiling details # TODO: Benchmark gemm time using cuda graph # gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs) diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 97a38469de..13f67bbaeb 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -8,7 +8,7 @@ sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison - "semi-sparse" - "block" -output_dir: "benchmarks/microbenchmarks/results" +output_dir: "benchmarks/microbenchmarks/results/" model_params: - name: "small_bf16_linear" matrix_shapes: @@ -21,6 +21,7 @@ model_params: torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" + profile: True - name: "large_bf16_ln_linear" matrix_shapes: diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index fd3db11591..111f9072f3 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -27,6 +27,12 @@ UIntXWeightOnlyConfig, ) from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig +from torch.profiler import profile, record_function, ProfilerActivity +import os +import subprocess +import sys +import uuid + try: import triton # 
noqa: F401 @@ -84,6 +90,8 @@ def __init__( "name", f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}", ) + self.profile = params.get("profile", False) + self.profile_file_name = os.path.join(self.output_dir, f"/profile/{self.name}_{self.m}_{self.k}_{self.n}_profile.json") @staticmethod def _parse_precision(precision_str: str) -> torch.dtype: @@ -105,6 +113,7 @@ def to_dict(self) -> Dict[str, Any]: "device": self.device, "model_type": self.model_type, "output_dir": self.output_dir, + "profile": self.profile, } @@ -319,6 +328,61 @@ def model_inference_time_in_ms(model, input_data): return res * 1e6 +def upload_trace_file(local_path: str, overwrite: bool = False) -> Optional[str]: + file_name = os.path.basename(local_path) + manifold_path = os.path.join( + "perfetto_internal_traces/tree/shared_trace", f"{os.getlogin()}_{str(uuid.uuid4())}_{file_name}" + ) + cmd = [ + "manifold", + "put", + local_path, + manifold_path, + "--ttl", + str(28 * 24 * 60 * 60), + "--userData", + "false", + ] + ret = subprocess.run( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True + ) + if ret.returncode == 0: + print("Upload trace successfully.") + return manifold_path + else: + print("[ERROR] Upload failed, maybe the trace file exists.") + return None + + +def print_perfetto_ui_url(manifold_path: str) -> None: + url = ( + "https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html" + + "#!/?url=https://interncache-all.fbcdn.net/manifold/" + + manifold_path + ) + print(f"The trace is accessible at:\n{url}") + + +def generate_model_profile(model, input_data, profile_file): + # Function to benchmark model evaluation with profiling + torch.profiler._utils._init_for_cuda_graphs() + prof = torch.profiler.profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) + with prof: + # with record_function("model_inference"): + for _ in range(1): # Run the model multiple times to warm up the cache + with torch.no_grad(): + _ = model(*input_data) + torch.cuda.synchronize() + prof.export_chrome_trace(profile_file) # Save profiling details + + manifold_path = upload_trace_file(profile_file) + if manifold_path: + print_perfetto_ui_url(manifold_path) + + # Return the profiler output + return prof + + def create_model_and_input( model_type: str, m: int, From f2d24cd69a13b951978109f287ca660176f4c140 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Mar 2025 15:31:03 -0700 Subject: [PATCH 7/8] Revert "Add profile support" This reverts commit d71baa35e23e929e04f2e48ae5cc987199fb18d7. 
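For context, the profiling helper that this revert removes follows the standard torch.profiler chrome-trace workflow; the Manifold upload and Perfetto URL printing were internal-only additions on top of it. A minimal sketch of that pattern, assuming a CUDA device and a (model, input_data) pair such as the one returned by create_model_and_input; the function name profile_model and the trace_path argument are illustrative names, not part of this patch set:

    import torch
    from torch.profiler import ProfilerActivity

    def profile_model(model, input_data, trace_path):
        # Capture CPU and CUDA activity for a single no-grad forward pass,
        # then export a Chrome trace viewable in chrome://tracing or Perfetto.
        with torch.profiler.profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
        ) as prof:
            with torch.no_grad():
                _ = model(*input_data)
            torch.cuda.synchronize()
        prof.export_chrome_trace(trace_path)
        return prof
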
--- .../microbenchmarks/benchmark_inference.py | 6 +- .../microbenchmarks/test/benchmark_config.yml | 3 +- benchmarks/microbenchmarks/utils.py | 64 ------------------- 3 files changed, 4 insertions(+), 69 deletions(-) diff --git a/benchmarks/microbenchmarks/benchmark_inference.py b/benchmarks/microbenchmarks/benchmark_inference.py index 9dec543c11..c084d18d3a 100644 --- a/benchmarks/microbenchmarks/benchmark_inference.py +++ b/benchmarks/microbenchmarks/benchmark_inference.py @@ -20,7 +20,6 @@ BenchmarkResult, clean_caches, create_model_and_input, - generate_model_profile, model_inference_time_in_ms, string_to_config, ) @@ -85,9 +84,10 @@ def run(config: BenchmarkConfig) -> BenchmarkResult: model=m_copy, input_data=input_data ) + # TODO: Benchmark time using profiler # Profile dtype model evaluation - prof = generate_model_profile(m_copy, input_data) - prof.export_chrome_trace(f"{config.profile_path}.json") # Save profiling details + # prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype) + # prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details # TODO: Benchmark gemm time using cuda graph # gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs) diff --git a/benchmarks/microbenchmarks/test/benchmark_config.yml b/benchmarks/microbenchmarks/test/benchmark_config.yml index 13f67bbaeb..97a38469de 100644 --- a/benchmarks/microbenchmarks/test/benchmark_config.yml +++ b/benchmarks/microbenchmarks/test/benchmark_config.yml @@ -8,7 +8,7 @@ sparsity_config_recipe_names: # Will run a baseline inference for model by default, without sparsity for comparison - "semi-sparse" - "block" -output_dir: "benchmarks/microbenchmarks/results/" +output_dir: "benchmarks/microbenchmarks/results" model_params: - name: "small_bf16_linear" matrix_shapes: @@ -21,7 +21,6 @@ model_params: torch_compile_mode: "max-autotune" device: "cuda" model_type: "linear" - profile: True - name: "large_bf16_ln_linear" matrix_shapes: diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index 111f9072f3..fd3db11591 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -27,12 +27,6 @@ UIntXWeightOnlyConfig, ) from torchao.sparsity.sparse_api import BlockSparseWeightConfig, SemiSparseWeightConfig -from torch.profiler import profile, record_function, ProfilerActivity -import os -import subprocess -import sys -import uuid - try: import triton # noqa: F401 @@ -90,8 +84,6 @@ def __init__( "name", f"benchmark_{self.quantization}_{self.model_type}_m{self.m}_k{self.k}_n{self.n}{'_compile' if self.use_torch_compile else ''}", ) - self.profile = params.get("profile", False) - self.profile_file_name = os.path.join(self.output_dir, f"/profile/{self.name}_{self.m}_{self.k}_{self.n}_profile.json") @staticmethod def _parse_precision(precision_str: str) -> torch.dtype: @@ -113,7 +105,6 @@ def to_dict(self) -> Dict[str, Any]: "device": self.device, "model_type": self.model_type, "output_dir": self.output_dir, - "profile": self.profile, } @@ -328,61 +319,6 @@ def model_inference_time_in_ms(model, input_data): return res * 1e6 -def upload_trace_file(local_path: str, overwrite: bool = False) -> Optional[str]: - file_name = os.path.basename(local_path) - manifold_path = os.path.join( - "perfetto_internal_traces/tree/shared_trace", f"{os.getlogin()}_{str(uuid.uuid4())}_{file_name}" - ) - cmd = [ - "manifold", - "put", - local_path, - manifold_path, - "--ttl", - 
str(28 * 24 * 60 * 60), - "--userData", - "false", - ] - ret = subprocess.run( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True - ) - if ret.returncode == 0: - print("Upload trace successfully.") - return manifold_path - else: - print("[ERROR] Upload failed, maybe the trace file exists.") - return None - - -def print_perfetto_ui_url(manifold_path: str) -> None: - url = ( - "https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html" - + "#!/?url=https://interncache-all.fbcdn.net/manifold/" - + manifold_path - ) - print(f"The trace is accessible at:\n{url}") - - -def generate_model_profile(model, input_data, profile_file): - # Function to benchmark model evaluation with profiling - torch.profiler._utils._init_for_cuda_graphs() - prof = torch.profiler.profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) - with prof: - # with record_function("model_inference"): - for _ in range(1): # Run the model multiple times to warm up the cache - with torch.no_grad(): - _ = model(*input_data) - torch.cuda.synchronize() - prof.export_chrome_trace(profile_file) # Save profiling details - - manifold_path = upload_trace_file(profile_file) - if manifold_path: - print_perfetto_ui_url(manifold_path) - - # Return the profiler output - return prof - - def create_model_and_input( model_type: str, m: int, From 285038951e379f67109ff577e42f0cf9f63585ae Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 27 Mar 2025 11:02:06 -0700 Subject: [PATCH 8/8] nit updates --- benchmarks/microbenchmarks/test/test_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/microbenchmarks/test/test_utils.py b/benchmarks/microbenchmarks/test/test_utils.py index 83e88c5b11..14f226bd7e 100644 --- a/benchmarks/microbenchmarks/test/test_utils.py +++ b/benchmarks/microbenchmarks/test/test_utils.py @@ -43,7 +43,7 @@ def setUp(self): def test_benchmark_config(self): config = BenchmarkConfig( quantization="baseline", - sparsity="None", + sparsity=None, params=self.test_params, shape_name="custom", shape=self.test_shape, @@ -65,7 +65,7 @@ def test_benchmark_config(self): def test_benchmark_result(self): config = BenchmarkConfig( quantization="baseline", - sparsity="None", + sparsity=None, params=self.test_params, shape_name="custom", shape=self.test_shape, @@ -88,15 +88,15 @@ def test_get_default_device(self): def test_string_to_config(self): # Test baseline - config = string_to_config("baseline", "None") + config = string_to_config("baseline", None) self.assertIsNone(config) # Test int8wo - config = string_to_config("int8wo", "None") + config = string_to_config("int8wo", None) self.assertIsNotNone(config) # Test invalid config - config = string_to_config("not_a_real_config", "None") + config = string_to_config("not_a_real_config", None) self.assertIsNone(config) def test_string_to_config_sparsity(self): @@ -191,7 +191,7 @@ def test_generate_results_csv(self): BenchmarkResult( BenchmarkConfig( quantization="int8wo", - sparsity="None", + sparsity=None, params={}, shape_name="custom", shape=[1024, 1024, 1024], @@ -202,7 +202,7 @@ def test_generate_results_csv(self): BenchmarkResult( BenchmarkConfig( quantization="int4wo", - sparsity="None", + sparsity=None, params={}, shape_name="custom", shape=[1024, 1024, 1024],
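
The nit updates above standardize on passing the Python value None (rather than the string "None") when no sparsity recipe is requested, matching how string_to_config and the inference runner treat an absent recipe. A minimal sketch of the resulting dispatch, mirroring benchmark_inference.run() from the earlier patches; base_model and config stand in for the objects built by create_model_and_input and BenchmarkConfig, and the import path follows the file layout shown in the diffs:

    from copy import deepcopy

    from benchmarks.microbenchmarks.utils import string_to_config
    from torchao.quantization import quantize_
    from torchao.sparsity.sparse_api import sparsify_

    m_copy = deepcopy(base_model).eval().to(config.device)
    ao_base_config = string_to_config(
        config.quantization,
        config.sparsity,
        high_precision_dtype=config.high_precision_dtype,
    )
    if config.quantization is not None and ao_base_config is not None:
        quantize_(m_copy, ao_base_config)   # e.g. "int8wo", "int4wo-32", "marlin"
    elif config.sparsity is not None and ao_base_config is not None:
        sparsify_(m_copy, ao_base_config)   # e.g. "semi-sparse", "block"
    # With both fields None, string_to_config returns None and the
    # high-precision baseline model is benchmarked unchanged.

With None as the sentinel, a missing recipe and an unrecognized recipe both fall through to the baseline path, which is what the test_string_to_config cases above assert.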