
Add sparsity to benchmarking #1917


Merged
merged 9 commits on Mar 27, 2025
31 changes: 27 additions & 4 deletions benchmarks/microbenchmarks/benchmark_inference.py
@@ -24,6 +24,7 @@
     string_to_config,
 )
 from torchao.quantization import quantize_
+from torchao.sparsity.sparse_api import sparsify_
 
 
 def run(config: BenchmarkConfig) -> BenchmarkResult:
@@ -44,11 +45,33 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
     # Use quantize_ to apply each quantization function to the model
     m_copy = deepcopy(base_model).eval().to(config.device)
-    quantization_config = string_to_config(
-        config.quantization, high_precision_dtype=config.high_precision_dtype
+    ao_base_config = string_to_config(
+        config.quantization,
+        config.sparsity,
+        high_precision_dtype=config.high_precision_dtype,
     )
-    if quantization_config is not None:
-        quantize_(m_copy, quantization_config)
+
+    # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA)
+    is_cuda = config.device == "cuda" and torch.cuda.is_available()
+
+    if config.sparsity is not None and (
+        config.quantization is None or "baseline" in config.quantization
+    ):
+        if is_cuda:
+            print(f"Applying {config.sparsity} sparsity to model")
+            sparsify_(m_copy, ao_base_config)
+        else:
+            print(
+                f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}"
+            )
+    elif config.sparsity is None and (
+        config.quantization is None or "baseline" in config.quantization
+    ):
+        pass  # No quantization or sparsity specified, do nothing
+    else:
+        print("Quantizing model....")
+        quantize_(m_copy, ao_base_config)
 
     if config.use_torch_compile:
         print("Compiling model....")
         m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True)
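For readers skimming the diff above: when only a sparsity recipe is selected, the benchmark now routes through sparsify_ instead of quantize_. A minimal standalone sketch of that path, assuming a CUDA device and the SemiSparseWeightConfig used in the tests further down (the model shape and dtype here are illustrative only):

```python
import torch

from torchao.sparsity.sparse_api import SemiSparseWeightConfig, sparsify_

# Semi-structured (2:4) sparsity kernels require CUDA, which is why the diff guards on is_cuda.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).half().cuda().eval()
sparsify_(model, SemiSparseWeightConfig())  # swaps dense weights for 2:4 semi-structured tensors
x = torch.randn(16, 1024, dtype=torch.float16, device="cuda")
with torch.no_grad():
    y = model(x)
```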
71 changes: 63 additions & 8 deletions benchmarks/microbenchmarks/benchmark_runner.py
@@ -21,7 +21,7 @@
 
 import argparse
 from itertools import product
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 import yaml
 
@@ -68,6 +68,53 @@ def get_param_combinations(model_param):
return shapes, base_params


def get_quantization_sparsity_recipes(
quantization_recipes: List[str], sparsity_recipes: List[str]
) -> Set[Tuple[str, Optional[str]]]:
"""Generate valid quantization and sparsity recipes.

Args:
quantization_recipes: List of quantization recipes
sparsity_recipes: List of sparsity recipes

Returns:
Set of tuples containing (quantization_recipe, sparsity_recipe)
For block sparsity, quantization is always "baseline"
All quantization techniques are also run without sparsity
"""
config_recipes = set()

# Always include baseline without sparsity
config_recipes.add(("baseline", None))

# Add all quantization techniques without sparsity
for quant_config in quantization_recipes:
config_recipes.add((quant_config, None))

# Process combinations of quantization and sparsity
for sparse_config in sparsity_recipes:
if sparse_config is None:
# Skip None sparsity as we've already added all quantization techniques without sparsity
continue
elif "block" in sparse_config:
# For block sparsity, only pair with baseline quantization
config_recipes.add(("baseline", sparse_config))
elif "semi" in sparse_config or "2:4" in sparse_config:
# For semi-sparse, only pair with compatible quantization methods
for quant_config in quantization_recipes:
if (
"marlin" in quant_config
or "int8dq" in quant_config
or "float8dq" in quant_config
or quant_config == "baseline"
):
config_recipes.add((quant_config, sparse_config))
else:
raise ValueError(f"Invalid sparsity recipe: {sparse_config}")

return config_recipes


def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig]:
"""Load benchmark configurations from CLI arguments and YAML file."""
with open(cli_args.config, "r") as f:
@@ -78,24 +125,29 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig]:
 
     # Create all possible combinations
     configs = []
+    quantization_sparsity_recipes = get_quantization_sparsity_recipes(
+        config.get("quantization_config_recipe_names", []),
+        config.get("sparsity_config_recipe_names", []),
+    )
     for model_param in config["model_params"]:
         shapes, params = get_param_combinations(model_param)
 
         # Create configs for all combinations
-        for quant_config, (shape_name, shape) in product(
-            config.get("quantization_config_recipe_names", ["baseline"]), shapes
+        for (quant_config, sparse_config), (shape_name, shape) in product(
+            quantization_sparsity_recipes,
+            shapes,
         ):
             configs.append(
                 BenchmarkConfig(
                     quantization=quant_config,
+                    sparsity=sparse_config,
                     params=params,
                     shape_name=shape_name,
                     shape=shape,
                     output_dir=output_dir,
                     benchmark_mode=benchmark_mode,
                 )
             )
 
     return configs


@@ -104,14 +156,17 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None:
     from benchmarks.microbenchmarks.benchmark_inference import run as run_inference
 
     results = []
-    print("Benchmarking Inference ......")
+    print("----------------- RUNNING BENCHMARKS FOR INFERENCE -----------------------")
     for config in configs:
+        print("----------------------------------------")
         try:
-            print(f"Running: {config.name}")
+            print(
+                f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}"
+            )
             result = run_inference(config)  # Pass the config object directly
             results.append(result)
-        except Exception as e:
-            print(f"Error running benchmark {config.name}: {e}")
+        except Exception:
+            print(f"Error running benchmark {config.name}")
             continue
 
     # Add results to csv
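To make the pairing rules in get_quantization_sparsity_recipes above concrete, here is a small worked example; the recipe names are borrowed from the tests in this PR, and the expected set follows directly from the function's branches:

```python
from benchmarks.microbenchmarks.benchmark_runner import get_quantization_sparsity_recipes

recipes = get_quantization_sparsity_recipes(
    ["baseline", "marlin", "int8wo"],
    [None, "semi-sparse", "block"],
)
# Expected contents (a set, so unordered):
#   ("baseline", None), ("marlin", None), ("int8wo", None)   # every quantization recipe without sparsity
#   ("baseline", "semi-sparse"), ("marlin", "semi-sparse")    # semi-sparse only with compatible quantization
#   ("baseline", "block")                                     # block sparsity only with baseline
```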
8 changes: 6 additions & 2 deletions benchmarks/microbenchmarks/test/benchmark_config.yml
@@ -1,9 +1,13 @@
 # Sample configuration for inference benchmarks
 benchmark_mode: "inference"
 quantization_config_recipe_names:
-  - "baseline"
+  # Will run a baseline inference for model by default, without quantization for comparison
   - "int4wo-32"
   - "int4wo-128"
+  - "marlin"
+sparsity_config_recipe_names:
+  # Will run a baseline inference for model by default, without sparsity for comparison
+  - "semi-sparse"
+  - "block"
 output_dir: "benchmarks/microbenchmarks/results"
 model_params:
   - name: "small_bf16_linear"
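One way to exercise this sample config end to end is to drive the runner programmatically, mirroring how the unit tests below construct an argparse.Namespace; the sketch assumes the file path above and that results land in a CSV under output_dir, as the runner code suggests:

```python
import argparse

from benchmarks.microbenchmarks.benchmark_runner import (
    load_benchmark_configs,
    run_inference_benchmarks_from_config,
)

# Build the same Namespace the CLI would hand over for its config argument.
args = argparse.Namespace(config="benchmarks/microbenchmarks/test/benchmark_config.yml")
configs = load_benchmark_configs(args)  # expands quantization x sparsity x shape combinations
run_inference_benchmarks_from_config(configs)  # runs each config and appends results to a CSV
```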
67 changes: 66 additions & 1 deletion benchmarks/microbenchmarks/test/test_benchmark_inference.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 import tempfile
 import unittest
+from unittest.mock import patch
 
 from benchmarks.microbenchmarks.benchmark_inference import run
 from benchmarks.microbenchmarks.utils import BenchmarkConfig, BenchmarkResult
@@ -17,6 +18,7 @@ def setUp(self):
 
         self.config = BenchmarkConfig(
             quantization="baseline",
+            sparsity="semi-sparse",
             params={
                 "high_precision_dtype": "torch.float32",
                 "use_torch_compile": False,
@@ -35,11 +37,74 @@ def tearDown(self):
 
         shutil.rmtree(self.temp_dir)
 
-    def test_run_inference(self):
+    @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
+    def test_run_inference(self, mock_string_to_config):
+        # Mock string_to_config to return a valid config
+        from torchao.sparsity.sparse_api import SemiSparseWeightConfig
+
+        mock_string_to_config.return_value = SemiSparseWeightConfig()
+
         result = run(self.config)
         self.assertIsInstance(result, BenchmarkResult)
         self.assertTrue(hasattr(result, "model_inference_time_in_ms"))

@patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):
"""Test running inference with sparsity configurations"""
# Mock string_to_config to return valid configs
from torchao.dtypes import MarlinSparseLayout
from torchao.quantization import Int4WeightOnlyConfig

# Test with semi-sparse config
mock_string_to_config.return_value = Int4WeightOnlyConfig(
layout=MarlinSparseLayout()
)
config = BenchmarkConfig(
quantization="marlin",
sparsity="semi-sparse",
params={
"high_precision_dtype": "torch.float32",
"use_torch_compile": False,
"device": "cpu",
"model_type": "linear",
},
shape_name="custom",
shape=[64, 64, 64], # Use dimensions divisible by 64
output_dir=self.temp_dir,
benchmark_mode="inference",
)
result = run(config)
self.assertIsInstance(result, BenchmarkResult)
self.assertTrue(hasattr(result, "model_inference_time_in_ms"))

@patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
def test_run_inference_with_block_sparsity(self, mock_string_to_config):
"""Test running inference with sparsity configurations"""
# Mock string_to_config to return valid configs
from torchao.sparsity.sparse_api import (
BlockSparseWeightConfig,
)

# Test with block sparsity
(Inline review thread on this test)
Contributor: Oh, I see - can we split this into two tests then, one for int4+2:4 marlin, and one for block sparsity?
Contributor Author: Let me try that
Contributor Author: Done

mock_string_to_config.return_value = BlockSparseWeightConfig()
config = BenchmarkConfig(
quantization="baseline",
sparsity="block",
params={
"high_precision_dtype": "torch.float32",
"use_torch_compile": False,
"device": "cpu",
"model_type": "linear",
},
shape_name="custom",
shape=[64, 64, 64], # Use dimensions divisible by 64
output_dir=self.temp_dir,
benchmark_mode="inference",
)
result = run(config)
self.assertIsInstance(result, BenchmarkResult)
self.assertTrue(hasattr(result, "model_inference_time_in_ms"))


if __name__ == "__main__":
unittest.main()
104 changes: 104 additions & 0 deletions benchmarks/microbenchmarks/test/test_benchmark_runner.py
@@ -13,6 +13,7 @@
 
 from benchmarks.microbenchmarks.benchmark_runner import (
     get_param_combinations,
+    get_quantization_sparsity_recipes,
     get_shapes_for_config,
     load_benchmark_configs,
     run_inference_benchmarks_from_config,
@@ -88,6 +89,109 @@ def test_run_inference_benchmarks_from_config(self):
results_file = Path(self.temp_dir) / "results.csv"
self.assertTrue(results_file.exists())

def test_get_quantization_sparsity_recipes(self):
"""Test generation of valid quantization and sparsity recipe combinations"""
# Test basic combinations
quant_recipes = ["baseline", "int8wo"]
sparse_recipes = [None, "semi-sparse"]
recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes)
self.assertIn(("baseline", None), recipes)
self.assertIn(("int8wo", None), recipes)
self.assertIn(("baseline", "semi-sparse"), recipes)

# Test marlin with semi-sparse
quant_recipes = ["marlin", "baseline"]
sparse_recipes = [None, "semi-sparse"]
recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes)
self.assertIn(("marlin", "semi-sparse"), recipes)
self.assertIn(("baseline", None), recipes)

# Test block sparsity
quant_recipes = ["baseline"]
sparse_recipes = [None, "block"]
recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes)
self.assertIn(("baseline", "block"), recipes)

def test_none_string_raises_error(self):
"""Test that passing 'None' as a string raises an error"""
quant_recipes = ["baseline"]
sparse_recipes = ["None"] # "None" as a string should raise an error
with self.assertRaises(ValueError):
get_quantization_sparsity_recipes(quant_recipes, sparse_recipes)

def test_block_sparsity_with_quantization(self):
"""Test that block sparsity is only paired with baseline quantization"""
quant_recipes = ["baseline", "int8wo", "int4wo", "marlin"]
sparse_recipes = ["block"]
recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes)

# Block sparsity should only be paired with baseline
self.assertIn(("baseline", "block"), recipes)
self.assertNotIn(("int8wo", "block"), recipes)
self.assertNotIn(("int4wo", "block"), recipes)
self.assertNotIn(("marlin", "block"), recipes)

# All quantization techniques should be run without sparsity
self.assertIn(("baseline", None), recipes)
self.assertIn(("int8wo", None), recipes)
self.assertIn(("int4wo", None), recipes)
self.assertIn(("marlin", None), recipes)

def test_all_quantization_without_sparsity(self):
"""Test that all quantization techniques are run without sparsity"""
quant_recipes = ["baseline", "int8wo", "int4wo", "marlin"]
sparse_recipes = [None, "semi-sparse", "block"]
recipes = get_quantization_sparsity_recipes(quant_recipes, sparse_recipes)

# All quantization techniques should be run without sparsity
for quant in quant_recipes:
self.assertIn((quant, None), recipes)

@patch(
"benchmarks.microbenchmarks.benchmark_runner.get_quantization_sparsity_recipes"
)
def test_load_benchmark_configs_with_sparsity(self, mock_get_recipes):
"""Test loading benchmark configs with sparsity options"""
# Mock get_quantization_sparsity_recipes to return a valid set of recipes
mock_get_recipes.return_value = {("baseline", None), ("marlin", "semi-sparse")}

test_config = {
"benchmark_mode": "inference",
"quantization_config_recipe_names": ["baseline", "marlin"],
"sparsity_config_recipe_names": [
None,
"semi-sparse",
], # Use None instead of "None"
"output_dir": self.temp_dir,
"model_params": [
{
"matrix_shapes": [
{"name": "custom", "shapes": [[1024, 1024, 1024]]}
],
"high_precision_dtype": "torch.bfloat16",
"device": "cpu",
"model_type": "linear",
}
],
}

config_path = Path(self.temp_dir) / "test_sparsity_config.yml"
with open(config_path, "w") as f:
yaml.dump(test_config, f)

configs = load_benchmark_configs(argparse.Namespace(config=str(config_path)))

# Check that we get configs for baseline and marlin with appropriate sparsity
self.assertTrue(
any(c.quantization == "baseline" and c.sparsity is None for c in configs)
)
self.assertTrue(
any(
c.quantization == "marlin" and c.sparsity == "semi-sparse"
for c in configs
)
)


if __name__ == "__main__":
unittest.main()