
Commit fbb2cae

Update based on comments
1 parent 104d1f3 commit fbb2cae

File tree

5 files changed: +14 -17 lines changed
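
Every file in this commit switches an ad-hoc device comparison over to the is_device helper from torchao.dtypes.utils. A minimal sketch of what that helper presumably looks like (the real implementation in torchao/dtypes/utils.py may differ in detail):

from typing import Union

import torch

def is_device(target_device_str: str, device: Union[str, torch.device]) -> bool:
    # Compare by device *type*, so "cuda", "cuda:0", and torch.device("cuda")
    # all match the target regardless of device index.
    return torch.device(device).type == torch.device(target_device_str).type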

test/integration/test_integration.py

Lines changed: 2 additions & 2 deletions
@@ -91,6 +91,7 @@
     is_fbcode,
     benchmark_model
 )
+from torchao.dtypes.utils import is_device
 
 logger = logging.getLogger("INFO")
 
@@ -132,8 +133,7 @@ def _int8da_int8w_api(mod):
 
 def _int4wo_api(mod):
     if TORCH_VERSION_AT_LEAST_2_4:
-        device_type = next(mod.parameters()).device
-        if device_type == torch.device("cpu"):
+        if is_device(next(mod.parameters()).device.type, "cpu"):
             quantize_(mod, int4_weight_only(layout=Int4CPULayout()), set_inductor_config=False)
         else:
             quantize_(mod, int4_weight_only(), set_inductor_config=False)
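
The replaced check compared a full torch.device object against torch.device("cpu"). That style of comparison is index-sensitive, which is the pitfall the helper avoids; a quick illustration (not part of the diff):

import torch

print(torch.device("cuda:0") == torch.device("cuda"))  # False: the index makes them unequal
print(torch.device("cuda:0").type == "cuda")           # True: comparing by type ignores the index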

test/quantization/test_quant_primitives.py

Lines changed: 2 additions & 1 deletion
@@ -33,6 +33,7 @@
     TORCH_VERSION_AT_LEAST_2_6,
     is_fbcode,
 )
+from torchao.dtypes.utils import is_device
 
 _SEED = 1234
 torch.manual_seed(_SEED)
@@ -102,7 +103,7 @@ def _groupwise_affine_quantize_tensor_from_qparams(
         .reshape_as(w)
     )
     if TORCH_VERSION_AT_LEAST_2_5:
-        if w.device.type != "cpu":
+        if not is_device(w.device.type, "cpu"):
            w_int4x8 = (w_int4x8[::, ::2] << 4 | w_int4x8[::, 1::2]).to(torch.uint8)
 
     return w_int4x8
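
The guarded line packs pairs of int4 values into single uint8 bytes, with even columns in the high nibble. A small worked example of that expression (the tensor here is illustrative only):

import torch

w = torch.tensor([[1, 2, 3, 4]], dtype=torch.uint8)  # four 4-bit values in one row
packed = (w[::, ::2] << 4 | w[::, 1::2]).to(torch.uint8)
print(packed)        # tensor([[18, 52]], dtype=torch.uint8), i.e. 0x12 and 0x34
print(packed.shape)  # torch.Size([1, 2]): the last dimension halves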

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 5 additions & 5 deletions
@@ -632,8 +632,7 @@ def extra_repr(self):
 
 @dataclass(frozen=True)
 class Int4CPULayout(Layout):
-    def pre_process(self, input: torch.Tensor) -> torch.Tensor:
-        return input
+    pass
 
 @dataclass(frozen=True)
 class Float8Layout(Layout):
@@ -1714,6 +1713,10 @@ def from_plain(
         return cls(packed_weight, scale_and_zero, False, _layout)
 
     def to(self, *args, **kwargs):
         kwargs = self._get_to_kwargs(*args, **kwargs)
         device = kwargs["device"]
+        if not is_device(torch.device(self.device).type, device):
+            raise ValueError(
+                f"Int4CPUAQTTensorImpl does not support conversion from {self.device} to {device}"
+            )
         return self.__class__(
@@ -1724,9 +1727,6 @@ def to(self, *args, **kwargs):
         )
 
     def _apply_fn_to_data(self, fn):
-        # self.packed_weight = fn(self.packed_weight)
-        # self.scale_and_zero = fn(self.scale_and_zero)
-        # return self
         return self.__class__(
             fn(self.packed_weight),
             fn(self.scale_and_zero),
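
Note the guard must run after device = kwargs["device"] is assigned, otherwise it would reference the name before it exists; the hunk above reflects that ordering. The effect is that to() rejects any attempt to move an Int4CPUAQTTensorImpl off CPU. A standalone sketch of that behavior, reusing the is_device sketch from the top of this page (Int4CPUAQTTensorImpl itself is not constructed here):

import torch

def _check_device(current: torch.device, target) -> None:
    # Mirrors the guard added in to(): the CPU-only impl refuses cross-device moves.
    if not is_device(torch.device(current).type, target):
        raise ValueError(
            f"Int4CPUAQTTensorImpl does not support conversion from {current} to {target}"
        )

_check_device(torch.device("cpu"), "cpu")    # passes silently
# _check_device(torch.device("cpu"), "cuda") # raises ValueError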

torchao/quantization/subclass.py

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@
     unpack_tinygemm_scales_and_zeros,
 )
 from torchao.utils import find_multiple
+from torchao.dtypes.utils import is_device
 
 __all__ = [
     "Int8DynamicallyQuantizedLinearWeight",
@@ -458,7 +459,7 @@ def _quantized_op(act_mat, w_qtensor, bias):
         act_mat = torch.nn.functional.pad(act_mat, (0, pad_size - act_mat.shape[-1]))
 
         # matmul
-        if act_mat.device == torch.device("cpu"):
+        if is_device(act_mat.device.type, "cpu"):
             y = aten._weight_int4pack_mm_for_cpu(
                 act_mat.contiguous(),
                 w_qtensor.int_data,
@@ -617,7 +618,7 @@ def to_qtensor_components(cls, input_float, groupsize=128, inner_k_tiles=8):
         input_int4x8, scales_and_zeros = groupwise_affine_quantize_tensor(
             input_float, 4, groupsize, dtype=input_float.dtype
         )
-        if input_float.device == torch.device("cpu"):
+        if is_device(input_float.device.type, "cpu"):
            int_data = aten._convert_weight_to_int4pack_for_cpu(input_int4x8, inner_k_tiles)
         else:
            int_data = aten._convert_weight_to_int4pack(input_int4x8, inner_k_tiles)
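
Both hunks in this file follow the same dispatch pattern: when the tensor lives on CPU, use the *_for_cpu variant of the int4 op. A sketch of that pattern in isolation (op names are taken from the diff; the shape and packing requirements of the real ops are not modeled here):

import torch

aten = torch.ops.aten

def pack_int4_weight(w_int4x8: torch.Tensor, inner_k_tiles: int) -> torch.Tensor:
    # CPU gets its own packing kernel; everything else uses the generic one.
    if is_device(w_int4x8.device.type, "cpu"):
        return aten._convert_weight_to_int4pack_for_cpu(w_int4x8, inner_k_tiles)
    return aten._convert_weight_to_int4pack(w_int4x8, inner_k_tiles)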

torchao/quantization/utils.py

Lines changed: 2 additions & 7 deletions
@@ -18,6 +18,7 @@
     quantize_affine,
 )
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
+from torchao.dtypes.utils import is_device
 
 __all__ = [
     "compute_error",
@@ -400,14 +401,8 @@ def groupwise_affine_quantize_tensor_from_qparams(
         zero_point_domain=ZeroPointDomain.FLOAT,
     )
     if TORCH_VERSION_AT_LEAST_2_5 and w.shape[-1] > 1:
-        int_data_device_type = int_data.device.type
-        # Move to cpu, until issue with MPS memory management of temporary tensors is resolved
-        # if int_data_device_type == "mps":
-        #     int_data = int_data.cpu()
-        if int_data_device_type != "cpu":
+        if not is_device(int_data.device.type, "cpu"):
             int_data = (int_data[::, ::2] << 4 | int_data[::, 1::2]).to(torch.uint8)
-        # if int_data_device_type == "mps":
-        #     int_data = int_data.to(device="mps")
     return int_data
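
Beyond the is_device swap, this hunk drops the commented-out MPS workaround. Note the w.shape[-1] > 1 condition on the packing branch: with a single column there is no odd-column partner, so the bitwise-or would broadcast against an empty slice and silently produce an empty tensor, which is presumably why the guard exists (illustration, not from the diff):

import torch

w = torch.tensor([[7]], dtype=torch.uint8)
print(w[::, ::2].shape, w[::, 1::2].shape)    # torch.Size([1, 1]) torch.Size([1, 0])
print((w[::, ::2] << 4 | w[::, 1::2]).shape)  # torch.Size([1, 0]): the value would be lost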