diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 53ba84ea8e7..b3b39bbf621 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -11,6 +11,7 @@
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
 
 
@@ -384,12 +385,11 @@ def _apply_bnb_4bit_fake(
 
 try:
-    direct_register_custom_op(
-        op_name="apply_bnb_4bit",
-        op_func=_apply_bnb_4bit,
-        mutates_args=["out"],
-        fake_impl=_apply_bnb_4bit_fake,
-    )
+    direct_register_custom_op(op_name="apply_bnb_4bit",
+                              op_func=_apply_bnb_4bit,
+                              mutates_args=["out"],
+                              fake_impl=_apply_bnb_4bit_fake,
+                              dispatch_key=current_platform.dispatch_key)
     apply_bnb_4bit = torch.ops.vllm.apply_bnb_4bit
 
 except AttributeError as error:
     raise error
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 0c46d170e88..5631e0a4689 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -371,10 +371,12 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors,
                                                   ...]
 
             # bitsandbytes requires data in GPU
-            if weight_sub_tensor.is_cuda:
+            if (weight_sub_tensor.is_cuda
+                    or weight_sub_tensor.device.type == "hpu"):
                 loaded_weight = weight_sub_tensor
             else:
-                loaded_weight = weight_sub_tensor.cuda()
+                loaded_weight = weight_sub_tensor.to(
+                    device=current_platform.device_type)
 
             # remove the following after the issue is fixed:
             # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1342
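
In the first hunk, `direct_register_custom_op` was registering `apply_bnb_4bit` with its default dispatch key (`"CUDA"` in `vllm.utils`), so the custom op was never reachable on non-CUDA backends such as Gaudi; passing `dispatch_key=current_platform.dispatch_key` registers it for whichever backend is active. Below is a minimal, self-contained sketch of the `torch.library` mechanism that helper wraps. The `demo` namespace and the `double_into` op are hypothetical, not part of vLLM, and PyTorch >= 2.4 is assumed for `torch.library.register_fake`:

```python
import torch
from torch.library import Library

# Hypothetical "demo" namespace; not part of vLLM.
demo_lib = Library("demo", "FRAGMENT")
demo_lib.define("double_into(Tensor x, Tensor(a!) out) -> ()")

def _double_into(x: torch.Tensor, out: torch.Tensor) -> None:
    # Real implementation; runs for tensors on the registered backend.
    out.copy_(x * 2)

def _double_into_fake(x: torch.Tensor, out: torch.Tensor) -> None:
    # Fake (meta) implementation: no computation, used by torch.compile
    # tracing. `out` already has the right shape, so nothing to do.
    pass

# The dispatch key decides which backend's tensors reach this impl.
# Hardcoding "CUDA" here is exactly the bug the PR fixes for HPU.
dispatch_key = "CUDA" if torch.cuda.is_available() else "CPU"
demo_lib.impl("double_into", _double_into, dispatch_key)
torch.library.register_fake("demo::double_into", _double_into_fake)

x = torch.ones(4, device=dispatch_key.lower())
out = torch.empty_like(x)
torch.ops.demo.double_into(x, out)  # dispatches to _double_into
assert torch.equal(out, torch.full((4,), 2.0, device=x.device))
```

Note that registering under the wrong key does not fail at registration time; the problem only surfaces at call time as a missing-backend dispatch error, which is why the hardcoded key went unnoticed until the op was exercised on HPU.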
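
In the second hunk, the loader no longer hardcodes `tensor.cuda()`: weights already resident on an accelerator (CUDA or HPU) are used as-is, and CPU-resident weights are moved to `current_platform.device_type` (a string such as `"cuda"` or `"hpu"`). A standalone sketch of the same pattern, with a hypothetical `to_platform_device` helper standing in for the loader code:

```python
import torch

def to_platform_device(t: torch.Tensor, device_type: str) -> torch.Tensor:
    """Hypothetical helper mirroring the loader change: keep accelerator
    tensors where they are, otherwise copy to the platform's device type."""
    if t.is_cuda or t.device.type == "hpu":
        return t
    # device_type is a string like "cuda" or "hpu"; torch resolves it to
    # the default index for that backend (e.g. "cuda" -> "cuda:0").
    return t.to(device=device_type)

# Usage: a CPU-loaded checkpoint shard moves to whichever accelerator the
# platform reports; an already-on-device tensor is returned unchanged.
w = torch.randn(8, 8)  # e.g. loaded on CPU from a safetensors file
w_dev = to_platform_device(w, "cuda" if torch.cuda.is_available() else "cpu")
```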