diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py
index 69d64b498..521f67dec 100644
--- a/examples/awq/llama_example.py
+++ b/examples/awq/llama_example.py
@@ -66,7 +66,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py
index 5fdc231c9..7786e1003 100644
--- a/examples/awq/qwen3_moe_example.py
+++ b/examples/awq/qwen3_moe_example.py
@@ -71,7 +71,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/big_models_with_sequential_onloading/README.md b/examples/big_models_with_sequential_onloading/README.md
index 802723975..42c4b64aa 100644
--- a/examples/big_models_with_sequential_onloading/README.md
+++ b/examples/big_models_with_sequential_onloading/README.md
@@ -37,7 +37,7 @@ During `oneshot`, only one gpu is required which will be used to onload each lay
 ```python
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 ```
diff --git a/examples/big_models_with_sequential_onloading/llama3.3_70b.py b/examples/big_models_with_sequential_onloading/llama3.3_70b.py
index 5cf90ca7e..9cc54fb84 100644
--- a/examples/big_models_with_sequential_onloading/llama3.3_70b.py
+++ b/examples/big_models_with_sequential_onloading/llama3.3_70b.py
@@ -76,7 +76,7 @@ def tokenize(sample):
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/compressed_inference/fp8_compressed_inference.py b/examples/compressed_inference/fp8_compressed_inference.py
index f0d0381d2..aea0a7eee 100644
--- a/examples/compressed_inference/fp8_compressed_inference.py
+++ b/examples/compressed_inference/fp8_compressed_inference.py
@@ -22,7 +22,7 @@
 compressed_model = AutoModelForCausalLM.from_pretrained(
     MODEL_STUB,
     torch_dtype="auto",
-    device_map="cuda:0",
+    device_map="auto",
 )
 
 # tokenize the sample data
diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py
index 5437ba36c..c1b88c6d3 100644
--- a/examples/multimodal_vision/gemma3_example.py
+++ b/examples/multimodal_vision/gemma3_example.py
@@ -68,7 +68,7 @@ def data_collator(batch):
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
 # Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100, disable_compile=True)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py
index 1225349c4..2fdaeb1a4 100644
--- a/examples/multimodal_vision/idefics3_example.py
+++ b/examples/multimodal_vision/idefics3_example.py
@@ -109,7 +109,7 @@ def tokenize(sample):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py
index 0a17d8c50..0673fed9b 100644
--- a/examples/multimodal_vision/llava_example.py
+++ b/examples/multimodal_vision/llava_example.py
@@ -64,7 +64,7 @@ def data_collator(batch):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py
index e70ee43ec..b7281dadd 100644
--- a/examples/multimodal_vision/mistral3_example.py
+++ b/examples/multimodal_vision/mistral3_example.py
@@ -77,7 +77,7 @@ def data_collator(batch):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)  # fix dtype
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py
index 6672aff2e..c54bb27a4 100644
--- a/examples/multimodal_vision/mllama_example.py
+++ b/examples/multimodal_vision/mllama_example.py
@@ -64,7 +64,7 @@ def data_collator(batch):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py
index fa4b0feab..c714dd7e6 100644
--- a/examples/multimodal_vision/phi3_vision_example.py
+++ b/examples/multimodal_vision/phi3_vision_example.py
@@ -93,7 +93,9 @@ def data_collator(batch):
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py
index a0ed50ef4..b86b90411 100644
--- a/examples/multimodal_vision/pixtral_example.py
+++ b/examples/multimodal_vision/pixtral_example.py
@@ -70,7 +70,7 @@ def data_collator(batch):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py
index 8cccf768e..fd5fbd013 100644
--- a/examples/multimodal_vision/qwen2_vl_example.py
+++ b/examples/multimodal_vision/qwen2_vl_example.py
@@ -121,7 +121,7 @@ def data_collator(batch):
     max_length=MAX_SEQUENCE_LENGTH,
     truncation=True,
     return_tensors="pt",
-).to("cuda")
+).to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py
index dcdbb937a..bbaba936e 100644
--- a/examples/multimodal_vision/qwen_2_5_vl_example.py
+++ b/examples/multimodal_vision/qwen_2_5_vl_example.py
@@ -115,7 +115,7 @@ def data_collator(batch):
     max_length=MAX_SEQUENCE_LENGTH,
     truncation=True,
     return_tensors="pt",
-).to("cuda")
+).to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/quantization_kv_cache/README.md b/examples/quantization_kv_cache/README.md
index 62da49c88..e186d180e 100644
--- a/examples/quantization_kv_cache/README.md
+++ b/examples/quantization_kv_cache/README.md
@@ -115,7 +115,7 @@ oneshot(
 Test the quantized model with a sample generation:
 
 ```python
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 ```
diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py
index b88106abe..dd5d1905b 100644
--- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py
@@ -91,7 +91,9 @@ def process_and_tokenize(example):
 print("\n\n")
 dispatch_for_generation(model)
 print("========== SAMPLE GENERATION ==============")
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100, disable_compile=True)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
index 339c353fa..51227b412 100644
--- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -88,7 +88,9 @@ def process_and_tokenize(example):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
index 0d16e1b22..02bbb8e2e 100644
--- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
@@ -88,7 +88,9 @@ def process_and_tokenize(example):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py b/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py
index c018022d9..5df0454a8 100644
--- a/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py
+++ b/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py
@@ -57,7 +57,9 @@
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_non_uniform/quantization_int4_int8.py b/examples/quantization_non_uniform/quantization_int4_int8.py
index 8f9b7de6a..4fd45042d 100644
--- a/examples/quantization_non_uniform/quantization_int4_int8.py
+++ b/examples/quantization_non_uniform/quantization_int4_int8.py
@@ -92,7 +92,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_non_uniform/quantization_nvfp4_fp8.py b/examples/quantization_non_uniform/quantization_nvfp4_fp8.py
index c5ba307ca..619f9ca78 100644
--- a/examples/quantization_non_uniform/quantization_nvfp4_fp8.py
+++ b/examples/quantization_non_uniform/quantization_nvfp4_fp8.py
@@ -109,7 +109,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py
index b729be003..b03aacee3 100644
--- a/examples/quantization_w4a16/llama3_example.py
+++ b/examples/quantization_w4a16/llama3_example.py
@@ -67,7 +67,7 @@ def tokenize(sample):
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py
index d35de8d30..2d6968e5f 100644
--- a/examples/quantization_w4a16_fp4/llama3_example.py
+++ b/examples/quantization_w4a16_fp4/llama3_example.py
@@ -21,7 +21,9 @@
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py
index 95d01657b..63068fde1 100644
--- a/examples/quantization_w4a4_fp4/llama3_example.py
+++ b/examples/quantization_w4a4_fp4/llama3_example.py
@@ -69,7 +69,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_w4a4_fp4/qwen_30b_a3b.py b/examples/quantization_w4a4_fp4/qwen_30b_a3b.py
index b7d51fab2..bca7ca44d 100644
--- a/examples/quantization_w4a4_fp4/qwen_30b_a3b.py
+++ b/examples/quantization_w4a4_fp4/qwen_30b_a3b.py
@@ -77,7 +77,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") diff --git a/examples/quantization_w8a8_fp8/fp8_block_example.py b/examples/quantization_w8a8_fp8/fp8_block_example.py index 5b47cbbe5..7e908427f 100644 --- a/examples/quantization_w8a8_fp8/fp8_block_example.py +++ b/examples/quantization_w8a8_fp8/fp8_block_example.py @@ -28,7 +28,9 @@ # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 9973b3ad2..f8ddf373c 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -32,7 +32,9 @@ # Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333 print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20, disable_compile=True) print(tokenizer.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index 6a1454cd0..9cd24387f 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -26,7 +26,9 @@ # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index 39c196752..7192b9852 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -24,7 +24,9 @@ # Confirm generations of the quantized model look sane. 
print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index a03188a61..9bce0ba42 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -26,7 +26,9 @@ # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index ebadbe973..5b542bf03 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -26,7 +26,9 @@ # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index b35fcc7ff..0da365af8 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -70,7 +70,9 @@ def tokenize(sample): # Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333 print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20, disable_compile=True) print(tokenizer.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index feab87455..1737f08b1 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -72,7 +72,9 @@ def tokenize(sample): print("\n\n") print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py 
index 5021c7947..b67442018 100644
--- a/examples/quantizing_moe/mixtral_example.py
+++ b/examples/quantizing_moe/mixtral_example.py
@@ -75,7 +75,7 @@ def tokenize(sample):
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================")
diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py
index 343442ded..0db0f6266 100644
--- a/examples/quantizing_moe/qwen_example.py
+++ b/examples/quantizing_moe/qwen_example.py
@@ -74,7 +74,7 @@ def tokenize(sample):
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================")
diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
index 590b74611..931b91f24 100644
--- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
+++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
@@ -103,7 +103,9 @@ def get_recipe(fp8_enabled):
 # Validate the compressed model
 print("\n========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n")
diff --git a/examples/transform/quip_example.py b/examples/transform/quip_example.py
index 65c8a48b3..2ce4e4079 100644
--- a/examples/transform/quip_example.py
+++ b/examples/transform/quip_example.py
@@ -32,7 +32,9 @@
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/transform/spinquant_example.py b/examples/transform/spinquant_example.py
index 547d06041..a3b22d181 100644
--- a/examples/transform/spinquant_example.py
+++ b/examples/transform/spinquant_example.py
@@ -29,7 +29,9 @@
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py
index 51e7c3a74..f50795543 100644
--- a/src/llmcompressor/pipelines/sequential/helpers.py
+++ b/src/llmcompressor/pipelines/sequential/helpers.py
@@ -517,8 +517,8 @@ def is_ancestor(module: Module) -> bool:
 def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel:
     """
     Dispatch a model for sequential calibration using a sequential pipeline.
-    The model will be offloaded to the CPU and dispatched to CUDA device if available.
-    Removes any existing hooks.
+    The model will be offloaded to the CPU and dispatched to CUDA/XPU device
+    if available. Removes any existing hooks.
 
     :param model: model to dispatch
     :return: dispatched model
@@ -527,8 +527,10 @@ def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel:
 
     if torch.cuda.is_available():
         offloaded_dispatch(model, execution_device=torch.device("cuda:0"))
+    elif hasattr(torch, "xpu") and torch.xpu.is_available():
+        offloaded_dispatch(model, execution_device=torch.device("xpu:0"))
     else:
-        logger.warning("CUDA is not available! Compressing model on CPU instead")
+        logger.warning("CUDA/XPU is not available! Compressing model on CPU instead")
 
     return model
 
diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py
index 64e9b2906..e344705d7 100644
--- a/src/llmcompressor/transformers/finetune/session_mixin.py
+++ b/src/llmcompressor/transformers/finetune/session_mixin.py
@@ -170,7 +170,10 @@ def initialize_session(
             "pass a yaml file or string to the `recipe` argument."
         )
 
-        torch.cuda.empty_cache()
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()
+        else:
+            torch.cuda.empty_cache()
 
     def finalize_session(self):
         """
@@ -186,7 +189,10 @@ def finalize_session(self):
         logger.info("Finalized LLM Compressor session")
         model = get_session_model()
         self.model = model
-        torch.cuda.empty_cache()
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()
+        else:
+            torch.cuda.empty_cache()
 
     def create_optimizer(self):
         """
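Editor's note: the cache-clearing branch added in `session_mixin.py` appears verbatim in both `initialize_session` and `finalize_session`. Below is a minimal sketch of how that logic could be factored into one place, using only the public `torch.cuda.empty_cache()` / `torch.xpu.empty_cache()` APIs; the helper name is hypothetical and is not part of this patch.

```python
import torch


def empty_accelerator_cache() -> None:
    """Clear the cache of whichever accelerator backend is present.

    Hypothetical helper mirroring the branching added in session_mixin.py;
    torch.cuda.empty_cache() is effectively a no-op when CUDA was never
    initialized, so this is safe on CPU-only machines as well.
    """
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()
    else:
        torch.cuda.empty_cache()
```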