diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py
index 69d64b498..521f67dec 100644
--- a/examples/awq/llama_example.py
+++ b/examples/awq/llama_example.py
@@ -66,7 +66,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/awq/qwen3_moe_example.py b/examples/awq/qwen3_moe_example.py
index 5fdc231c9..7786e1003 100644
--- a/examples/awq/qwen3_moe_example.py
+++ b/examples/awq/qwen3_moe_example.py
@@ -71,7 +71,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/big_models_with_sequential_onloading/README.md b/examples/big_models_with_sequential_onloading/README.md
index 802723975..42c4b64aa 100644
--- a/examples/big_models_with_sequential_onloading/README.md
+++ b/examples/big_models_with_sequential_onloading/README.md
@@ -37,7 +37,7 @@ During `oneshot`, only one gpu is required which will be used to onload each lay
 ```python
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 ```
diff --git a/examples/big_models_with_sequential_onloading/llama3.3_70b.py b/examples/big_models_with_sequential_onloading/llama3.3_70b.py
index 5cf90ca7e..9cc54fb84 100644
--- a/examples/big_models_with_sequential_onloading/llama3.3_70b.py
+++ b/examples/big_models_with_sequential_onloading/llama3.3_70b.py
@@ -76,7 +76,7 @@ def tokenize(sample):
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/compressed_inference/fp8_compressed_inference.py b/examples/compressed_inference/fp8_compressed_inference.py
index f0d0381d2..aea0a7eee 100644
--- a/examples/compressed_inference/fp8_compressed_inference.py
+++ b/examples/compressed_inference/fp8_compressed_inference.py
@@ -22,7 +22,7 @@
 compressed_model = AutoModelForCausalLM.from_pretrained(
     MODEL_STUB,
     torch_dtype="auto",
-    device_map="cuda:0",
+    device_map="auto",
 )
 
 # tokenize the sample data
diff --git a/examples/multimodal_vision/gemma3_example.py b/examples/multimodal_vision/gemma3_example.py
index 5437ba36c..c1b88c6d3 100644
--- a/examples/multimodal_vision/gemma3_example.py
+++ b/examples/multimodal_vision/gemma3_example.py
@@ -68,7 +68,7 @@ def data_collator(batch):
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
 # Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100, disable_compile=True)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py
index 1225349c4..2fdaeb1a4 100644
--- a/examples/multimodal_vision/idefics3_example.py
+++ b/examples/multimodal_vision/idefics3_example.py
@@ -109,7 +109,7 @@ def tokenize(sample):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py
index 0a17d8c50..0673fed9b 100644
--- a/examples/multimodal_vision/llava_example.py
+++ b/examples/multimodal_vision/llava_example.py
@@ -64,7 +64,7 @@ def data_collator(batch):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/mistral3_example.py b/examples/multimodal_vision/mistral3_example.py
index e70ee43ec..b7281dadd 100644
--- a/examples/multimodal_vision/mistral3_example.py
+++ b/examples/multimodal_vision/mistral3_example.py
@@ -77,7 +77,7 @@ def data_collator(batch):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)  # fix dtype
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py
index 6672aff2e..c54bb27a4 100644
--- a/examples/multimodal_vision/mllama_example.py
+++ b/examples/multimodal_vision/mllama_example.py
@@ -64,7 +64,7 @@ def data_collator(batch):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py
index fa4b0feab..c714dd7e6 100644
--- a/examples/multimodal_vision/phi3_vision_example.py
+++ b/examples/multimodal_vision/phi3_vision_example.py
@@ -93,7 +93,9 @@ def data_collator(batch):
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py
index a0ed50ef4..b86b90411 100644
--- a/examples/multimodal_vision/pixtral_example.py
+++ b/examples/multimodal_vision/pixtral_example.py
@@ -70,7 +70,7 @@ def data_collator(batch):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
-inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py
index 8cccf768e..fd5fbd013 100644
--- a/examples/multimodal_vision/qwen2_vl_example.py
+++ b/examples/multimodal_vision/qwen2_vl_example.py
@@ -121,7 +121,7 @@ def data_collator(batch):
     max_length=MAX_SEQUENCE_LENGTH,
     truncation=True,
     return_tensors="pt",
-).to("cuda")
+).to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/multimodal_vision/qwen_2_5_vl_example.py b/examples/multimodal_vision/qwen_2_5_vl_example.py
index dcdbb937a..bbaba936e 100644
--- a/examples/multimodal_vision/qwen_2_5_vl_example.py
+++ b/examples/multimodal_vision/qwen_2_5_vl_example.py
@@ -115,7 +115,7 @@ def data_collator(batch):
     max_length=MAX_SEQUENCE_LENGTH,
     truncation=True,
     return_tensors="pt",
-).to("cuda")
+).to(model.device)
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
diff --git a/examples/quantization_kv_cache/README.md b/examples/quantization_kv_cache/README.md
index 62da49c88..e186d180e 100644
--- a/examples/quantization_kv_cache/README.md
+++ b/examples/quantization_kv_cache/README.md
@@ -115,7 +115,7 @@ oneshot(
 Test the quantized model with a sample generation:
 
 ```python
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 ```
diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py
index b88106abe..dd5d1905b 100644
--- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py
@@ -91,7 +91,9 @@ def process_and_tokenize(example):
 print("\n\n")
 dispatch_for_generation(model)
 print("========== SAMPLE GENERATION ==============")
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100, disable_compile=True)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
index 339c353fa..51227b412 100644
--- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -88,7 +88,9 @@ def process_and_tokenize(example):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
index 0d16e1b22..02bbb8e2e 100644
--- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
+++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
@@ -88,7 +88,9 @@ def process_and_tokenize(example):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py b/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py
index c018022d9..5df0454a8 100644
--- a/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py
+++ b/examples/quantization_non_uniform/quantization_fp8_multiple_strategies.py
@@ -57,7 +57,9 @@
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_non_uniform/quantization_int4_int8.py b/examples/quantization_non_uniform/quantization_int4_int8.py
index 8f9b7de6a..4fd45042d 100644
--- a/examples/quantization_non_uniform/quantization_int4_int8.py
+++ b/examples/quantization_non_uniform/quantization_int4_int8.py
@@ -92,7 +92,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_non_uniform/quantization_nvfp4_fp8.py b/examples/quantization_non_uniform/quantization_nvfp4_fp8.py
index c5ba307ca..619f9ca78 100644
--- a/examples/quantization_non_uniform/quantization_nvfp4_fp8.py
+++ b/examples/quantization_non_uniform/quantization_nvfp4_fp8.py
@@ -109,7 +109,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py
index b729be003..b03aacee3 100644
--- a/examples/quantization_w4a16/llama3_example.py
+++ b/examples/quantization_w4a16/llama3_example.py
@@ -67,7 +67,7 @@ def tokenize(sample):
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_w4a16_fp4/llama3_example.py b/examples/quantization_w4a16_fp4/llama3_example.py
index d35de8d30..2d6968e5f 100644
--- a/examples/quantization_w4a16_fp4/llama3_example.py
+++ b/examples/quantization_w4a16_fp4/llama3_example.py
@@ -21,7 +21,9 @@
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py
index 95d01657b..63068fde1 100644
--- a/examples/quantization_w4a4_fp4/llama3_example.py
+++ b/examples/quantization_w4a4_fp4/llama3_example.py
@@ -69,7 +69,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/quantization_w4a4_fp4/qwen_30b_a3b.py b/examples/quantization_w4a4_fp4/qwen_30b_a3b.py
index b7d51fab2..bca7ca44d 100644
--- a/examples/quantization_w4a4_fp4/qwen_30b_a3b.py
+++ b/examples/quantization_w4a4_fp4/qwen_30b_a3b.py
@@ -77,7 +77,9 @@ def tokenize(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") diff --git a/examples/quantization_w8a8_fp8/fp8_block_example.py b/examples/quantization_w8a8_fp8/fp8_block_example.py index 5b47cbbe5..7e908427f 100644 --- a/examples/quantization_w8a8_fp8/fp8_block_example.py +++ b/examples/quantization_w8a8_fp8/fp8_block_example.py @@ -28,7 +28,9 @@ # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 9973b3ad2..f8ddf373c 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -32,7 +32,9 @@ # Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333 print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20, disable_compile=True) print(tokenizer.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index 6a1454cd0..9cd24387f 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -26,7 +26,9 @@ # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index 39c196752..7192b9852 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -24,7 +24,9 @@ # Confirm generations of the quantized model look sane. 
print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index a03188a61..9bce0ba42 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -26,7 +26,9 @@ # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index ebadbe973..5b542bf03 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -26,7 +26,9 @@ # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20) print(processor.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index b35fcc7ff..0da365af8 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -70,7 +70,9 @@ def tokenize(sample): # Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333 print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=20, disable_compile=True) print(tokenizer.decode(output[0])) print("==========================================") diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index feab87455..1737f08b1 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -72,7 +72,9 @@ def tokenize(sample): print("\n\n") print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( + model.device +) output = model.generate(input_ids, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py 
index 5021c7947..b67442018 100644
--- a/examples/quantizing_moe/mixtral_example.py
+++ b/examples/quantizing_moe/mixtral_example.py
@@ -75,7 +75,7 @@ def tokenize(sample):
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================")
diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py
index 343442ded..0db0f6266 100644
--- a/examples/quantizing_moe/qwen_example.py
+++ b/examples/quantizing_moe/qwen_example.py
@@ -74,7 +74,7 @@ def tokenize(sample):
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
+sample = {key: value.to(model.device) for key, value in sample.items()}
 output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================")
diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
index 590b74611..931b91f24 100644
--- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
+++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
@@ -103,7 +103,9 @@ def get_recipe(fp8_enabled):
 # Validate the compressed model
 print("\n========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n")
diff --git a/examples/transform/quip_example.py b/examples/transform/quip_example.py
index 65c8a48b3..2ce4e4079 100644
--- a/examples/transform/quip_example.py
+++ b/examples/transform/quip_example.py
@@ -32,7 +32,9 @@
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/examples/transform/spinquant_example.py b/examples/transform/spinquant_example.py
index 547d06041..a3b22d181 100644
--- a/examples/transform/spinquant_example.py
+++ b/examples/transform/spinquant_example.py
@@ -29,7 +29,9 @@
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py
index 51e7c3a74..f50795543 100644
--- a/src/llmcompressor/pipelines/sequential/helpers.py
+++ b/src/llmcompressor/pipelines/sequential/helpers.py
@@ -517,8 +517,8 @@ def is_ancestor(module: Module) -> bool:
 def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel:
     """
     Dispatch a model for sequential calibration using a sequential pipeline.
-    The model will be offloaded to the CPU and dispatched to CUDA device if available.
-    Removes any existing hooks.
+    The model will be offloaded to the CPU and dispatched to CUDA/XPU device
+    if available. Removes any existing hooks.
 
     :param model: model to dispatch
     :return: dispatched model
@@ -527,8 +527,10 @@ def dispatch_for_sequential(model: PreTrainedModel) -> PreTrainedModel:
 
     if torch.cuda.is_available():
         offloaded_dispatch(model, execution_device=torch.device("cuda:0"))
+    elif hasattr(torch, "xpu") and torch.xpu.is_available():
+        offloaded_dispatch(model, execution_device=torch.device("xpu:0"))
     else:
-        logger.warning("CUDA is not available! Compressing model on CPU instead")
+        logger.warning("CUDA/XPU is not available! Compressing model on CPU instead")
 
     return model
 
diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py
index 64e9b2906..e344705d7 100644
--- a/src/llmcompressor/transformers/finetune/session_mixin.py
+++ b/src/llmcompressor/transformers/finetune/session_mixin.py
@@ -170,7 +170,10 @@ def initialize_session(
             "pass a yaml file or string to the `recipe` argument."
         )
 
-        torch.cuda.empty_cache()
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()
+        else:
+            torch.cuda.empty_cache()
 
     def finalize_session(self):
         """
@@ -186,7 +189,10 @@ def finalize_session(self):
         logger.info("Finalized LLM Compressor session")
         model = get_session_model()
         self.model = model
-        torch.cuda.empty_cache()
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()
+        else:
+            torch.cuda.empty_cache()
 
     def create_optimizer(self):
         """
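Editor's note: the cache-clearing branch added in `session_mixin.py` appears verbatim in both `initialize_session` and `finalize_session`. Below is a minimal sketch of how that logic could be factored into one place, using only the public `torch.cuda.empty_cache()` / `torch.xpu.empty_cache()` APIs; the helper name is hypothetical and is not part of this patch.

```python
import torch


def empty_accelerator_cache() -> None:
    """Clear the cache of whichever accelerator backend is present.

    Hypothetical helper mirroring the branching added in session_mixin.py;
    torch.cuda.empty_cache() is effectively a no-op when CUDA was never
    initialized, so this is safe on CPU-only machines as well.
    """
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()
    else:
        torch.cuda.empty_cache()
```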