From b89ea130bc25d11bb755651c008ebdac0c061746 Mon Sep 17 00:00:00 2001
From: Elaine Zhao
Date: Thu, 22 May 2025 00:54:33 +0000
Subject: [PATCH 1/3] [Neuron] Remove bypass on EAGLEConfig and add a test

Signed-off-by: Elaine Zhao
---
 tests/neuron/2_core/test_eagle.py | 72 +++++++++++++++++++++++++++++++
 vllm/config.py                    |  3 +-
 2 files changed, 73 insertions(+), 2 deletions(-)
 create mode 100644 tests/neuron/2_core/test_eagle.py

diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py
new file mode 100644
index 000000000000..c5ab55ad42e1
--- /dev/null
+++ b/tests/neuron/2_core/test_eagle.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+import torch
+import tempfile
+import shutil
+from safetensors import safe_open
+from huggingface_hub import snapshot_download
+from vllm import LLM, SamplingParams
+
+
+def patch_eagle_draft_with_lm_head(target_model_id: str, draft_model_id: str) -> str:
+    # In NxDI, draft model checkpoint must include lm_head weights from target model.
+    # For more details see https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/
+    # feature-guide.html#eagle-checkpoint-compatibility
+    final_draft_dir = "/tmp/patched_eagle_draft"
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        target_dir = snapshot_download(repo_id=target_model_id, local_dir=os.path.join(tmp_dir, "target"))
+        draft_dir = snapshot_download(repo_id=draft_model_id, local_dir=os.path.join(tmp_dir, "draft"))
+
+        lm_head_key = "lm_head.weight"
+        index_path = os.path.join(target_dir, "model.safetensors.index.json")
+        with open(index_path, "r") as f:
+            index = json.load(f)
+        shard_name = index["weight_map"][lm_head_key]
+        target_safetensor_path = os.path.join(target_dir, shard_name)
+
+        with safe_open(target_safetensor_path, framework="pt") as f:
+            target_lm_head = f.get_tensor(lm_head_key)
+
+        draft_path = os.path.join(draft_dir, "pytorch_model.bin")
+        draft_state_dict = torch.load(draft_path, map_location="cpu")
+        draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
+        torch.save(draft_state_dict, draft_path)
+
+        shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)
+
+    return final_draft_dir
+
+
+def test_eagle():
+    patched_draft_path = patch_eagle_draft_with_lm_head(
+        target_model_id="meta-llama/Llama-2-7b-hf",
+        draft_model_id="yuhuili/EAGLE-llama2-chat-7B"
+    )
+    llm = LLM(
+        model="meta-llama/Llama-2-7b-hf",
+        speculative_config={
+            "model": patched_draft_path,
+            "num_speculative_tokens": 5,
+            "max_model_len": 128
+        },
+        max_num_seqs=1,
+        max_model_len=128,
+        tensor_parallel_size=2,
+        override_neuron_config={
+            "enable_eagle_speculation": True,
+            "enable_fused_speculation": True,
+            "fused_qkv": True
+        },
+    )
+    prompts = [
+        "The president of the United States is",
+    ]
+    outputs = llm.generate(prompts, SamplingParams(top_k=1))
+    expected_output = " the head of state and head of government of the United States. The president direct"
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        assert (expected_output == generated_text)
\ No newline at end of file
diff --git a/vllm/config.py b/vllm/config.py
index 3fa1db0e8390..2c5da2d3b2c8 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2529,11 +2529,10 @@ def __post_init__(self):
                         "Chunked prefill and EAGLE are not compatible "
                         "when using V0.")
 
-                from vllm.platforms import current_platform
                 from vllm.transformers_utils.configs.eagle import (
                     EAGLEConfig)
                 if isinstance(self.draft_model_config.hf_config,
-                              EAGLEConfig) or current_platform.is_neuron():
+                    EAGLEConfig):
                     pass
                 else:
                     eagle_config = EAGLEConfig(
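
Note: the helper added above writes the patched draft checkpoint to /tmp/patched_eagle_draft, with the target model's lm_head cast to float16. A minimal sanity check of that output, illustrative only and not part of the patch, could look like this:

import torch

# Load the checkpoint produced by patch_eagle_draft_with_lm_head and
# confirm the target's lm_head was injected with the expected dtype.
state_dict = torch.load("/tmp/patched_eagle_draft/pytorch_model.bin",
                        map_location="cpu")
assert "lm_head.weight" in state_dict
assert state_dict["lm_head.weight"].dtype == torch.float16
print("lm_head shape:", tuple(state_dict["lm_head.weight"].shape))
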
From b7805317c09f6b3c3c7cbe7f8fe46d1a19b96436 Mon Sep 17 00:00:00 2001
From: Elaine Zhao
Date: Thu, 22 May 2025 01:16:12 +0000
Subject: [PATCH 2/3] Run pre-commit

Signed-off-by: Elaine Zhao
---
 tests/neuron/2_core/test_eagle.py | 39 ++++++++++++++++++-------------
 vllm/config.py                    |  2 +-
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py
index c5ab55ad42e1..c87f3236d954 100644
--- a/tests/neuron/2_core/test_eagle.py
+++ b/tests/neuron/2_core/test_eagle.py
@@ -2,27 +2,34 @@
 
 import json
 import os
-import torch
-import tempfile
 import shutil
-from safetensors import safe_open
+import tempfile
+
+import torch
 from huggingface_hub import snapshot_download
+from safetensors import safe_open
+
 from vllm import LLM, SamplingParams
 
 
-def patch_eagle_draft_with_lm_head(target_model_id: str, draft_model_id: str) -> str:
-    # In NxDI, draft model checkpoint must include lm_head weights from target model.
-    # For more details see https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/
-    # feature-guide.html#eagle-checkpoint-compatibility
-    final_draft_dir = "/tmp/patched_eagle_draft"
+def patch_eagle_draft_with_lm_head(target_model_id: str,
+                                   draft_model_id: str) -> str:
+    # In NxDI, draft model checkpoint must include lm_head weights from target
+    # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
+    # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
+    # #eagle-checkpoint-compatibility
+    final_draft_dir = "/tmp/patched_eagle_draft"
 
     with tempfile.TemporaryDirectory() as tmp_dir:
-        target_dir = snapshot_download(repo_id=target_model_id, local_dir=os.path.join(tmp_dir, "target"))
-        draft_dir = snapshot_download(repo_id=draft_model_id, local_dir=os.path.join(tmp_dir, "draft"))
+        target_dir = snapshot_download(repo_id=target_model_id,
+                                       local_dir=os.path.join(
+                                           tmp_dir, "target"))
+        draft_dir = snapshot_download(repo_id=draft_model_id,
+                                      local_dir=os.path.join(tmp_dir, "draft"))
 
         lm_head_key = "lm_head.weight"
         index_path = os.path.join(target_dir, "model.safetensors.index.json")
-        with open(index_path, "r") as f:
+        with open(index_path) as f:
             index = json.load(f)
         shard_name = index["weight_map"][lm_head_key]
         target_safetensor_path = os.path.join(target_dir, shard_name)
@@ -43,8 +50,7 @@ def patch_eagle_draft_with_lm_head(target_model_id: str, draft_model_id: str) ->
 def test_eagle():
     patched_draft_path = patch_eagle_draft_with_lm_head(
         target_model_id="meta-llama/Llama-2-7b-hf",
-        draft_model_id="yuhuili/EAGLE-llama2-chat-7B"
-    )
+        draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
     llm = LLM(
         model="meta-llama/Llama-2-7b-hf",
         speculative_config={
@@ -65,8 +71,9 @@ def test_eagle():
         "The president of the United States is",
     ]
     outputs = llm.generate(prompts, SamplingParams(top_k=1))
-    expected_output = " the head of state and head of government of the United States. The president direct"
+    expected_output = " the head of state and head of government of " \
+        "the United States. The president direct"
+
     for output in outputs:
-        prompt = output.prompt
         generated_text = output.outputs[0].text
-        assert (expected_output == generated_text)
\ No newline at end of file
+        assert (expected_output == generated_text)
diff --git a/vllm/config.py b/vllm/config.py
index 2c5da2d3b2c8..bdff4c70fb18 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2532,7 +2532,7 @@ def __post_init__(self):
                 from vllm.transformers_utils.configs.eagle import (
                     EAGLEConfig)
                 if isinstance(self.draft_model_config.hf_config,
-                    EAGLEConfig):
+                              EAGLEConfig):
                     pass
                 else:
                     eagle_config = EAGLEConfig(
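
Note: the vllm/config.py hunks in patches 1 and 2 reduce to the following control flow. This is a simplified sketch, not verbatim vLLM code; the real EAGLEConfig constructor call is truncated in the hunk above, so a `model` argument is assumed here for illustration.

from vllm.transformers_utils.configs.eagle import EAGLEConfig

def ensure_eagle_config(draft_hf_config):
    # With the current_platform.is_neuron() bypass removed, every platform
    # takes the same path: a draft hf_config that is not already an
    # EAGLEConfig gets wrapped.
    if isinstance(draft_hf_config, EAGLEConfig):
        return draft_hf_config
    # The actual call in vllm/config.py passes additional arguments that
    # are elided in the hunk; only the draft config is shown here.
    return EAGLEConfig(model=draft_hf_config)
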
From 630088c62a54373636c6d960caa6d8a07e827317 Mon Sep 17 00:00:00 2001
From: Elaine Zhao
Date: Fri, 23 May 2025 03:26:31 +0000
Subject: [PATCH 3/3] Update Neuron buildkite script to run 2_core tests in
 separate processes

Signed-off-by: Elaine Zhao
---
 .buildkite/scripts/hardware_ci/run-neuron-test.sh | 9 ++++++++-
 tests/neuron/2_core/test_eagle.py                 | 3 +++
 tests/neuron/2_core/test_mistral.py               | 6 ++++--
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
index c0b9dd8dadba..3d294ea5f8a7 100644
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@@ -53,4 +53,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
     -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
     --name "${container_name}" \
     ${image_name} \
-    /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
+    /bin/bash -c "
+        python3 /workspace/vllm/examples/offline_inference/neuron.py;
+        python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
+        for f in /workspace/vllm/tests/neuron/2_core/*.py; do
+            echo 'Running test file: '\$f;
+            python3 -m pytest \$f -v --capture=tee-sys;
+        done
+        "
\ No newline at end of file
diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py
index c87f3236d954..d71c88689a99 100644
--- a/tests/neuron/2_core/test_eagle.py
+++ b/tests/neuron/2_core/test_eagle.py
@@ -76,4 +76,7 @@ def test_eagle():
 
     for output in outputs:
         generated_text = output.outputs[0].text
+        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
         assert (expected_output == generated_text)
+
+    print("Neuron Eagle speculation test passed.")
diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py
index cc3b53a9d7c9..3e651502d1e2 100644
--- a/tests/neuron/2_core/test_mistral.py
+++ b/tests/neuron/2_core/test_mistral.py
@@ -12,8 +12,7 @@ def test_mistral():
               override_neuron_config={
                   "sequence_parallel_enabled": False,
                   "skip_warmup": True
-              },
-              device="neuron")
+              })
 
     # Send more prompts than the compiled batch size (4) and request
     # varying generation lengths to test accuracy related to Neuron
@@ -59,4 +58,7 @@ def test_mistral():
 
     for expected_output, output in zip(expected_outputs, outputs):
         generated_text = output.outputs[0].text
+        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
         assert (expected_output == generated_text)
+
+    print("Neuron Mistral test passed.")
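
Note: the buildkite loop above starts a fresh pytest process per file under tests/neuron/2_core/ so that Neuron compilation and runtime state from one test file cannot leak into the next. A rough local equivalent in Python (illustrative only, not part of the patch); unlike the in-container loop as written, it also aggregates per-file exit codes so that any failing file fails the whole run:

import glob
import subprocess
import sys

failed = False
for f in sorted(glob.glob("tests/neuron/2_core/*.py")):
    # One interpreter per test file mirrors the CI behaviour of isolating
    # Neuron state between files.
    print(f"Running test file: {f}")
    result = subprocess.run(
        [sys.executable, "-m", "pytest", f, "-v", "--capture=tee-sys"])
    failed = failed or result.returncode != 0

sys.exit(1 if failed else 0)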