From 5dcbafb747e5785a2bc296879cdc6607f74d392a Mon Sep 17 00:00:00 2001
From: Allen Wang
Date: Fri, 11 Oct 2024 20:46:32 +0000
Subject: [PATCH 1/2] add multistep TPU test

---
 .buildkite/run-tpu-test.sh                   |  2 +-
 .../offline_inference_tpu_with_multistep.py  | 28 +++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 examples/offline_inference_tpu_with_multistep.py

diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 6989c94d46a8..f56ed981fc96 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -12,4 +12,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py && python3 /workspace/vllm/examples/offline_inference_tpu_with_multistep.py"

diff --git a/examples/offline_inference_tpu_with_multistep.py b/examples/offline_inference_tpu_with_multistep.py
new file mode 100644
index 000000000000..72f77ab255d4
--- /dev/null
+++ b/examples/offline_inference_tpu_with_multistep.py
@@ -0,0 +1,28 @@
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "A robot may not injure a human being",
+    "It is only with the heart that one can see rightly;",
+    "The greatest glory in living lies not in never falling,",
+]
+answers = [
+    " or, through inaction, allow a human being to come to harm.",
+    " what is essential is invisible to the eye.",
+    " but in rising every time we fall.",
+]
+N = 1
+# Currently, top-p sampling is disabled. `top_p` should be 1.0.
+sampling_params = SamplingParams(temperature=0.7,
+                                 top_p=1.0,
+                                 n=N,
+                                 max_tokens=16)
+
+# Set `enforce_eager=True` to avoid ahead-of-time compilation.
+# In real workloads, `enforace_eager` should be `False`.
+llm = LLM(model="google/gemma-2b", enforce_eager=True, num_scheduler_steps=2)
+outputs = llm.generate(prompts, sampling_params)
+for output, answer in zip(outputs, answers):
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    assert generated_text.startswith(answer)

From 2bd8ab53d9597626ffd032c339e6d57a7a89f751 Mon Sep 17 00:00:00 2001
From: Allen Wang
Date: Mon, 14 Oct 2024 15:52:50 +0000
Subject: [PATCH 2/2] reduce temp for determinism?

---
 examples/offline_inference_tpu.py                | 4 ++--
 examples/offline_inference_tpu_with_multistep.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference_tpu.py
index 251629b8027c..f643b68a4342 100644
--- a/examples/offline_inference_tpu.py
+++ b/examples/offline_inference_tpu.py
@@ -12,13 +12,13 @@
 ]
 N = 1
 # Currently, top-p sampling is disabled. `top_p` should be 1.0.
-sampling_params = SamplingParams(temperature=0.7,
+sampling_params = SamplingParams(temperature=0.0,
                                  top_p=1.0,
                                  n=N,
                                  max_tokens=16)
 
 # Set `enforce_eager=True` to avoid ahead-of-time compilation.
-# In real workloads, `enforace_eager` should be `False`.
+# In real workloads, `enforce_eager` should be `False`.
 llm = LLM(model="google/gemma-2b", enforce_eager=True)
 outputs = llm.generate(prompts, sampling_params)
 for output, answer in zip(outputs, answers):

diff --git a/examples/offline_inference_tpu_with_multistep.py b/examples/offline_inference_tpu_with_multistep.py
index 72f77ab255d4..8feb317e1a0c 100644
--- a/examples/offline_inference_tpu_with_multistep.py
+++ b/examples/offline_inference_tpu_with_multistep.py
@@ -18,7 +18,7 @@
                                  max_tokens=16)
 
 # Set `enforce_eager=True` to avoid ahead-of-time compilation.
-# In real workloads, `enforace_eager` should be `False`.
+# In real workloads, `enforce_eager` should be `False`.
 llm = LLM(model="google/gemma-2b", enforce_eager=True, num_scheduler_steps=2)
 outputs = llm.generate(prompts, sampling_params)
 for output, answer in zip(outputs, answers):