2 changes: 1 addition & 1 deletion .buildkite/run-tpu-test.sh
@@ -12,4 +12,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py && python3 /workspace/vllm/examples/offline_inference_tpu_with_multistep.py"
4 changes: 2 additions & 2 deletions examples/offline_inference_tpu.py
@@ -12,13 +12,13 @@
 ]
 N = 1
 # Currently, top-p sampling is disabled. `top_p` should be 1.0.
-sampling_params = SamplingParams(temperature=0.7,
+sampling_params = SamplingParams(temperature=0.0,
                                  top_p=1.0,
                                  n=N,
                                  max_tokens=16)

 # Set `enforce_eager=True` to avoid ahead-of-time compilation.
-# In real workloads, `enforace_eager` should be `False`.
+# In real workloads, `enforce_eager` should be `False`.
 llm = LLM(model="google/gemma-2b", enforce_eager=True)
 outputs = llm.generate(prompts, sampling_params)
 for output, answer in zip(outputs, answers):
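Changing `temperature` from 0.7 to 0.0 switches this example to greedy decoding. A plausible reading (not stated in the diff) is that this keeps the generated text deterministic, so the answer check further down the file, collapsed in this hunk, cannot fail intermittently in CI. A minimal sketch of the two sampling configurations, using only the `SamplingParams` arguments already present above:

from vllm import SamplingParams

# Greedy decoding: temperature=0.0 always picks the highest-probability token,
# so the same prompt produces the same continuation on every run.
greedy = SamplingParams(temperature=0.0, top_p=1.0, n=1, max_tokens=16)

# Stochastic decoding: temperature=0.7 samples from a softened distribution,
# so outputs can vary between runs (fine interactively, brittle for asserts).
sampled = SamplingParams(temperature=0.7, top_p=1.0, n=1, max_tokens=16)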
28 changes: 28 additions & 0 deletions examples/offline_inference_tpu_with_multistep.py
@@ -0,0 +1,28 @@
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "A robot may not injure a human being",
+    "It is only with the heart that one can see rightly;",
+    "The greatest glory in living lies not in never falling,",
+]
+answers = [
+    " or, through inaction, allow a human being to come to harm.",
+    " what is essential is invisible to the eye.",
+    " but in rising every time we fall.",
+]
+N = 1
+# Currently, top-p sampling is disabled. `top_p` should be 1.0.
+sampling_params = SamplingParams(temperature=0.7,
+                                 top_p=1.0,
+                                 n=N,
+                                 max_tokens=16)
+
+# Set `enforce_eager=True` to avoid ahead-of-time compilation.
+# In real workloads, `enforce_eager` should be `False`.
+llm = LLM(model="google/gemma-2b", enforce_eager=True, num_scheduler_steps=2)
+outputs = llm.generate(prompts, sampling_params)
+for output, answer in zip(outputs, answers):
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    assert generated_text.startswith(answer)
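The new example differs from examples/offline_inference_tpu.py only in the `num_scheduler_steps=2` argument, which enables multi-step scheduling: the scheduler plans several decode iterations per scheduling call instead of one, amortizing scheduling overhead across steps. As a hypothetical extension (not part of this PR), the step count could be taken from the command line; the `argparse` flag name and its default below are assumptions for illustration:

import argparse

from vllm import LLM, SamplingParams

parser = argparse.ArgumentParser()
parser.add_argument("--num-scheduler-steps", type=int, default=2,
                    help="Decode steps planned per scheduler invocation.")
args = parser.parse_args()

sampling_params = SamplingParams(temperature=0.7, top_p=1.0, n=1, max_tokens=16)
llm = LLM(model="google/gemma-2b",
          enforce_eager=True,
          num_scheduler_steps=args.num_scheduler_steps)
outputs = llm.generate(["A robot may not injure a human being"], sampling_params)
print(outputs[0].outputs[0].text)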