From 5dcbafb747e5785a2bc296879cdc6607f74d392a Mon Sep 17 00:00:00 2001
From: Allen Wang
Date: Fri, 11 Oct 2024 20:46:32 +0000
Subject: [PATCH 1/2] add multistep TPU test

---
 .buildkite/run-tpu-test.sh                   |  2 +-
 .../offline_inference_tpu_with_multistep.py  | 28 +++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 examples/offline_inference_tpu_with_multistep.py

diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 6989c94d46a8..f56ed981fc96 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -12,4 +12,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py && python3 /workspace/vllm/examples/offline_inference_tpu_with_multistep.py"

diff --git a/examples/offline_inference_tpu_with_multistep.py b/examples/offline_inference_tpu_with_multistep.py
new file mode 100644
index 000000000000..72f77ab255d4
--- /dev/null
+++ b/examples/offline_inference_tpu_with_multistep.py
@@ -0,0 +1,28 @@
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "A robot may not injure a human being",
+    "It is only with the heart that one can see rightly;",
+    "The greatest glory in living lies not in never falling,",
+]
+answers = [
+    " or, through inaction, allow a human being to come to harm.",
+    " what is essential is invisible to the eye.",
+    " but in rising every time we fall.",
+]
+N = 1
+# Currently, top-p sampling is disabled. `top_p` should be 1.0.
+sampling_params = SamplingParams(temperature=0.7,
+                                 top_p=1.0,
+                                 n=N,
+                                 max_tokens=16)
+
+# Set `enforce_eager=True` to avoid ahead-of-time compilation.
+# In real workloads, `enforace_eager` should be `False`.
+llm = LLM(model="google/gemma-2b", enforce_eager=True, num_scheduler_steps=2)
+outputs = llm.generate(prompts, sampling_params)
+for output, answer in zip(outputs, answers):
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    assert generated_text.startswith(answer)

From 2bd8ab53d9597626ffd032c339e6d57a7a89f751 Mon Sep 17 00:00:00 2001
From: Allen Wang
Date: Mon, 14 Oct 2024 15:52:50 +0000
Subject: [PATCH 2/2] reduce temp for determinism?

---
 examples/offline_inference_tpu.py                | 4 ++--
 examples/offline_inference_tpu_with_multistep.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference_tpu.py
index 251629b8027c..f643b68a4342 100644
--- a/examples/offline_inference_tpu.py
+++ b/examples/offline_inference_tpu.py
@@ -12,13 +12,13 @@
 ]
 N = 1
 # Currently, top-p sampling is disabled. `top_p` should be 1.0.
-sampling_params = SamplingParams(temperature=0.7,
+sampling_params = SamplingParams(temperature=0.0,
                                  top_p=1.0,
                                  n=N,
                                  max_tokens=16)
 
 # Set `enforce_eager=True` to avoid ahead-of-time compilation.
-# In real workloads, `enforace_eager` should be `False`.
+# In real workloads, `enforce_eager` should be `False`.
 llm = LLM(model="google/gemma-2b", enforce_eager=True)
 outputs = llm.generate(prompts, sampling_params)
 for output, answer in zip(outputs, answers):

diff --git a/examples/offline_inference_tpu_with_multistep.py b/examples/offline_inference_tpu_with_multistep.py
index 72f77ab255d4..8feb317e1a0c 100644
--- a/examples/offline_inference_tpu_with_multistep.py
+++ b/examples/offline_inference_tpu_with_multistep.py
@@ -18,7 +18,7 @@
                                  max_tokens=16)
 
 # Set `enforce_eager=True` to avoid ahead-of-time compilation.
-# In real workloads, `enforace_eager` should be `False`.
+# In real workloads, `enforce_eager` should be `False`.
 llm = LLM(model="google/gemma-2b", enforce_eager=True, num_scheduler_steps=2)
 outputs = llm.generate(prompts, sampling_params)
 for output, answer in zip(outputs, answers):