5 changes: 4 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -60,7 +60,10 @@ steps:
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  # FIXIT: the phi-3v test requires using fork method
+  # see https://buildkite.com/vllm/ci-aws/builds/3286#019064f5-697e-4477-ad66-b97b2763e78c for reference
+  # need to investigate and control when we import the huggingface config class
+  - VLLM_WORKER_MULTIPROC_METHOD=fork TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
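The pipeline change works by overriding the worker start method for this one test via the `VLLM_WORKER_MULTIPROC_METHOD` environment variable. As a rough illustration of what that variable controls, here is a minimal sketch of a launcher choosing its multiprocessing start method from the environment. This is not vLLM's actual executor code; `start_worker` and `noop` are hypothetical names:

```python
# Hypothetical sketch of a launcher honoring VLLM_WORKER_MULTIPROC_METHOD;
# vLLM's real multiproc executor is more involved than this.
import multiprocessing
import os

def start_worker(target):
    # "spawn" is the safer default for CUDA workloads; "fork" is forced here
    # for the phi-3v test because (per the FIXIT note above) the dynamically
    # imported HF config class is only reliably available in fork-started
    # workers until the import timing is controlled.
    method = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
    ctx = multiprocessing.get_context(method)
    proc = ctx.Process(target=target)
    proc.start()
    return proc

def noop():
    pass

if __name__ == "__main__":
    start_worker(noop).join()
```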
22 changes: 12 additions & 10 deletions tests/models/test_phi3v.py
@@ -97,16 +97,8 @@ def run_test(
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
-    hf_model_kwargs = {"_attn_implementation": "eager"}
-    with hf_runner(model_id, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(
-            HF_IMAGE_PROMPTS,
-            max_tokens,
-            images=hf_images,
-            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
-
+    # NOTE: run vLLM first, and then run HF
+    # otherwise, HF will initialize the GPU and vLLM will fail with fork
     vllm_image_prompts = [
         p.replace("<|image_1|>",
                   "<|image|>" * vlm_config.image_feature_size + "<s>")
@@ -124,6 +116,16 @@
             max_tokens,
             images=vllm_images)
 
+    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
+    hf_model_kwargs = {"_attn_implementation": "eager"}
+    with hf_runner(model_id, dtype=dtype,
+                   model_kwargs=hf_model_kwargs) as hf_model:
+        hf_outputs = hf_model.generate_greedy(
+            HF_IMAGE_PROMPTS,
+            max_tokens,
+            images=hf_images,
+            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
+
     for i in range(len(HF_IMAGE_PROMPTS)):
         hf_output_ids, hf_output_str = hf_outputs[i]
         vllm_output_ids, vllm_output_str = vllm_to_hf_output(
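The reordering in `run_test` follows from the NOTE in the diff: a `fork`-started worker cannot use CUDA once the parent process has already initialized it, so the HF run (which initializes the GPU in the parent) must come after vLLM's fork-based workers have started. A minimal standalone sketch of that failure mode, assuming PyTorch and a CUDA device; this is not code from the PR:

```python
# Sketch: fork after CUDA init breaks the child, which is why the test
# runs vLLM (fork workers) before the HF runner.
import multiprocessing
import torch

def worker():
    # In a fork-started child whose parent already initialized CUDA, this
    # typically raises "Cannot re-initialize CUDA in forked subprocess".
    torch.zeros(1, device="cuda")

if __name__ == "__main__":
    torch.zeros(1, device="cuda")  # parent initializes CUDA (the HF step)
    ctx = multiprocessing.get_context("fork")
    p = ctx.Process(target=worker)
    p.start()
    p.join()  # child exits with an error; reversing the order avoids it
```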