From 719d6507e20f44a3a0e590f594db0c1cb43faf5e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 23 Jul 2025 16:43:55 +0000 Subject: [PATCH 01/47] [Frontend] Replace `--task` option with `--runner` and `--convert` Signed-off-by: DarkLight1337 --- docs/features/multimodal_inputs.md | 4 +- docs/features/prompt_embeds.md | 2 +- docs/models/generative_models.md | 4 +- docs/models/pooling_models.md | 29 +- docs/models/supported_models.md | 79 ++-- docs/serving/openai_compatible_server.md | 24 +- examples/offline_inference/basic/classify.py | 6 +- examples/offline_inference/basic/embed.py | 4 +- examples/offline_inference/basic/score.py | 6 +- .../embed_jina_embeddings_v3.py | 6 +- .../offline_inference/embed_matryoshka_fy.py | 6 +- examples/offline_inference/qwen3_reranker.py | 4 +- .../vision_language_pooling.py | 6 +- ...i_chat_completion_client_for_multimodal.py | 2 +- ...ai_chat_embedding_client_for_multimodal.py | 2 +- .../openai_cross_encoder_score.py | 2 +- ...enai_cross_encoder_score_for_multimodal.py | 2 +- .../online_serving/openai_pooling_client.py | 2 +- ...ompt_embed_inference_with_openai_client.py | 2 +- tests/compile/test_async_tp.py | 3 - tests/compile/test_basic_correctness.py | 6 +- tests/compile/test_fusion_all_reduce.py | 3 - tests/compile/test_sequence_parallelism.py | 3 - tests/conftest.py | 8 +- tests/distributed/test_expert_parallel.py | 26 +- tests/distributed/test_pipeline_parallel.py | 42 +- tests/distributed/test_sequence_parallel.py | 30 +- .../openai/correctness/test_mteb_embed.py | 3 +- .../openai/correctness/test_mteb_score.py | 3 +- .../openai/test_chat_logit_bias_validation.py | 4 - tests/entrypoints/openai/test_embedding.py | 4 +- .../openai/test_embedding_dimensions.py | 4 +- .../entrypoints/openai/test_openai_schema.py | 2 +- .../openai/test_optional_middleware.py | 4 +- tests/entrypoints/openai/test_pooling.py | 4 +- tests/entrypoints/openai/test_truncation.py | 4 +- tests/entrypoints/openai/test_video.py | 2 +- tests/entrypoints/openai/test_vision.py | 2 +- .../openai/test_vision_embedding.py | 4 +- tests/entrypoints/test_chat_utils.py | 39 +- tests/lora/test_worker.py | 5 - .../model_executor/test_guided_processors.py | 10 +- tests/models/language/pooling/embed_utils.py | 2 +- tests/models/language/pooling/mteb_utils.py | 4 +- .../models/language/pooling/test_embedding.py | 2 +- tests/models/language/pooling/test_gritlm.py | 13 +- tests/models/language/pooling/test_jina.py | 2 +- .../pooling/test_nomic_max_model_len.py | 20 +- tests/models/language/pooling/test_scoring.py | 18 +- .../pooling/test_truncation_control.py | 6 +- .../multimodal/generation/test_common.py | 2 +- .../generation/test_granite_speech.py | 2 +- .../multimodal/generation/test_interleaved.py | 2 +- .../multimodal/generation/test_phi4mm.py | 2 +- .../multimodal/generation/test_qwen2_vl.py | 2 +- .../multimodal/generation/vlm_utils/core.py | 6 +- .../multimodal/generation/vlm_utils/types.py | 4 +- .../multimodal/pooling/test_dse_qwen2_vl.py | 2 +- .../pooling/test_jinavl_reranker.py | 2 +- .../multimodal/pooling/test_llava_next.py | 2 +- tests/models/multimodal/pooling/test_phi3v.py | 2 +- .../multimodal/processing/test_common.py | 2 +- tests/models/multimodal/test_mapping.py | 2 +- .../models/quantization/test_bitsandbytes.py | 2 +- tests/models/utils.py | 6 +- tests/multimodal/test_processing.py | 25 +- tests/quantization/test_configs.py | 10 +- tests/test_config.py | 144 ++++--- tests/test_sampling_params.py | 5 - tests/v1/core/test_kv_cache_utils.py | 12 +- 
tests/v1/core/test_scheduler.py | 3 - tests/v1/core/utils.py | 3 - tests/v1/kv_connector/unit/utils.py | 3 - tests/v1/spec_decode/test_eagle.py | 9 +- tests/v1/spec_decode/test_ngram.py | 9 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 4 - tests/v1/worker/test_gpu_model_runner.py | 4 - vllm/config.py | 374 ++++++++++-------- vllm/engine/arg_utils.py | 31 +- vllm/entrypoints/llm.py | 67 ++-- vllm/model_executor/model_loader/utils.py | 19 +- vllm/model_executor/models/registry.py | 4 +- vllm/transformers_utils/tokenizer_group.py | 12 +- 83 files changed, 624 insertions(+), 628 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index e820ace4f8fe..a999067817ef 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -279,7 +279,7 @@ Here is a simple example using Phi-3.5-Vision. First, launch the OpenAI-compatible server: ```bash -vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ +vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \ --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' ``` @@ -358,7 +358,7 @@ Instead of `image_url`, you can pass a video file via `video_url`. Here is a sim First, launch the OpenAI-compatible server: ```bash -vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192 +vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --runner generate --max-model-len 8192 ``` Then, you can use the OpenAI client as follows: diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index 6f5616e05d8c..83993bd0140f 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -34,7 +34,7 @@ Prompt embeddings are passed in as base64 encoded torch tensors. First, launch the OpenAI-compatible server: ```bash -vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \ +vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \ --max-model-len 4096 --enable-prompt-embeds ``` diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 21ad115e411a..4eeb002fbb71 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -6,8 +6,8 @@ In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text. -For generative models, the only supported `--task` option is `"generate"`. -Usually, this is automatically inferred so you don't have to specify it. +For model architectures that support both generation and pooling, you should set `--runner generate` +to use the model as a generative model. ## Offline Inference diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 741ae2d79c1e..cd67943ece40 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -3,7 +3,7 @@ vLLM also supports pooling models, including embedding, reranking and reward models. In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface. -These models use a [Pooler][vllm.model_executor.layers.Pooler] to extract the final hidden states of the input +These models use a [Pooler][vllm.model_executor.layers.pooling.Pooler] to extract the final hidden states of the input before returning them. !!! 
note @@ -11,14 +11,17 @@ before returning them. As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. -If the model doesn't implement this interface, you can set `--task` which tells vLLM +If the model doesn't implement this interface, you can set `--convert` which tells vLLM to convert the model into a pooling model. -| `--task` | Model type | Supported pooling tasks | -|------------|----------------------|-------------------------------| -| `embed` | Embedding model | `encode`, `embed` | -| `classify` | Classification model | `encode`, `classify`, `score` | -| `reward` | Reward model | `encode` | +| `--convert` | Model type | Supported pooling tasks | +|-------------|----------------------|-------------------------------| +| `embed` | Embedding model | `encode`, `embed` | +| `classify` | Classification model | `encode`, `classify`, `score` | +| `reward` | Reward model | `encode` | + +For model architectures that support both generation and pooling, you should set `--runner pooling` +to use the model as a pooling model. ## Pooling Tasks @@ -33,7 +36,7 @@ In vLLM, we define the following pooling tasks and corresponding APIs: \*The `score` API falls back to `embed` task if the model does not support `score` task. -Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.Pooler.get_supported_tasks]. +Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.pooling.Pooler.get_supported_tasks]. By default, the pooler assigned to each task has the following attributes: @@ -70,7 +73,7 @@ It returns the extracted hidden states directly, which is useful for reward mode ```python from vllm import LLM -llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") +llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", runner="pooling") (output,) = llm.encode("Hello, my name is") data = output.outputs.data @@ -85,7 +88,7 @@ It is primarily designed for embedding models. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") +llm = LLM(model="intfloat/e5-mistral-7b-instruct", runner="pooling") (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding @@ -102,7 +105,7 @@ It is primarily designed for classification models. ```python from vllm import LLM -llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") (output,) = llm.classify("Hello, my name is") probs = output.outputs.probs @@ -123,7 +126,7 @@ It is designed for embedding models and cross encoder models. 
Embedding models u ```python from vllm import LLM -llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") +llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") (output,) = llm.score("What is the capital of France?", "The capital of Brazil is Brasilia.") @@ -175,7 +178,7 @@ You can change the output dimensions of embedding models that support Matryoshka from vllm import LLM, PoolingParams llm = LLM(model="jinaai/jina-embeddings-v3", - task="embed", + runner="pooling", trust_remote_code=True) outputs = llm.embed(["Follow the white rabbit."], pooling_params=PoolingParams(dimensions=32)) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index c8b6c6c86120..541e6c2f31c3 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -1,7 +1,6 @@ # Supported Models vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks. -If a model supports more than one task, you can set the task via the `--task` argument. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. @@ -24,7 +23,7 @@ To check if the modeling backend is Transformers, you can simply do this: ```python from vllm import LLM -llm = LLM(model=..., task="generate") # Name or path of your model +llm = LLM(model=...) # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` @@ -158,13 +157,13 @@ The [Transformers backend][transformers-backend] enables you to run models direc ```python from vllm import LLM - # For generative models (task=generate) only - llm = LLM(model=..., task="generate") # Name or path of your model + # For generative models (runner=generate) only + llm = LLM(model=..., runner="generate") # Name or path of your model output = llm.generate("Hello, my name is") print(output) - # For pooling models (task={embed,classify,reward,score}) only - llm = LLM(model=..., task="embed") # Name or path of your model + # For pooling models (runner=pooling) only + llm = LLM(model=..., runner="pooling") # Name or path of your model output = llm.encode("Hello, my name is") print(output) ``` @@ -281,13 +280,13 @@ And use with `trust_remote_code=True`. ```python from vllm import LLM -llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) +llm = LLM(model=..., revision=..., runner=..., trust_remote_code=True) -# For generative models (task=generate) only +# For generative models (runner=generate) only output = llm.generate("Hello, my name is") print(output) -# For pooling models (task={embed,classify,reward,score}) only +# For pooling models (runner=pooling) only output = llm.encode("Hello, my name is") print(output) ``` @@ -312,8 +311,6 @@ See [this page](generative_models.md) for more information on how to use generat #### Text Generation -Specified using `--task generate`. -
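
As a quick reference for the options this patch documents, below is a minimal CLI sketch. It is illustrative only: the model names are placeholders taken from examples elsewhere in this patch, and the `--runner`/`--convert` values follow the table added to `docs/models/pooling_models.md`.

```bash
# Illustrative usage of the replacement flags; model names are placeholders, not part of this patch.

# Pick the runner explicitly for an architecture that supports both generation and pooling:
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate --trust-remote-code

# Run an embedding model with the pooling runner:
vllm serve intfloat/e5-mistral-7b-instruct --runner pooling

# Convert a model that has no pooling interface into an embedding model:
vllm serve meta-llama/Llama-3.2-1B-Instruct --convert embed
```

Per the updated docs, `--runner` selects between the generate and pooling runners, while `--convert` (with `embed`, `classify`, or `reward`) adapts a model that does not implement the pooling interface into the corresponding pooling model.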