From 719d6507e20f44a3a0e590f594db0c1cb43faf5e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 23 Jul 2025 16:43:55 +0000 Subject: [PATCH 01/47] [Frontend] Replace `--task` option with `--runner` and `--convert` Signed-off-by: DarkLight1337 --- docs/features/multimodal_inputs.md | 4 +- docs/features/prompt_embeds.md | 2 +- docs/models/generative_models.md | 4 +- docs/models/pooling_models.md | 29 +- docs/models/supported_models.md | 79 ++-- docs/serving/openai_compatible_server.md | 24 +- examples/offline_inference/basic/classify.py | 6 +- examples/offline_inference/basic/embed.py | 4 +- examples/offline_inference/basic/score.py | 6 +- .../embed_jina_embeddings_v3.py | 6 +- .../offline_inference/embed_matryoshka_fy.py | 6 +- examples/offline_inference/qwen3_reranker.py | 4 +- .../vision_language_pooling.py | 6 +- ...i_chat_completion_client_for_multimodal.py | 2 +- ...ai_chat_embedding_client_for_multimodal.py | 2 +- .../openai_cross_encoder_score.py | 2 +- ...enai_cross_encoder_score_for_multimodal.py | 2 +- .../online_serving/openai_pooling_client.py | 2 +- ...ompt_embed_inference_with_openai_client.py | 2 +- tests/compile/test_async_tp.py | 3 - tests/compile/test_basic_correctness.py | 6 +- tests/compile/test_fusion_all_reduce.py | 3 - tests/compile/test_sequence_parallelism.py | 3 - tests/conftest.py | 8 +- tests/distributed/test_expert_parallel.py | 26 +- tests/distributed/test_pipeline_parallel.py | 42 +- tests/distributed/test_sequence_parallel.py | 30 +- .../openai/correctness/test_mteb_embed.py | 3 +- .../openai/correctness/test_mteb_score.py | 3 +- .../openai/test_chat_logit_bias_validation.py | 4 - tests/entrypoints/openai/test_embedding.py | 4 +- .../openai/test_embedding_dimensions.py | 4 +- .../entrypoints/openai/test_openai_schema.py | 2 +- .../openai/test_optional_middleware.py | 4 +- tests/entrypoints/openai/test_pooling.py | 4 +- tests/entrypoints/openai/test_truncation.py | 4 +- tests/entrypoints/openai/test_video.py | 2 +- tests/entrypoints/openai/test_vision.py | 2 +- .../openai/test_vision_embedding.py | 4 +- tests/entrypoints/test_chat_utils.py | 39 +- tests/lora/test_worker.py | 5 - .../model_executor/test_guided_processors.py | 10 +- tests/models/language/pooling/embed_utils.py | 2 +- tests/models/language/pooling/mteb_utils.py | 4 +- .../models/language/pooling/test_embedding.py | 2 +- tests/models/language/pooling/test_gritlm.py | 13 +- tests/models/language/pooling/test_jina.py | 2 +- .../pooling/test_nomic_max_model_len.py | 20 +- tests/models/language/pooling/test_scoring.py | 18 +- .../pooling/test_truncation_control.py | 6 +- .../multimodal/generation/test_common.py | 2 +- .../generation/test_granite_speech.py | 2 +- .../multimodal/generation/test_interleaved.py | 2 +- .../multimodal/generation/test_phi4mm.py | 2 +- .../multimodal/generation/test_qwen2_vl.py | 2 +- .../multimodal/generation/vlm_utils/core.py | 6 +- .../multimodal/generation/vlm_utils/types.py | 4 +- .../multimodal/pooling/test_dse_qwen2_vl.py | 2 +- .../pooling/test_jinavl_reranker.py | 2 +- .../multimodal/pooling/test_llava_next.py | 2 +- tests/models/multimodal/pooling/test_phi3v.py | 2 +- .../multimodal/processing/test_common.py | 2 +- tests/models/multimodal/test_mapping.py | 2 +- .../models/quantization/test_bitsandbytes.py | 2 +- tests/models/utils.py | 6 +- tests/multimodal/test_processing.py | 25 +- tests/quantization/test_configs.py | 10 +- tests/test_config.py | 144 ++++--- tests/test_sampling_params.py | 5 - tests/v1/core/test_kv_cache_utils.py | 12 +- 
tests/v1/core/test_scheduler.py | 3 - tests/v1/core/utils.py | 3 - tests/v1/kv_connector/unit/utils.py | 3 - tests/v1/spec_decode/test_eagle.py | 9 +- tests/v1/spec_decode/test_ngram.py | 9 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 4 - tests/v1/worker/test_gpu_model_runner.py | 4 - vllm/config.py | 374 ++++++++++-------- vllm/engine/arg_utils.py | 31 +- vllm/entrypoints/llm.py | 67 ++-- vllm/model_executor/model_loader/utils.py | 19 +- vllm/model_executor/models/registry.py | 4 +- vllm/transformers_utils/tokenizer_group.py | 12 +- 83 files changed, 624 insertions(+), 628 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index e820ace4f8fe..a999067817ef 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -279,7 +279,7 @@ Here is a simple example using Phi-3.5-Vision. First, launch the OpenAI-compatible server: ```bash -vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ +vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \ --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' ``` @@ -358,7 +358,7 @@ Instead of `image_url`, you can pass a video file via `video_url`. Here is a sim First, launch the OpenAI-compatible server: ```bash -vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192 +vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --runner generate --max-model-len 8192 ``` Then, you can use the OpenAI client as follows: diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index 6f5616e05d8c..83993bd0140f 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -34,7 +34,7 @@ Prompt embeddings are passed in as base64 encoded torch tensors. First, launch the OpenAI-compatible server: ```bash -vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \ +vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \ --max-model-len 4096 --enable-prompt-embeds ``` diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 21ad115e411a..4eeb002fbb71 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -6,8 +6,8 @@ In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text. -For generative models, the only supported `--task` option is `"generate"`. -Usually, this is automatically inferred so you don't have to specify it. +For model architectures that support both generation and pooling, you should set `--runner generate` +to use the model as a generative model. ## Offline Inference diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 741ae2d79c1e..cd67943ece40 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -3,7 +3,7 @@ vLLM also supports pooling models, including embedding, reranking and reward models. In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface. -These models use a [Pooler][vllm.model_executor.layers.Pooler] to extract the final hidden states of the input +These models use a [Pooler][vllm.model_executor.layers.pooling.Pooler] to extract the final hidden states of the input before returning them. !!! 
note @@ -11,14 +11,17 @@ before returning them. As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. -If the model doesn't implement this interface, you can set `--task` which tells vLLM +If the model doesn't implement this interface, you can set `--convert` which tells vLLM to convert the model into a pooling model. -| `--task` | Model type | Supported pooling tasks | -|------------|----------------------|-------------------------------| -| `embed` | Embedding model | `encode`, `embed` | -| `classify` | Classification model | `encode`, `classify`, `score` | -| `reward` | Reward model | `encode` | +| `--convert` | Model type | Supported pooling tasks | +|-------------|----------------------|-------------------------------| +| `embed` | Embedding model | `encode`, `embed` | +| `classify` | Classification model | `encode`, `classify`, `score` | +| `reward` | Reward model | `encode` | + +For model architectures that support both generation and pooling, you should set `--runner pooling` +to use the model as a pooling model. ## Pooling Tasks @@ -33,7 +36,7 @@ In vLLM, we define the following pooling tasks and corresponding APIs: \*The `score` API falls back to `embed` task if the model does not support `score` task. -Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.Pooler.get_supported_tasks]. +Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.pooling.Pooler.get_supported_tasks]. By default, the pooler assigned to each task has the following attributes: @@ -70,7 +73,7 @@ It returns the extracted hidden states directly, which is useful for reward mode ```python from vllm import LLM -llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") +llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", runner="pooling") (output,) = llm.encode("Hello, my name is") data = output.outputs.data @@ -85,7 +88,7 @@ It is primarily designed for embedding models. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") +llm = LLM(model="intfloat/e5-mistral-7b-instruct", runner="pooling") (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding @@ -102,7 +105,7 @@ It is primarily designed for classification models. ```python from vllm import LLM -llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling") (output,) = llm.classify("Hello, my name is") probs = output.outputs.probs @@ -123,7 +126,7 @@ It is designed for embedding models and cross encoder models. 
Embedding models u ```python from vllm import LLM -llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") +llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") (output,) = llm.score("What is the capital of France?", "The capital of Brazil is Brasilia.") @@ -175,7 +178,7 @@ You can change the output dimensions of embedding models that support Matryoshka from vllm import LLM, PoolingParams llm = LLM(model="jinaai/jina-embeddings-v3", - task="embed", + runner="pooling", trust_remote_code=True) outputs = llm.embed(["Follow the white rabbit."], pooling_params=PoolingParams(dimensions=32)) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index c8b6c6c86120..541e6c2f31c3 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -1,7 +1,6 @@ # Supported Models vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks. -If a model supports more than one task, you can set the task via the `--task` argument. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. @@ -24,7 +23,7 @@ To check if the modeling backend is Transformers, you can simply do this: ```python from vllm import LLM -llm = LLM(model=..., task="generate") # Name or path of your model +llm = LLM(model=...) # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` @@ -158,13 +157,13 @@ The [Transformers backend][transformers-backend] enables you to run models direc ```python from vllm import LLM - # For generative models (task=generate) only - llm = LLM(model=..., task="generate") # Name or path of your model + # For generative models (runner=generate) only + llm = LLM(model=..., runner="generate") # Name or path of your model output = llm.generate("Hello, my name is") print(output) - # For pooling models (task={embed,classify,reward,score}) only - llm = LLM(model=..., task="embed") # Name or path of your model + # For pooling models (runner=pooling) only + llm = LLM(model=..., runner="pooling") # Name or path of your model output = llm.encode("Hello, my name is") print(output) ``` @@ -281,13 +280,13 @@ And use with `trust_remote_code=True`. ```python from vllm import LLM -llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) +llm = LLM(model=..., revision=..., runner=..., trust_remote_code=True) -# For generative models (task=generate) only +# For generative models (runner=generate) only output = llm.generate("Hello, my name is") print(output) -# For pooling models (task={embed,classify,reward,score}) only +# For pooling models (runner=pooling) only output = llm.encode("Hello, my name is") print(output) ``` @@ -312,8 +311,6 @@ See [this page](generative_models.md) for more information on how to use generat #### Text Generation -Specified using `--task generate`. -
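
As a quick reference for the options this patch documents, below is a minimal CLI sketch. It is illustrative only: the model names are placeholders taken from examples elsewhere in this patch, and the `--runner`/`--convert` values follow the table added to `docs/models/pooling_models.md`.

```bash
# Illustrative usage of the replacement flags; model names are placeholders, not part of this patch.

# Pick the runner explicitly for an architecture that supports both generation and pooling:
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate --trust-remote-code

# Run an embedding model with the pooling runner:
vllm serve intfloat/e5-mistral-7b-instruct --runner pooling

# Convert a model that has no pooling interface into an embedding model:
vllm serve meta-llama/Llama-3.2-1B-Instruct --convert embed
```

Per the updated docs, `--runner` selects between the generate and pooling runners, while `--convert` (with `embed`, `classify`, or `reward`) adapts a model that does not implement the pooling interface into the corresponding pooling model.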