
[Bug]: accuracy test failed due to unexpected keyword argument 'prompt_token_ids' #2865

@Yikun

Description

Your current environment

https://github.com/vllm-project/vllm-ascend/actions/runs/17617228019/job/50053429133?pr=2864

🐛 Describe the bug

=================================== FAILURES ===================================
_______________ test_lm_eval_correctness_param[config_filename0] _______________

config_filename = PosixPath('/__w/vllm-ascend/vllm-ascend/tests/e2e/models/configs/Qwen3-8B-Base.yaml')
tp_size = '1', report_dir = './benchmarks/accuracy'
env_config = EnvConfig(vllm_version='0.1.dev1', vllm_commit='b8a9307', vllm_ascend_version='refs/pull/2864/merge', vllm_ascend_commit='f890241', cann_version='8.2.RC1', torch_version='2.7.1', torch_npu_version='2.7.1.dev20250724')

    def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
                                       env_config):
        eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
        model_args = build_model_args(eval_config, tp_size)
        success = True
        report_data: dict[str, list[dict]] = {"rows": []}
    
        eval_params = {
            "model": eval_config.get("model", "vllm"),
            "model_args": model_args,
            "tasks": [task["name"] for task in eval_config["tasks"]],
            "apply_chat_template": eval_config.get("apply_chat_template", True),
            "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
            "limit": eval_config.get("limit", None),
            "batch_size": "auto",
        }
        for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]:
            val = eval_config.get(s, None)
            if val is not None:
                eval_params[s] = val
    
        print("Eval Parameters:")
        print(eval_params)
    
>       results = lm_eval.simple_evaluate(**eval_params)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

tests/e2e/models/test_lm_eval_correctness.py:123: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/usr/local/python3.11.13/lib/python3.11/site-packages/lm_eval/utils.py:422: in _wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
/usr/local/python3.11.13/lib/python3.11/site-packages/lm_eval/evaluator.py:308: in simple_evaluate
    results = evaluate(
/usr/local/python3.11.13/lib/python3.11/site-packages/lm_eval/utils.py:422: in _wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
/usr/local/python3.11.13/lib/python3.11/site-packages/lm_eval/evaluator.py:528: in evaluate
    resps = getattr(lm, reqtype)(cloned_reqs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
/usr/local/python3.11.13/lib/python3.11/site-packages/lm_eval/api/model.py:382: in loglikelihood
    return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
/usr/local/python3.11.13/lib/python3.11/site-packages/lm_eval/models/vllm_causallms.py:473: in _loglikelihood_tokens
    outputs = self._model_generate(requests=inputs, generate=False)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <lm_eval.models.vllm_causallms.VLLM object at 0xfffdd0333b10>
requests = [[151644, 8948, 198, 87752, 105196, 101888, ...], [151644, 8948, 198, 87752, 105196, 101888, ...], [151644, 8948, 198,...6, 101888, ...], [151644, 8948, 198, 87752, 105196, 101888, ...], [151644, 8948, 198, 87752, 105196, 101888, ...], ...]
generate = False, max_tokens = None, stop = None, kwargs = {}

    def _model_generate(
        self,
        requests: List[List[int]] = None,
        generate: bool = False,
        max_tokens: int = None,
        stop: Optional[List[str]] = None,
        **kwargs,
    ):
        if generate:
            kwargs = self.modify_gen_kwargs(kwargs)
            sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
        else:
            sampling_params = SamplingParams(
                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
        if self.data_parallel_size > 1:
            # vLLM hangs if resources are set in ray.remote
            # also seems to only work with decorator and not with ray.remote() fn
            # see https://github.com/vllm-project/vllm/issues/973
            @ray.remote
            def run_inference_one_model(
                model_args: dict,
                sampling_params: SamplingParams,
                requests: List[List[int]],
                lora_request: LoRARequest,
            ):
                llm = LLM(**model_args)
                return llm.generate(
