From af94eb2a670e9377277b7a58015d02c60c068627 Mon Sep 17 00:00:00 2001
From: Benjamin Chislett
Date: Fri, 28 Feb 2025 16:26:18 -0500
Subject: [PATCH 1/2] allow return_tokens_as_token_ids to be passed as a request param

Signed-off-by: Benjamin Chislett
---
 .../openai/test_return_tokens_as_ids.py       | 45 ++++++++++++-------
 vllm/entrypoints/openai/protocol.py           | 12 +++++
 vllm/entrypoints/openai/serving_chat.py       | 21 +++++----
 vllm/entrypoints/openai/serving_completion.py | 12 +++--
 4 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index 9b33eddae2a8..ebdff9b9c8cd 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -16,19 +16,28 @@
 from .test_completion import MODEL_NAME
 
 
-@pytest.fixture(scope="module")
-def server_with_return_tokens_as_token_ids_flag(
-        default_server_args):  # noqa: F811
-    args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
-    with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
-        yield remote_server
+@pytest.fixture(scope="module", params=[True, False])
+def server_fixture(request, args):
+    use_server_flag = request.param
+    if use_server_flag:
+        args_with_flag = args + ["--return-tokens-as-token-ids"]
+        with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
+            yield (remote_server, True)
+    else:
+        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+            yield (remote_server, False)
 
 
 @pytest.mark.asyncio
+@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
 async def test_completion_return_tokens_as_token_ids_completion(
-        server_with_return_tokens_as_token_ids_flag):
-    async with server_with_return_tokens_as_token_ids_flag.get_async_client(
-    ) as client:
+        server_fixture):
+    server, use_server_flag = server_fixture
+    request_args = {}
+    if not use_server_flag:
+        request_args["return_tokens_as_token_ids"] = True
+
+    async with server.get_async_client() as client:
         completion = await client.completions.create(
             model=MODEL_NAME,
@@ -39,7 +48,8 @@
             echo=True,
             temperature=0,
             max_tokens=10,
-            logprobs=1)
+            logprobs=1,
+            extra_body=request_args)
 
         text = completion.choices[0].text
         token_strs = completion.choices[0].logprobs.tokens
@@ -60,10 +70,14 @@
 
 
 @pytest.mark.asyncio
-async def test_chat_return_tokens_as_token_ids_completion(
-        server_with_return_tokens_as_token_ids_flag):
-    async with server_with_return_tokens_as_token_ids_flag.get_async_client(
-    ) as client:
+@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
+async def test_chat_return_tokens_as_token_ids_completion(server_fixture):
+    server, use_server_flag = server_fixture
+    request_args = {}
+    if not use_server_flag:
+        request_args["return_tokens_as_token_ids"] = True
+
+    async with server.get_async_client() as client:
         response = await client.chat.completions.create(
             model=MODEL_NAME,
             # Include Unicode characters to test for dividing a single
@@ -78,7 +92,8 @@
             }],
             temperature=0,
             max_tokens=8,
-            logprobs=True)
+            logprobs=True,
+            extra_body=request_args)
 
         text = response.choices[0].message.content
         tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 31214211cfc4..74b59559e91b 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -369,6 +369,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "arguments. For example: {'qualname': "
             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
             "{'param': 'value'}}."))
+    return_tokens_as_token_ids: Optional[bool] = Field(
+        default=None,
+        description=(
+            "If specified with 'logprobs', tokens are represented "
+            " as strings of the form 'token_id:{token_id}' so that tokens "
+            "that are not JSON-encodable can be identified."))
 
     # doc: end-chat-completion-extra-params
 
@@ -739,6 +745,12 @@ class CompletionRequest(OpenAIBaseModel):
             "arguments. For example: {'qualname': "
             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
             "{'param': 'value'}}."))
+    return_tokens_as_token_ids: Optional[bool] = Field(
+        default=None,
+        description=(
+            "If specified with 'logprobs', tokens are represented "
+            " as strings of the form 'token_id:{token_id}' so that tokens "
+            "that are not JSON-encodable can be identified."))
 
     # doc: end-completion-extra-params
 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 02dd2c4881c6..b08f7101dc69 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -452,6 +452,8 @@ async def chat_completion_stream_generator(
                             top_logprobs=output.logprobs,
                             tokenizer=tokenizer,
                             num_output_top_logprobs=request.top_logprobs,
+                            return_as_token_id=request.
+                            return_tokens_as_token_ids,
                         )
                     else:
                         logprobs = None
@@ -707,6 +709,7 @@ async def chat_completion_full_generator(
                     top_logprobs=out_logprobs,
                     num_output_top_logprobs=request.top_logprobs,
                     tokenizer=tokenizer,
+                    return_as_token_id=request.return_tokens_as_token_ids,
                 )
             else:
                 logprobs = None
@@ -854,13 +857,14 @@ async def chat_completion_full_generator(
 
     def _get_top_logprobs(
             self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int],
-            tokenizer: AnyTokenizer) -> List[ChatCompletionLogProb]:
+            tokenizer: AnyTokenizer,
+            should_return_as_token_id: bool) -> List[ChatCompletionLogProb]:
         return [
             ChatCompletionLogProb(token=(token := self._get_decoded_token(
                 p[1],
                 p[0],
                 tokenizer,
-                return_as_token_id=self.return_tokens_as_token_ids)),
+                return_as_token_id=should_return_as_token_id)),
                                   logprob=max(p[1].logprob, -9999.0),
                                   bytes=list(
                                       token.encode("utf-8", errors="replace")))
@@ -874,15 +878,18 @@ def _create_chat_logprobs(
         top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]],
         tokenizer: AnyTokenizer,
         num_output_top_logprobs: Optional[int] = None,
+        return_as_token_id: Optional[bool] = None,
     ) -> ChatCompletionLogProbs:
         """Create OpenAI-style logprobs."""
         logprobs_content: List[ChatCompletionLogProbsContent] = []
 
+        should_return_as_token_id = return_as_token_id if \
+            return_as_token_id is not None else self.return_tokens_as_token_ids
         for i, token_id in enumerate(token_ids):
             step_top_logprobs = top_logprobs[i]
             if step_top_logprobs is None:
                 token = tokenizer.decode(token_id)
-                if self.return_tokens_as_token_ids:
+                if should_return_as_token_id:
                     token = f"token_id:{token_id}"
 
                 logprobs_content.append(
@@ -900,16 +907,14 @@ def _create_chat_logprobs(
                         step_token,
                         token_id,
                         tokenizer,
-                        self.return_tokens_as_token_ids,
+                        should_return_as_token_id,
                     ),
                     logprob=max(step_token.logprob, -9999.0),
                     bytes=None if step_decoded is None else list(
                         step_decoded.encode("utf-8", errors="replace")),
                     top_logprobs=self._get_top_logprobs(
-                        step_top_logprobs,
-                        num_output_top_logprobs,
-                        tokenizer,
-                    ),
+                        step_top_logprobs, num_output_top_logprobs,
+                        tokenizer, should_return_as_token_id),
                 ))
 
         return ChatCompletionLogProbs(content=logprobs_content)
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 840f0f9b8448..f19dfc4341ee 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -317,6 +317,8 @@ async def completion_stream_generator(
                             num_output_top_logprobs=request.logprobs,
                             tokenizer=tokenizer,
                             initial_text_offset=previous_text_lens[i],
+                            return_as_token_id=request.
+                            return_tokens_as_token_ids,
                         )
                     else:
                         logprobs = None
@@ -443,6 +445,7 @@ def request_output_to_completion_response(
                     top_logprobs=out_logprobs,
                     tokenizer=tokenizer,
                     num_output_top_logprobs=request.logprobs,
+                    return_as_token_id=request.return_tokens_as_token_ids,
                 )
             else:
                 logprobs = None
@@ -484,6 +487,7 @@ def _create_completion_logprobs(
         num_output_top_logprobs: int,
         tokenizer: AnyTokenizer,
         initial_text_offset: int = 0,
+        return_as_token_id: Optional[bool] = None,
     ) -> CompletionLogProbs:
         """Create logprobs for OpenAI Completion API."""
         out_text_offset: List[int] = []
@@ -493,11 +497,13 @@ def _create_completion_logprobs(
 
         last_token_len = 0
 
+        should_return_as_token_id = return_as_token_id if \
+            return_as_token_id is not None else self.return_tokens_as_token_ids
         for i, token_id in enumerate(token_ids):
             step_top_logprobs = top_logprobs[i]
             if step_top_logprobs is None:
                 token = tokenizer.decode(token_id)
-                if self.return_tokens_as_token_ids:
+                if should_return_as_token_id:
                     token = f"token_id:{token_id}"
                 out_tokens.append(token)
@@ -510,7 +516,7 @@ def _create_completion_logprobs(
                     step_token,
                     token_id,
                     tokenizer,
-                    return_as_token_id=self.return_tokens_as_token_ids,
+                    return_as_token_id=should_return_as_token_id,
                 )
 
                 token_logprob = max(step_token.logprob, -9999.0)
@@ -527,7 +533,7 @@ def _create_completion_logprobs(
                     self._get_decoded_token(top_lp[1],
                                             top_lp[0],
                                             tokenizer,
-                                            return_as_token_id=self.return_tokens_as_token_ids):
+                                            return_as_token_id=should_return_as_token_id):
                     max(top_lp[1].logprob, -9999.0)
                     for i, top_lp in enumerate(step_top_logprobs.items())
                     if num_output_top_logprobs >= i

From f459e25566a00c6eeaf68bc606ace2adee724f1e Mon Sep 17 00:00:00 2001
From: Benjamin Chislett
Date: Tue, 4 Mar 2025 09:55:25 -0500
Subject: [PATCH 2/2] update tests

Signed-off-by: Benjamin Chislett
---
 tests/entrypoints/openai/test_return_tokens_as_ids.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index ebdff9b9c8cd..6474858642d7 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -16,15 +16,16 @@
 from .test_completion import MODEL_NAME
 
 
-@pytest.fixture(scope="module", params=[True, False])
-def server_fixture(request, args):
+@pytest.fixture(scope="module")
+def server_fixture(request, default_server_args):  # noqa: F811
     use_server_flag = request.param
     if use_server_flag:
-        args_with_flag = args + ["--return-tokens-as-token-ids"]
+        args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
         with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
            yield (remote_server, True)
     else:
-        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        with RemoteOpenAIServer(MODEL_NAME,
+                                default_server_args) as remote_server:
             yield (remote_server, False)
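
Usage note (illustrative, not part of either patch): with this change applied, a client can opt into the "token_id:<id>" token representation per request instead of starting the server with --return-tokens-as-token-ids. Below is a minimal sketch using the openai Python client against a vLLM OpenAI-compatible server; the base URL, API key, and served model name are assumptions for the example and should be replaced with real values.

    # Hypothetical client-side example; server address and model are placeholders.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    completion = client.completions.create(
        model="<served-model-name>",  # replace with the model vLLM is serving
        prompt="Hello, my name is",
        max_tokens=5,
        logprobs=1,
        # vLLM-specific extra body field added by this patch: applies the
        # token_id:<id> representation to this request only.
        extra_body={"return_tokens_as_token_ids": True},
    )

    # Each entry should look like "token_id:12345" rather than decoded text.
    print(completion.choices[0].logprobs.tokens)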