Skip to content

Commit b13a5a9

Browse files
authored
[None][chore] Add tests for non-existent and completed request cancellation (#6840)
Signed-off-by: Aurelien Chartier <[email protected]>
1 parent 5c2f0fd commit b13a5a9

File tree

2 files changed

+121
-0
lines changed

2 files changed

+121
-0
lines changed

tests/integration/defs/triton_server/test_triton_llm.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3726,6 +3726,12 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
37263726
output = venv_check_output(llm_backend_venv, run_cmd)
37273727
assert 'Request is cancelled' in output
37283728

3729+
# Test request cancellation for non-existing request and completed request
3730+
run_cmd = [
3731+
f"{llm_backend_repo_root}/tools/tests/test_llmapi_cancel.py"
3732+
]
3733+
output = venv_check_output(llm_backend_venv, run_cmd)
3734+
37293735

37303736
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
37313737
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#!/usr/bin/env python
2+
# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions
6+
# are met:
7+
# * Redistributions of source code must retain the above copyright
8+
# notice, this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright
10+
# notice, this list of conditions and the following disclaimer in the
11+
# documentation and/or other materials provided with the distribution.
12+
# * Neither the name of NVIDIA CORPORATION nor the names of its
13+
# contributors may be used to endorse or promote products derived
14+
# from this software without specific prior written permission.
15+
#
16+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
28+
import os
29+
import sys
30+
from functools import partial
31+
32+
import numpy as np
33+
from tritonclient import grpc as grpcclient
34+
from tritonclient.utils import InferenceServerException
35+
36+
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/..')
37+
from llmapi_client import (UserData, _prepare_inputs, callback,
38+
prepare_stop_signals)
39+
40+
# Name of the Triton model every request in this script is sent to.
MODEL_NAME = "tensorrt_llm"


def _submit(client, tensors, request_id, user_data):
    """Send one non-streaming async request and return its call handle.

    The response (or error) is delivered through ``callback`` bound to
    *user_data*; the handle returned here is only needed by callers that
    want to cancel the call on the client side.
    """
    return client.async_infer(MODEL_NAME,
                              tensors,
                              request_id=str(request_id),
                              callback=partial(callback, user_data),
                              parameters={'Streaming': False})


def _next_outcome(user_data):
    """Block until the next outcome is available on *user_data* and return it.

    NOTE(review): presumably ``callback`` pushes either the inference result
    or the error object onto ``_completed_requests`` -- confirm against
    ``llmapi_client``.
    """
    return user_data._completed_requests.get()


def _expect_cancelled(user_data):
    """Assert that the next outcome is a CANCELLED server exception."""
    outcome = _next_outcome(user_data)
    assert isinstance(outcome, InferenceServerException)
    assert outcome.status() == "StatusCode.CANCELLED"


def _expect_output(user_data):
    """Return the next outcome, failing clearly if it is a server error.

    Without this check an error outcome would only surface later as an
    ``AttributeError`` on ``as_numpy``, hiding the real failure reason.
    """
    outcome = _next_outcome(user_data)
    assert not isinstance(outcome, InferenceServerException), (
        f"Inference request failed: {outcome}")
    return outcome


if __name__ == "__main__":
    input_data = np.array([
        "The current time is",
    ], dtype=object)
    output_len = 100
    inputs = _prepare_inputs(input_data, output_len)

    stop_inputs = prepare_stop_signals()
    request_id = 1
    user_data = UserData()
    with grpcclient.InferenceServerClient(
            url="localhost:8001",
            verbose=False,
            ssl=False,
            root_certificates=None,
            private_key=None,
            certificate_chain=None,
    ) as triton_client:

        # Send stop request for non-existing request
        # (nothing has been submitted under this request id yet).
        _submit(triton_client, stop_inputs, request_id, user_data)
        _expect_cancelled(user_data)

        # Send actual request
        infer_response = _submit(triton_client, inputs, request_id, user_data)
        result = _expect_output(user_data)
        first_text = result.as_numpy("text_output")[0].decode("utf-8")
        print(f'Output text: {first_text}')

        # Cancel request after it is completed
        infer_response.cancel()

        # Send stop request for completed request
        _submit(triton_client, stop_inputs, request_id, user_data)
        _expect_cancelled(user_data)

        # Send a second request to check if server is still healthy
        _submit(triton_client, inputs, request_id + 1, user_data)

        # Get result of second request
        result_2 = _expect_output(user_data)
        print('Got completed request')

        second_text = result_2.as_numpy("text_output")[0].decode("utf-8")
        print(f'Output text: {second_text}')

        # Check that both results match
        assert np.array_equal(result.as_numpy("text_output"),
                              result_2.as_numpy("text_output"))

0 commit comments

Comments
 (0)