vllm-project · youkaichao · Sep 12, 2024 · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
@@ -1,4 +1,3 @@
-import os
 import subprocess
 import sys
 import time
@@ -26,8 +25,7 @@ def _query_server_long(prompt: str) -> dict:
 
 
 @pytest.fixture
-def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
-               worker_use_ray: bool):
+def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     commands = [
@@ -37,25 +35,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
         str(tokenizer_pool_size)
     ]
 
-    # Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
-    # to prevent `--engine-use-ray` raises an exception due to it deprecation
-    env_vars = os.environ.copy()
-    env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
-
-    if engine_use_ray:
-        commands.append("--engine-use-ray")
     if worker_use_ray:
         commands.append("--worker-use-ray")
-    uvicorn_process = subprocess.Popen(commands, env=env_vars)
+    uvicorn_process = subprocess.Popen(commands)
     yield
     uvicorn_process.terminate()
 
 
 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
 @pytest.mark.parametrize("worker_use_ray", [False, True])
-@pytest.mark.parametrize("engine_use_ray", [False, True])
-def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
-                    engine_use_ray: bool):
+def test_api_server(api_server, tokenizer_pool_size: int,
+                    worker_use_ray: bool):
     """
     Run the API server and test it.
 

diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
@@ -1,5 +1,4 @@
 import asyncio
-import os
 from asyncio import CancelledError
 from dataclasses import dataclass
 from typing import Optional
@@ -72,14 +71,12 @@ def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
 
 
 class MockAsyncLLMEngine(AsyncLLMEngine):
-
-    def _init_engine(self, *args, **kwargs):
-        return MockEngine()
+    _engine_class = MockEngine
 
 
 @pytest.mark.asyncio
 async def test_new_requests_event():
-    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
+    engine = MockAsyncLLMEngine(worker_use_ray=False)
     engine.start_background_loop()
     await asyncio.sleep(0.01)
     assert engine.engine.step_calls == 0
@@ -112,16 +109,11 @@ async def test_new_requests_event():
     assert engine.engine.add_request_calls == 3
     assert engine.engine.step_calls == old_step_calls + 1
 
-    # Allow deprecated engine_use_ray to not raise exception
-    os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
-
-    engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
+    engine = MockAsyncLLMEngine(worker_use_ray=True)
     assert engine.get_model_config() is not None
     assert engine.get_tokenizer() is not None
     assert engine.get_decoding_config() is not None
 
-    os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
-
 
 def start_engine():
     wait_for_gpu_memory_to_clear(

diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server.py
@@ -19,16 +19,11 @@ def server():
         "--max-model-len",
         "2048",
         "--enforce-eager",
-        "--engine-use-ray",
         "--chat-template",
         str(chatml_jinja_path),
     ]
 
-    # Allow `--engine-use-ray`, otherwise the launch of the server throw
-    # an error due to try to use a deprecated feature
-    env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
-    with RemoteOpenAIServer(MODEL_NAME, args,
-                            env_dict=env_dict) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
 

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -1035,24 +1035,13 @@ def create_engine_config(self) -> EngineConfig:
 @dataclass
 class AsyncEngineArgs(EngineArgs):
     """Arguments for asynchronous vLLM engine."""
-    engine_use_ray: bool = False
     disable_log_requests: bool = False
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser,
                      async_args_only: bool = False) -> FlexibleArgumentParser:
         if not async_args_only:
             parser = EngineArgs.add_cli_args(parser)
-        parser.add_argument('--engine-use-ray',
-                            action='store_true',
-                            help='Use Ray to start the LLM engine in a '
-                            'separate process as the server process.'
-                            '(DEPRECATED. This argument is deprecated '
-                            'and will be removed in a future update. '
-                            'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
-                            'use it. See '
-                            'https://github.com/vllm-project/vllm/issues/7045.'
-                            ')')
         parser.add_argument('--disable-log-requests',
                             action='store_true',
                             help='Disable logging requests.')