Skip to content

Commit c21d8b8

Browse files
youkaichao authored and jimpang committed
[misc][distributed] use 127.0.0.1 for single-node (vllm-project#5619)
1 parent 9ea2366 commit c21d8b8

File tree

2 files changed

+15
-2
lines changed

2 files changed

+15
-2
lines changed

vllm/executor/multiproc_gpu_executor.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from vllm.logger import init_logger
1111
from vllm.sequence import ExecuteModelRequest, SamplerOutput
1212
from vllm.utils import (cuda_device_count_stateless,
13-
get_distributed_init_method, get_ip, get_open_port,
13+
get_distributed_init_method, get_open_port,
1414
get_vllm_instance_id, make_async)
1515

1616
logger = init_logger(__name__)
@@ -37,8 +37,11 @@ def _init_executor(self) -> None:
3737
assert world_size <= cuda_device_count_stateless(), (
3838
"please set tensor_parallel_size to less than max local gpu count")
3939

40+
# Multiprocessing-based executor does not support multi-node setting.
41+
# Since it only works for single node, we can use the loopback address
42+
# 127.0.0.1 for communication.
4043
distributed_init_method = get_distributed_init_method(
41-
get_ip(), get_open_port())
44+
"127.0.0.1", get_open_port())
4245

4346
if world_size == 1:
4447
self.workers = []

vllm/executor/ray_gpu_executor.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,16 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
161161
self._run_workers("update_environment_variables",
162162
all_args=all_args_to_update_environment_variables)
163163

164+
if len(node_gpus) == 1:
165+
# in single node case, we don't need to get the IP address.
166+
# the loopback address is sufficient
167+
# NOTE: a node may have several IP addresses, one for each
168+
# network interface. `get_ip()` might return any of them,
169+
# while they might not work for communication inside the node
170+
# if the network setup is complicated. Using the loopback address
171+
# solves this issue, as it always works for communication inside
172+
# the node.
173+
driver_ip = "127.0.0.1"
164174
distributed_init_method = get_distributed_init_method(
165175
driver_ip, get_open_port())
166176

0 commit comments

Comments (0)