vllm/executor/multiproc_gpu_executor.py (5 additions, 2 deletions)
@@ -10,7 +10,7 @@
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (cuda_device_count_stateless,
-                        get_distributed_init_method, get_ip, get_open_port,
+                        get_distributed_init_method, get_open_port,
                         get_vllm_instance_id, make_async)

 logger = init_logger(__name__)
@@ -37,8 +37,11 @@ def _init_executor(self) -> None:
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")

+        # Multiprocessing-based executor does not support multi-node setting.
+        # Since it only works for single node, we can use the loopback address
+        # 127.0.0.1 for communication.
         distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
+            "127.0.0.1", get_open_port())

         if world_size == 1:
             self.workers = []
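For context, a minimal sketch of how the rendezvous address is presumably assembled. It assumes get_open_port() asks the OS for a free port and get_distributed_init_method() formats a tcp:// init string for torch.distributed; the bodies below are illustrative stand-ins, not the vllm.utils implementations. Since every worker spawned by the multiprocessing executor runs on the same host, the loopback address is always reachable and get_ip() is unnecessary.

    import socket
    from contextlib import closing

    def get_open_port() -> int:
        # Bind to port 0 so the OS assigns a free ephemeral port, then release it.
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(("", 0))
            return s.getsockname()[1]

    def get_distributed_init_method(ip: str, port: int) -> str:
        # torch.distributed accepts a tcp://host:port rendezvous URL.
        return f"tcp://{ip}:{port}"

    # Single-node multiprocessing: loopback is routable to every worker process.
    distributed_init_method = get_distributed_init_method("127.0.0.1", get_open_port())
    print(distributed_init_method)  # e.g. tcp://127.0.0.1:51437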
vllm/executor/ray_gpu_executor.py (4 additions, 0 deletions)
@@ -161,6 +161,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         self._run_workers("update_environment_variables",
                           all_args=all_args_to_update_environment_variables)

+        if len(node_gpus) == 1:
+            # in single node case, we don't need to get the IP address.
+            # the loopback address is sufficient
+            driver_ip = "127.0.0.1"
         distributed_init_method = get_distributed_init_method(
             driver_ip, get_open_port())
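To make the new branch concrete, a small hedged sketch of the decision it encodes. It assumes node_gpus maps each Ray node to the GPU IDs placed on it, so a single entry means every worker landed on the driver's node; pick_driver_ip is a hypothetical helper written only for illustration, not part of ray_gpu_executor.py.

    def pick_driver_ip(node_gpus: dict, resolved_driver_ip: str) -> str:
        # One node in the placement => all workers share the driver's host,
        # so the loopback address is sufficient for the rendezvous.
        if len(node_gpus) == 1:
            return "127.0.0.1"
        # Multi-node: remote workers must dial a routable driver address.
        return resolved_driver_ip

    # Example: all GPUs on one node vs. GPUs spread across two nodes.
    print(pick_driver_ip({"node-a": [0, 1]}, "10.0.0.5"))              # 127.0.0.1
    print(pick_driver_ip({"node-a": [0], "node-b": [0]}, "10.0.0.5"))  # 10.0.0.5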