From 6b4263814176d1d664d6da48044713903955a310 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 17 Jun 2024 15:24:59 -0700 Subject: [PATCH 1/2] do not error if two processes do not agree --- .../device_communicators/custom_all_reduce_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 75b7c374c8e6..b23b2f36b216 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -142,8 +142,13 @@ def can_actually_p2p( for src, tgt in zip(batch_src, batch_tgt): a = result_queue.get() b = result_queue.get() - assert a == b - result.append(a) + if a != b: + logger.warning( + "Two processes do not agree on the P2P access" + " status on %d -> %d, treat as disabled.", src, tgt) + result.append(False) + else: + result.append(a) return result From 8b23a06de456dfc8a302210e2e6e0cd437c56e46 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 17 Jun 2024 16:37:47 -0700 Subject: [PATCH 2/2] fix race condition --- vllm/distributed/device_communicators/custom_all_reduce_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index b23b2f36b216..e0641a54c419 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -71,6 +71,7 @@ def consumer(batch_tgt: Sequence[int], if open_success: # modify the memory lib.cudaMemset(pointer, 2, 1024) + lib.cudaDeviceSynchronize() # use two queues to simulate barrier producer_queue.get() consumer_queue.put(0)