Commit bb1424d

wconstab authored and pytorchmergebot committed
Reland #2 "[C10] PG observability hooks. (pytorch#108815, pytorch#110907)" (pytorch#111072)
This reverts commit 314a502.

Changes since the original PR:

Reland 1
* rename torch.distributed.hooks to torch.distributed._hooks

Reland 2
* make _hooks importable even if !distributed.is_available()
* handle an intermittent CUDA-driver-exit failure caused by new CUDA API usage in the callback caller (see the previous PR in this stack)

(original PR pytorch#108815 description copied below)

Expose a set of observability hooks into C10D so that users can detect collective failures both faster and more easily. The design is similar to NCCL desync debug in that it minimizes overhead by doing most of the work off the main thread.

This PR introduces a new module, torch.distributed.hooks, that exposes the following methods:

* register_collective_start_hook
* register_collective_end_hook
* register_process_group_hook

The process group hook reports PG creation on the member ranks and is called inline from the PG creation code. This is fine since it happens during initialization and only a limited number of times.

The collective start/end hooks are fired from a single background thread that reads events from a C++ queue and dispatches them to the registered callbacks. Queue notification is done, somewhat unusually, through a pipe: this lets Python abort the thread on shutdown and keep it as a background thread, which is not possible with more conventional choices such as a condition variable.

Pull Request resolved: pytorch#111072
Approved by: https://github.com/malfet
ghstack dependencies: pytorch#111061
1 parent dede1e9 commit bb1424d
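
Before the file-by-file diff, a minimal usage sketch of the new hooks API may help orient readers. It is not part of the commit; it is assembled from the API exercised by the new test/distributed/test_hooks.py shown below, and the status fields it prints (pg_name, backend, operation, sequence_number, drop_count) are simply the attributes those tests assert on.

import torch.distributed._hooks as dhooks


def on_pg_created(pg, pg_name):
    # Process-group hook: called inline from PG creation code on member ranks.
    print(f"pg created: {pg_name}")


def on_collective_start(status: dhooks.CollectiveStatus):
    # Collective hooks run on the single background dispatch thread.
    print(f"start {status.operation} pg={status.pg_name} seq={status.sequence_number}")


def on_collective_end(status: dhooks.CollectiveStatus):
    print(f"end {status.operation} backend={status.backend} dropped={status.drop_count}")


dhooks.register_process_group_hook(on_pg_created)
dhooks.register_collective_start_hook(on_collective_start)
dhooks.register_collective_end_hook(on_collective_end)

# With a process group initialized (e.g. dist.init_process_group("gloo", ...)),
# collectives such as dist.all_reduce(torch.ones(2, 3)) will fire the hooks above.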

File tree

17 files changed (+702, -29 lines)


build_variables.bzl

Lines changed: 1 addition & 0 deletions
@@ -522,6 +522,7 @@ libtorch_distributed_base_sources = [
     "torch/csrc/distributed/c10d/Backend.cpp",
     "torch/csrc/distributed/c10d/FileStore.cpp",
     "torch/csrc/distributed/c10d/GlooDeviceFactory.cpp",
+    "torch/csrc/distributed/c10d/Hooks.cpp",
     "torch/csrc/distributed/c10d/Ops.cpp",
     "torch/csrc/distributed/c10d/ParamCommsUtils.cpp",
     "torch/csrc/distributed/c10d/PrefixStore.cpp",

test/distributed/test_hooks.py

Lines changed: 270 additions & 0 deletions
@@ -0,0 +1,270 @@
# Owner(s): ["oncall: distributed"]

import os
import sys
import tempfile
import threading
from functools import partial, wraps

import torch
import torch.distributed as dist
import torch.distributed._hooks as dhooks

if not dist.is_available():
    print("torch.distributed not available, skipping tests", file=sys.stderr)
    sys.exit(0)


from torch.testing._internal.common_distributed import (
    MultiProcessTestCase,
    skip_if_lt_x_gpu,
)

from torch.testing._internal.common_utils import run_tests, TestCase


class PgHooks(MultiProcessTestCase):
    @property
    def world_size(self) -> int:
        return 4

    def setUp(self) -> None:
        super().setUp()
        self._spawn_processes()

    def tearDown(self):
        super().tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    def test_pg_hook(self):
        pgs = []

        def pg_hook(pg, pg_name):
            pgs.append((pg, pg_name))

        dhooks.register_process_group_hook(pg_hook)
        dist.init_process_group(
            backend="gloo",
            rank=self.rank,
            world_size=self.world_size,
            store=dist.FileStore(self.file_name, self.world_size),
        )
        self.assertEqual(len(pgs), 1)
        self.assertEqual(pgs[0][0], dist.group.WORLD)

        # create two partial-world PGs
        pg0 = dist.new_group(ranks=[0, 1])
        pg1 = dist.new_group(ranks=[2, 3])

        # Each rank only observes two PGs being created: the default PG and the one covering its ranks.
        # We don't emit events for PG creation if the current rank doesn't belong to it.
        # For example, if you're rank 1, you'll get an event for pg0 but not pg1, even though the API contract
        # dictates you need to call new_group for both.
        self.assertEqual(len(pgs), 2)
        self.assertEqual(pgs[1][0], pg0 if self.rank < 2 else pg1)


def with_comms(func=None):
    if func is None:
        return partial(
            with_comms,
        )

    @wraps(func)
    def wrapper(self, *args, **kwargs):
        self.init_comms()
        func(self, *args, **kwargs)
        self.destroy_comms()

    return wrapper


class CollectiveHooks:
    @property
    def world_size(self) -> int:
        return 4

    def _collective_hooks(self):
        # it's ok to access these lists directly since a single background thread pokes at them.
        starts = []
        ends = []
        cv = threading.Condition()

        def coll_start(status):
            starts.append(status)
            print(f"col_start {len(starts)} rank{self.rank}")

        def coll_end(status):
            ends.append(status)
            print(f"col_end {len(ends)} rank{self.rank}")
            if len(ends) == 2:
                with cv:
                    cv.notify()

        dhooks.register_collective_start_hook(coll_start)
        dhooks.register_collective_end_hook(coll_end)

        tensor = torch.ones([2, 3]).to(self.device) * self.rank
        tensor_list = [torch.empty_like(tensor) for _ in range(self.world_size)]

        dist.all_gather(tensor_list, tensor)

        tensor2 = torch.ones([2, 3]).to(self.device) * self.rank
        dist.all_reduce(tensor2)

        with cv:
            cv.wait(1)

        default_pg_name = dist.group.WORLD.group_name
        self.assertEqual(2, len(starts))
        self.assertEqual(2, len(ends))

        def check_op(idx, coll_name):
            self.assertEqual(default_pg_name, starts[idx].pg_name)
            self.assertEqual(self.backend_name, starts[idx].backend)
            self.assertGreaterEqual(starts[idx].sequence_number, 0)
            self.assertGreaterEqual(starts[idx].timestamp, 0)
            self.assertEqual(coll_name, starts[idx].operation)

            self.assertEqual(default_pg_name, ends[idx].pg_name)
            self.assertEqual(self.backend_name, ends[idx].backend)

            self.assertEqual(starts[idx].sequence_number, ends[idx].sequence_number)
            self.assertLessEqual(starts[idx].timestamp, ends[idx].timestamp)
            self.assertEqual(coll_name, ends[idx].operation)

        check_op(0, "ALLGATHER")
        check_op(1, "ALLREDUCE")


class GlooHooks(MultiProcessTestCase, CollectiveHooks):
    def setUp(self) -> None:
        super().setUp()
        self._spawn_processes()

    def tearDown(self):
        super().tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    def init_comms(self):
        dist.init_process_group(
            backend="gloo",
            rank=self.rank,
            world_size=self.world_size,
            store=dist.FileStore(self.file_name, self.world_size),
        )

    def destroy_comms(self):
        dist.destroy_process_group()

    @property
    def backend_name(self):
        return "gloo"

    @property
    def device(self):
        return "cpu"

    @with_comms
    def test_collective_hooks(self):
        self._collective_hooks()


class NcclHooks(MultiProcessTestCase, CollectiveHooks):
    def setUp(self) -> None:
        super().setUp()
        self._spawn_processes()

    def tearDown(self):
        super().tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    def init_comms(self):
        dist.init_process_group(
            backend="nccl",
            rank=self.rank,
            world_size=self.world_size,
            store=dist.FileStore(self.file_name, self.world_size),
        )

    def destroy_comms(self):
        dist.destroy_process_group()

    @property
    def backend_name(self):
        return "nccl"

    @property
    def device(self):
        return f"cuda:{self.rank}"

    @skip_if_lt_x_gpu(4)
    @with_comms
    def test_collective_hooks(self):
        self._collective_hooks()


class SingleRankTests(TestCase):
    def setUp(self) -> None:
        super().setUp()
        self.rank = 0
        self.file_name = tempfile.NamedTemporaryFile(delete=False).name
        dist.init_process_group(
            backend="gloo",
            rank=0,
            world_size=1,
            store=dist.FileStore(self.file_name, 1),
        )

    def tearDown(self) -> None:
        dist.destroy_process_group()

    def test_queue_overflow(self) -> None:
        cv_done_colls = threading.Condition()
        cv_done_cb = threading.Condition()
        colls_done = False
        starts = []
        status_with_dropped = None

        def coll_start(status: dhooks.CollectiveStatus):
            starts.append(status)
            with cv_done_colls:
                while not colls_done:
                    cv_done_colls.wait()
            if status.drop_count > 0:
                nonlocal status_with_dropped
                status_with_dropped = status
                with cv_done_cb:
                    cv_done_cb.notify()

        dhooks.register_collective_start_hook(coll_start)

        # native limit is 512
        for i in range(600):
            dist.all_reduce(torch.ones([2, 3]))
        colls_done = True
        with cv_done_colls:
            cv_done_colls.notify()

        with cv_done_cb:
            cv_done_cb.wait(10)

        self.assertTrue(status_with_dropped is not None)
        self.assertTrue(status_with_dropped.drop_count > 0)


if __name__ == "__main__":
    assert (
        not torch.cuda._initialized
    ), "test_distributed must not have initialized CUDA context on main process"

    run_tests()

torch/_C/_distributed_c10d.pyi

Lines changed: 6 additions & 0 deletions
@@ -11,6 +11,10 @@ _DEFAULT_FIRST_BUCKET_BYTES: int
 _DEFAULT_NO_TIMEOUT: timedelta
 _DEFAULT_PG_TIMEOUT: timedelta
 
+class EventKind(Enum):
+    START = ...
+    END = ...
+
 class BuiltinCommHookType(Enum):
     ALLREDUCE = ...
     FP16_COMPRESS = ...
@@ -20,6 +24,8 @@ def _register_builtin_comm_hook(
     reducer: Reducer,
     comm_hook_type: BuiltinCommHookType,
 ): ...
+def _dequeue_c10d_event() -> Dict[str, object]: ...
+def _enable_event_collection(pipe_fs: int) -> None: ...
 
 class GradBucket:
     def index(self) -> int: ...
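
The two private bindings added above are what the Python side of this PR consumes. The following is a rough sketch, not the actual torch/distributed/_hooks.py implementation (which is part of this PR but not shown in this excerpt), of how they could be wired together under the pipe-notification scheme the commit message describes: the C++ queue signals readiness through a pipe file descriptor, and a daemon thread drains events with _dequeue_c10d_event(). The one-byte-per-event signaling and the _start_event_consumer helper are assumptions for illustration.

import os
import threading

from torch._C._distributed_c10d import _dequeue_c10d_event, _enable_event_collection


def _start_event_consumer(callback):
    # Hypothetical helper: hand the write end of a pipe to C++ so it can signal new events.
    read_fd, write_fd = os.pipe()
    _enable_event_collection(write_fd)

    def _loop():
        while True:
            os.read(read_fd, 1)  # block until the C++ side signals an enqueued event
            callback(_dequeue_c10d_event())  # dict with pg_name, backend, operation, ...

    # Daemon thread so interpreter shutdown is not blocked waiting on the pipe.
    threading.Thread(target=_loop, name="c10d_event_consumer", daemon=True).start()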

torch/csrc/distributed/c10d/Backend.cpp

Lines changed: 34 additions & 0 deletions
@@ -1,9 +1,26 @@
 #include <c10/util/Logging.h>
 #include <fmt/format.h>
 #include <torch/csrc/distributed/c10d/Backend.hpp>
+#include <torch/csrc/distributed/c10d/Hooks.hpp>
+#include <torch/csrc/distributed/c10d/logging.h>
 
 namespace c10d {
 
+namespace {
+void commonEventinit(
+    details::EventInfo& evt,
+    const Backend& backend,
+    const Work& work) {
+  evt.timestamp =
+      std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+  evt.pg_name = backend.getGroupName();
+  evt.backend = backend.getBackendName();
+  evt.sequence_number = work.getSequencenumber();
+  evt.operation = c10d::opTypeToString(work.retrieveOpType());
+  evt.drop_count = 0;
+}
+} // namespace
+
 Backend::Backend(int rank, int size)
     : rank_(rank), size_(size), dist_debug_level_(debug_level()) {
   C10_LOG_API_USAGE_ONCE("c10d.backend");
@@ -15,4 +32,21 @@ void Backend::init() {
   C10_LOG_API_USAGE_ONCE(fmt::format("c10d.backend_{}", getBackendName()));
 }
 
+void Backend::emitCollectiveStart(const Work& work) {
+  details::EventInfo evt;
+  commonEventinit(evt, *this, work);
+
+  evt.event_kind = ::c10d::EventKind::CollectiveStart;
+  details::enqueue_c10d_event(std::move(evt));
+}
+
+void Backend::emitCollectiveEnd(const Work& work) {
+  details::EventInfo evt;
+  commonEventinit(evt, *this, work);
+
+  evt.event_kind = ::c10d::EventKind::CollectiveEnd;
+  evt.duration_ms = work.getDuration();
+  details::enqueue_c10d_event(std::move(evt));
+}
+
 } // namespace c10d

torch/csrc/distributed/c10d/Backend.hpp

Lines changed: 2 additions & 0 deletions
@@ -366,6 +366,8 @@ class TORCH_API Backend : public torch::CustomClassHolder {
   // Implementations of this interface need to call this to setup
   // appropriate logging etc.
   void init();
+  void emitCollectiveStart(const Work& work);
+  void emitCollectiveEnd(const Work& work);
 
   const int rank_;
   const int size_;
