Commit 01edcd4

ezyang authored and pytorchmergebot committed
Make distributed modules importable even when backend not built (pytorch#159889)
This PR is greatly simplified now that it is stacked on top of a PR that builds with distributed always. We only need to stub functions that may not be defined due to a backend not being enabled.

Signed-off-by: Edward Yang <[email protected]>
Pull Request resolved: pytorch#159889
Approved by: https://github.com/wconstab
ghstack dependencies: pytorch#160449
1 parent de893e9 commit 01edcd4
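
For context on the approach: since _C._distributed_c10d is now always built, keeping modules importable only requires a Python-level fallback for backend-specific symbols. Below is a minimal illustrative sketch of that try/except import-fallback pattern, assuming the real bindings are preferred when present; the actual wiring lives in files not shown in this excerpt.

# Illustrative sketch only, not the literal wiring from this PR.
try:
    # Real binding, present only when PyTorch was built with NCCL support.
    from torch._C._distributed_c10d import ProcessGroupNCCL
except ImportError:
    # Pure-Python placeholder so `import torch.distributed` still succeeds.
    from torch.distributed._C_stubs import ProcessGroupNCCL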

21 files changed: +641 −235 lines


.ci/pytorch/macos-test.sh

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
 fi
 popd

+python -mpip install -r requirements.txt
+
 # enable debug asserts in serialization
 export TORCH_SERIALIZATION_DEBUG=1

test/distributed/tensor/test_fake.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]

import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed.tensor import DTensor
from torch.distributed.tensor.placement_types import Shard
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.distributed.fake_pg import FakeStore


class TestFakeDTensor(TestCase):
    def test_fake_dtensor_operations(self):
        # Use FakeTensorMode to handle CUDA tensors without actual CUDA
        fake_mode = FakeTensorMode()
        world_size = 4

        fake_store = FakeStore()
        torch.distributed.init_process_group(
            "fake", store=fake_store, rank=0, world_size=world_size
        )
        device_mesh = torch.distributed.device_mesh.init_device_mesh(
            "cuda",
            (2, world_size // 2),
        )

        # Create fake CUDA tensor using FakeTensorMode
        with fake_mode:
            x = torch.randn(1, 1, device="cuda")
            x = DTensor.from_local(x, device_mesh, [Shard(0), Shard(1)])

            # Test basic DTensor operations
            self.assertIsInstance(x, DTensor)

            # Test sum operation
            r = x.sum(1)
            self.assertIsInstance(r, DTensor)


if __name__ == "__main__":
    run_tests()

test/test_numa_binding.py

Lines changed: 3 additions & 2 deletions
@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from multiprocessing.context import SpawnProcess
 from typing import Any, Optional
-from unittest import skipUnless
+from unittest import skipIf, skipUnless
 from unittest.mock import mock_open, patch

 import torch
@@ -22,7 +22,7 @@
     AffinityMode,
     NumaOptions,
 )
-from torch.testing._internal.common_utils import run_tests, TestCase
+from torch.testing._internal.common_utils import IS_MACOS, run_tests, TestCase


 @dataclass(frozen=True)
@@ -680,6 +680,7 @@ def test_core_complex_tiebreak_prefers_lower_cache_key(self) -> None:
             set(range(0, 2)),
         )

+    @skipIf(IS_MACOS, "sched_getaffinity doesn't exist")
     def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None:
         self._add_mock_hardware(
             num_sockets=1,

torch/_C/_distributed_c10d.pyi

Lines changed: 9 additions & 0 deletions
@@ -851,3 +851,12 @@ class ProcessGroupXCCL(Backend):

 def _set_process_group(pg: ProcessGroup) -> None: ...
 def _current_process_group() -> ProcessGroup: ...
+def _dump_nccl_trace_json(
+    includeCollectives: Optional[bool] = ...,
+    onlyActive: Optional[bool] = ...,
+) -> bytes: ...
+def _dump_nccl_trace(
+    includeCollectives: Optional[bool] = ...,
+    includeStackTraces: Optional[bool] = ...,
+    onlyActive: Optional[bool] = ...,
+) -> bytes: ...

torch/distributed/_C_stubs.py

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
# mypy: allow-untyped-defs
"""
Python stubs for backend-specific distributed components.

Since _C._distributed_c10d always exists now, this module only provides
stubs for backend-specific functionality that may not be available in all builds
(e.g., NCCL, UCC, MPI, Gloo, etc.).
"""

from __future__ import annotations

from typing import Optional, TYPE_CHECKING

from torch._C._distributed_c10d import Store


if TYPE_CHECKING:
    from datetime import timedelta

    import torch


# Store classes
class HashStore(Store):
    """Stub HashStore for builds without this functionality."""

    def __init__(self, *args, **kwargs):
        self._data = {}

    def set(self, key: str, value: str):
        self._data[key] = value

    def get(self, key: str) -> bytes:
        return self._data.get(key, "").encode()


# Backend-specific process group stubs
class ProcessGroupMPI:
    """Stub ProcessGroupMPI for non-MPI builds."""

    def __init__(self, *args, **kwargs):
        pass


class ProcessGroupNCCL:
    """Stub ProcessGroupNCCL for non-NCCL builds."""

    def __init__(self, *args, **kwargs):
        pass


class ProcessGroupGloo:
    """Stub ProcessGroupGloo for non-Gloo builds."""

    def __init__(self, *args, **kwargs):
        pass


class ProcessGroupUCC:
    """Stub ProcessGroupUCC for non-UCC builds."""

    def __init__(self, *args, **kwargs):
        pass


class ProcessGroupXCCL:
    """Stub ProcessGroupXCCL for non-XCCL builds."""

    def __init__(self, *args, **kwargs):
        pass


class _ProcessGroupWrapper:
    """Stub _ProcessGroupWrapper for non-Gloo builds."""

    def __init__(self, process_group, *args, **kwargs):
        self._process_group = process_group

    def __getattr__(self, name):
        return getattr(self._process_group, name)


# NCCL-specific function stubs
_DEFAULT_PG_NCCL_TIMEOUT: Optional[timedelta] = None


def _hash_tensors(tensors):
    """Stub function to hash tensors - returns dummy hash."""
    return 0


def _dump_nccl_trace_json(
    includeCollectives: Optional[bool] = None, onlyActive: Optional[bool] = None
) -> bytes:
    """Stub function that returns empty JSON trace."""
    return b"{}"


def _dump_nccl_trace(
    includeCollectives: Optional[bool] = None,
    includeStackTraces: Optional[bool] = None,
    onlyActive: Optional[bool] = None,
) -> bytes:
    """Stub function that returns empty pickle trace."""
    return b""


# NVSHMEM/SymmetricMemory stubs
def _is_nvshmem_available() -> bool:
    """Stub function that returns False indicating NVSHMEM is not available."""
    return False


def _nvshmemx_cumodule_init(module: int) -> None:
    """Stub function for NVSHMEM CU module initialization."""


class _SymmetricMemory:
    """Stub _SymmetricMemory class for builds without this functionality."""

    def __init__(self, *args, **kwargs):
        pass

    @classmethod
    def empty_strided_p2p(cls, size, stride, dtype, device, group_name=None):
        """Stub that returns a regular tensor."""
        return torch.empty(size, dtype=dtype, device=device)

    @classmethod
    def rendezvous(cls, tensor, group_name=None):
        """Stub that returns None."""
        return None

    @classmethod
    def set_group_info(cls, *args, **kwargs):
        """Stub that does nothing."""

    @classmethod
    def set_backend(cls, name):
        """Stub that does nothing."""

    @classmethod
    def get_backend(cls, device):
        """Stub that returns None."""
        return None

    @classmethod
    def has_multicast_support(cls, device_type, device_index):
        """Stub that returns False."""
        return False
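
A rough usage sketch, not part of the diff: on a build where these stubs are what gets imported, the functions are deliberately inert, so callers can probe capabilities and degrade gracefully instead of failing at import time.

# Hypothetical example calling the stub module directly on a build where the
# corresponding backends are not compiled in.
from torch.distributed import _C_stubs

print(_C_stubs._is_nvshmem_available())  # False: NVSHMEM not available
print(_C_stubs._dump_nccl_trace_json())  # b'{}': empty JSON trace
print(_C_stubs._hash_tensors([]))        # 0: dummy hash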
