
Commit 0ed22fc

Matthew Hoffman authored and pytorchmergebot committed
Merge type stubs torch nn parallel (pytorch#102194)
Fixes the merge issue for pytorch#101528. In that PR, `torch.nn.parallel.parallel_apply.get_a_var` was marked private to appease the [public interface linter](https://github.com/pytorch/pytorch/actions/runs/4999216467/jobs/8955582204#step:14:21666): pytorch@ceeb242. This broke CI pipelines running external dependencies that expected `get_a_var`'s name not to change. In this PR, we change the name back to `get_a_var` and include it in `__all__` instead.

Pull Request resolved: pytorch#102194
Approved by: https://github.com/ezyang
1 parent 7b6438d commit 0ed22fc
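
For context on why the name matters: downstream code imports the helper by its public name, so renaming it to `_get_a_var` broke those imports even though the behavior was unchanged. A minimal sketch of the kind of external usage this PR keeps working (the caller below is hypothetical, not part of the PR):

import torch
from torch.nn.parallel.parallel_apply import get_a_var  # public name restored by this PR

# get_a_var returns the first tensor it can find in a (possibly nested) input.
batch = ([torch.randn(2, 3)], {"mask": torch.ones(2, 3)})
tensor = get_a_var(batch)
if tensor is not None:  # the merged annotation types the return as Optional[torch.Tensor]
    print(tensor.shape)  # torch.Size([2, 3])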


12 files changed, +140 -147 lines changed


torch/distributed/utils.py

Lines changed: 1 addition & 3 deletions
@@ -5,9 +5,7 @@
 import torch
 import torch.distributed as dist
 from torch.nn.parallel._functions import _get_stream
-from torch.nn.parallel.scatter_gather import (  # type: ignore[attr-defined]
-    _is_namedtuple,
-)
+from torch.nn.parallel.scatter_gather import _is_namedtuple
 from torch.nn.utils.rnn import PackedSequence
 
 __all__ = []  # type: ignore[var-annotated]

torch/nn/parallel/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from .parallel_apply import parallel_apply
 from .replicate import replicate
 from .data_parallel import DataParallel, data_parallel
-from .scatter_gather import scatter, gather
+from .scatter_gather import gather, scatter
 from .distributed import DistributedDataParallel
 
 __all__ = ['replicate', 'scatter', 'parallel_apply', 'gather', 'data_parallel',

torch/nn/parallel/__init__.pyi

Lines changed: 0 additions & 5 deletions
This file was deleted.

torch/nn/parallel/common_types.pyi

Lines changed: 0 additions & 6 deletions
This file was deleted.

torch/nn/parallel/data_parallel.py

Lines changed: 43 additions & 13 deletions
@@ -2,6 +2,7 @@
 import torch
 import warnings
 from itertools import chain
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 from ..modules import Module
 from .scatter_gather import scatter_kwargs, gather
 from .replicate import replicate
@@ -15,7 +16,7 @@
 
 __all__ = ['DataParallel', 'data_parallel']
 
-def _check_balance(device_ids):
+def _check_balance(device_ids: Sequence[Union[int, torch.device]]) -> None:
     imbalance_warn = """
     There is an imbalance between your GPUs. You may want to exclude GPU {} which
     has less than 75% of the memory or cores of GPU {}. You can do so by setting
@@ -121,7 +122,13 @@ class DataParallel(Module):
 
     # TODO: update notes/cuda.rst when this class handles 8+ GPUs well
 
-    def __init__(self, module, device_ids=None, output_device=None, dim=0):
+    def __init__(
+        self,
+        module: Module,
+        device_ids: Optional[Sequence[Union[int, torch.device]]] = None,
+        output_device: Optional[Union[int, torch.device]] = None,
+        dim: int = 0,
+    ) -> None:
         super().__init__()
         torch._C._log_api_usage_once("torch.nn.parallel.DataParallel")
         device_type = _get_available_device_type()
@@ -133,6 +140,9 @@ def __init__(self, module, device_ids=None, output_device=None, dim=0):
         if device_ids is None:
             device_ids = _get_all_device_indices()
 
+        if device_ids is None:
+            raise RuntimeError("no available devices were found")
+
         if output_device is None:
             output_device = device_ids[0]
 
@@ -147,7 +157,7 @@
         if len(self.device_ids) == 1:
             self.module.to(self.src_device_obj)
 
-    def forward(self, *inputs, **kwargs):
+    def forward(self, *inputs: Any, **kwargs: Any) -> Any:
         with torch.autograd.profiler.record_function("DataParallel.forward"):
             if not self.device_ids:
                 return self.module(*inputs, **kwargs)
@@ -158,33 +168,45 @@ def forward(self, *inputs, **kwargs):
                                        "on device {} (device_ids[0]) but found one of "
                                        "them on device: {}".format(self.src_device_obj, t.device))
 
-            inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
+            inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids)
             # for forward function without any inputs, empty list and dict will be created
             # so the module can be executed on one device which is the first one in device_ids
-            if not inputs and not kwargs:
+            if not inputs and not module_kwargs:
                 inputs = ((),)
-                kwargs = ({},)
+                module_kwargs = ({},)
 
             if len(self.device_ids) == 1:
-                return self.module(*inputs[0], **kwargs[0])
+                return self.module(*inputs[0], **module_kwargs[0])
             replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
-            outputs = self.parallel_apply(replicas, inputs, kwargs)
+            outputs = self.parallel_apply(replicas, inputs, module_kwargs)
             return self.gather(outputs, self.output_device)
 
-    def replicate(self, module, device_ids):
+    def replicate(self, module: Module, device_ids: Sequence[Union[int, torch.device]]) -> List[Module]:
         return replicate(module, device_ids, not torch.is_grad_enabled())
 
-    def scatter(self, inputs, kwargs, device_ids):
+    def scatter(
+        self,
+        inputs: Tuple[Any, ...],
+        kwargs: Optional[Dict[str, Any]],
+        device_ids: Sequence[Union[int, torch.device]],
+    ) -> Any:
         return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
 
-    def parallel_apply(self, replicas, inputs, kwargs):
+    def parallel_apply(self, replicas: Sequence[Module], inputs: Sequence[Any], kwargs: Any) -> List[Any]:
        return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
 
-    def gather(self, outputs, output_device):
+    def gather(self, outputs: Any, output_device: Union[int, torch.device]) -> Any:
         return gather(outputs, output_device, dim=self.dim)
 
 
-def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None):
+def data_parallel(
+    module: Module,
+    inputs: Any,
+    device_ids: Optional[Sequence[Union[int, torch.device]]] = None,
+    output_device: Optional[Union[int, torch.device]] = None,
+    dim: int = 0,
+    module_kwargs: Optional[Any] = None,
+) -> torch.Tensor:
     r"""Evaluates module(input) in parallel across the GPUs given in device_ids.
 
     This is the functional version of the DataParallel module.
@@ -204,9 +226,15 @@ def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None):
 
     device_type = _get_available_device_type()
 
+    if device_type is None:
+        raise RuntimeError("device type could not be determined")
+
     if device_ids is None:
         device_ids = _get_all_device_indices()
 
+    if device_ids is None:
+        raise RuntimeError("no available devices were found")
+
     if output_device is None:
         output_device = device_ids[0]
 
@@ -227,6 +255,8 @@ def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None):
         inputs = ((),)
         module_kwargs = ({},)
 
+    assert module_kwargs is not None
+
     if len(device_ids) == 1:
         return module(*inputs[0], **module_kwargs[0])
     used_device_ids = device_ids[:len(inputs)]
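
The annotations added above describe the existing runtime contract rather than changing it: `device_ids` may be plain ints or `torch.device` objects, and inputs are still scattered along `dim` and gathered on `output_device`. A minimal usage sketch, assuming a machine with at least two CUDA devices (the model and shapes are illustrative only):

import torch
from torch import nn

# Assumes CUDA devices 0 and 1 are available; shapes are arbitrary.
model = nn.Linear(10, 5).cuda(0)          # module must start on device_ids[0]
dp = nn.DataParallel(model, device_ids=[0, 1], output_device=0, dim=0)
out = dp(torch.randn(8, 10).cuda(0))      # scattered along dim 0, gathered on device 0
print(out.shape)                          # torch.Size([8, 5])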

torch/nn/parallel/data_parallel.pyi

Lines changed: 0 additions & 29 deletions
This file was deleted.

torch/nn/parallel/parallel_apply.py

Lines changed: 27 additions & 6 deletions
@@ -1,11 +1,14 @@
 import threading
 import torch
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast
+from ..modules import Module
 from torch.cuda._utils import _get_device_index
 from torch.cuda.amp import autocast
 from torch._utils import ExceptionWrapper
 
+__all__ = ['get_a_var', 'parallel_apply']
 
-def get_a_var(obj):
+def get_a_var(obj: Union[torch.Tensor, List[Any], Tuple[Any, ...], Dict[Any, Any]]) -> Optional[torch.Tensor]:
     if isinstance(obj, torch.Tensor):
         return obj
 
@@ -19,8 +22,12 @@ def get_a_var(obj):
                 return result
     return None
 
-
-def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):
+def parallel_apply(
+    modules: Sequence[Module],
+    inputs: Sequence[Any],
+    kwargs_tup: Optional[Sequence[Dict[str, Any]]] = None,
+    devices: Optional[Sequence[Optional[Union[int, torch.device]]]] = None,
+) -> List[Any]:
     r"""Applies each `module` in :attr:`modules` in parallel on arguments
     contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword)
     on each of :attr:`devices`.
@@ -39,7 +46,7 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):
     if kwargs_tup is not None:
         assert len(modules) == len(kwargs_tup)
     else:
-        kwargs_tup = ({},) * len(modules)
+        kwargs_tup = (cast(Dict[str, Any], {}),) * len(modules)
     if devices is not None:
         assert len(modules) == len(devices)
     else:
@@ -50,10 +57,24 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):
     results = {}
     grad_enabled, autocast_enabled = torch.is_grad_enabled(), torch.is_autocast_enabled()
 
-    def _worker(i, module, input, kwargs, device=None, stream=None):
+    def _worker(
+        i: int,
+        module: Module,
+        input: Any,
+        kwargs: Dict[str, Any],
+        device: Optional[Union[int, torch.device]] = None,
+        stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
         torch.set_grad_enabled(grad_enabled)
         if device is None:
-            device = get_a_var(input).get_device()
+            t = get_a_var(input)
+            if t is None:
+                with lock:
+                    results[i] = ExceptionWrapper(
+                        where="in replica {}, no device was provided and no tensor input was found; "
+                        "device cannot be resolved".format(i))
+                return
+            device = t.get_device()
         if stream is None:
             stream = torch.cuda.current_stream(device)
         try:
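
Since `get_a_var` can legitimately return `None`, the new branch in `_worker` reports an `ExceptionWrapper` instead of failing on `None.get_device()`. A small sketch of the helper's behavior on nested inputs (the inputs below are illustrative, not from the PR):

import torch
from torch.nn.parallel.parallel_apply import get_a_var

# Recurses through tensors, lists/tuples, and dicts, returning the first tensor it finds.
print(get_a_var([{"x": torch.ones(2)}]))        # tensor([1., 1.])
print(get_a_var(((), {"note": "no tensors"})))  # None, so a device must be supplied explicitly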

torch/nn/parallel/parallel_apply.pyi

Lines changed: 0 additions & 11 deletions
This file was deleted.

torch/nn/parallel/replicate.py

Lines changed: 33 additions & 17 deletions
@@ -1,26 +1,34 @@
+import torch
+from ..modules import Module
 from . import comm
+from typing import TYPE_CHECKING, Dict, Iterator, List, Optional, Sequence, Set, Union
 from torch._utils import _get_device_index
 
 from collections import OrderedDict
 
+if TYPE_CHECKING:
+    import torch.jit
+    import torch.jit._state
+
+__all__ = ['replicate']
 
-def _is_script_module(module):
+def _is_script_module(module: Module) -> bool:
     import torch.jit
     return isinstance(module, torch.jit.ScriptModule)
 
 
-def _is_script_method(module):
+def _is_script_method(module: Module) -> bool:
     import torch.jit
     return isinstance(module, torch._C.ScriptMethod)
 
 
-def _init_script_module():
+def _init_script_module() -> "torch.jit.ScriptModule":
     import torch.jit
     return torch.jit.ScriptModule()
 
 
-def _is_jit_enabled():
-    import torch.jit
+def _is_jit_enabled() -> "torch.jit._state.EnabledProxy":
+    import torch.jit._state
     return torch.jit._state._enabled
 
 
@@ -31,10 +39,10 @@ def _is_jit_enabled():
 #
 # currently a module cannot be replicated properly if the descendants of
 # any ScriptModule contains python module (type 1 above)
-def _replicatable_module(module, memo=None):
+def _replicatable_module(module: Module, memo: Optional[Set[Module]] = None) -> bool:
 
     # module.modules() contains module itself as the first element
-    def descendant_modules(module):
+    def descendant_modules(module: Module) -> Iterator[Module]:
         gen = module.modules()
         next(gen)
         return gen
@@ -61,7 +69,11 @@ def descendant_modules(module):
 
     return True
 
-def _broadcast_coalesced_reshape(tensors, devices, detach=False):
+def _broadcast_coalesced_reshape(
+    tensors: Sequence[torch.Tensor],
+    devices: Sequence[Union[int, torch.device]],
+    detach: bool = False,
+) -> List[List[torch.Tensor]]:
     from ._functions import Broadcast
     if detach:
         return comm.broadcast_coalesced(tensors, devices)
@@ -75,7 +87,11 @@ def _broadcast_coalesced_reshape(tensors, devices, detach=False):
         return []
 
 
-def replicate(network, devices, detach=False):
+def replicate(
+    network: Module,
+    devices: Sequence[Union[int, torch.device]],
+    detach: bool = False,
+) -> List[Module]:
     if not _replicatable_module(network):
         raise RuntimeError("Cannot replicate network where python modules are "
                            "childrens of ScriptModule")
@@ -91,8 +107,8 @@ def replicate(network, devices, detach=False):
     param_copies = _broadcast_coalesced_reshape(params, devices, detach)
 
     buffers = list(network.buffers())
-    buffers_rg = []
-    buffers_not_rg = []
+    buffers_rg: List[torch.Tensor] = []
+    buffers_not_rg: List[torch.Tensor] = []
     for buf in buffers:
         if buf.requires_grad and not detach:
             buffers_rg.append(buf)
@@ -106,8 +122,8 @@ def replicate(network, devices, detach=False):
     buffer_copies_not_rg = _broadcast_coalesced_reshape(buffers_not_rg, devices, detach=True)
 
     modules = list(network.modules())
-    module_copies = [[] for device in devices]
-    module_indices = {}
+    module_copies: List[List[Module]] = [[] for _ in devices]
+    module_indices: Dict[Module, int] = {}
 
     for i, module in enumerate(modules):
         module_indices[module] = i
@@ -142,13 +158,13 @@ def replicate(network, devices, detach=False):
                 param_idx = param_indices[param]
                 for j in range(num_replicas):
                     replica = module_copies[j][i]
-                    param = param_copies[j][param_idx]
+                    param_copy = param_copies[j][param_idx]
                     # parameters in replicas are no longer leaves,
                     # so setattr them as non-parameter attributes
-                    setattr(replica, key, param)
+                    setattr(replica, key, param_copy)
                     # expose the parameter for DDP
-                    replica._former_parameters[key] = param
-        for key, buf in module._buffers.items():
+                    replica._former_parameters[key] = param_copy
+        for key, buf in module._buffers.items():  # type: ignore[assignment]
             if buf is None:
                 for j in range(num_replicas):
                     replica = module_copies[j][i]
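
The typed `replicate` signature mirrors how `DataParallel.replicate` already calls it: one module copy per entry in `devices`, which may be ints or `torch.device` objects. A brief sketch, assuming CUDA devices 0 and 1 exist (illustrative only):

import torch
from torch import nn
from torch.nn.parallel import replicate

# Assumes CUDA devices 0 and 1; the network is arbitrary.
net = nn.Sequential(nn.Linear(4, 4), nn.ReLU()).cuda(0)
replicas = replicate(net, [0, 1])               # -> List[Module], one copy per device
out = replicas[1](torch.randn(2, 4).cuda(1))    # each replica runs on its own device
print(len(replicas), out.device)                # 2 cuda:1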
