diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index f1eda3103a24af..0f26005f74cb22 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -1,15 +1,28 @@ #!/bin/bash +# For distributed, four environmental configs: +# (1) build with only NCCL +# (2) build with NCCL and MPI +# (3) build with only MPI +# (4) build with neither +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then + # TODO: move this to Docker + sudo apt-get update + sudo apt-get install libnccl-dev=2.2.13-1+cuda9.0 libnccl2=2.2.13-1+cuda9.0 +fi + +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then + # TODO: move this to Docker + sudo apt-get update + sudo apt-get install openmpi-bin libopenmpi-dev + sudo apt-get install -y --no-install-recommends openssh-client openssh-server + sudo mkdir -p /var/run/sshd +fi + if [[ "$BUILD_ENVIRONMENT" == "pytorch-linux-xenial-py3-clang5-asan" ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-asan.sh" $* fi -# TODO: move this to Docker -# TODO: add both NCCL and MPI in CI test by fixing these test first -sudo apt-get update -sudo apt-get install libnccl-dev libnccl2 -# sudo apt-get install openmpi-bin libopenmpi-dev - # Required environment variable: $BUILD_ENVIRONMENT # (This is set by default in the Docker images we build, so you don't # need to set it yourself. diff --git a/CMakeLists.txt b/CMakeLists.txt index 1009e5a4ec30f7..75b4bf7b4512d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,7 +306,7 @@ if(BUILD_DOCS) if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs) - endif (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) + endif() file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs) configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY) @@ -323,10 +323,10 @@ if(BUILD_DOCS) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Generating Python API documentation with Doxygen" VERBATIM) - else (DOXYGEN_FOUND) + else() message(FATAL_ERROR "Doxygen needs to be installed to generate the documentation") - endif (DOXYGEN_FOUND) -endif (BUILD_DOCS) + endif() +endif() # ---[ CMake related files # Uninistall option. 
diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index f85996f74c4b76..5b420d87b34fc1 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -9,6 +9,9 @@ #include #include "ATen/CPUGenerator.h" +#include "ATen/RegisterCPU.h" + +#include "TH/TH.h" // for USE_LAPACK #ifdef USE_SSE3 #include @@ -34,7 +37,7 @@ Context::Context() generator_registry[static_cast(DeviceType::CPU)] .reset(new CPUGenerator(this)); - Type::registerCPU(this); + register_cpu_types(this); } // TODO: This could be bad juju if someone calls globalContext() in the @@ -79,6 +82,14 @@ bool Context::hasMKL() const { #endif } +bool Context::hasLAPACK() const { +#ifdef USE_LAPACK + return true; +#else + return false; +#endif +} + bool Context::setFlushDenormal(bool on) { #ifdef USE_SSE3 // Setting flush-to-zero (FTZ) flag diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index f2b3a452cfed57..bab1fa5dc5d069 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -50,6 +50,10 @@ class AT_API Context { return *generator; } bool hasMKL() const; + bool hasLAPACK() const; + bool hasMAGMA() const { + return detail::getCUDAHooks().hasMAGMA(); + } bool hasCUDA() const { return detail::getCUDAHooks().hasCUDA(); } @@ -114,6 +118,7 @@ class AT_API Context { std::atomic next_id; std::unique_ptr thc_state; friend struct Type; + friend void register_cpu_types(Context * context); friend void register_cuda_types(Context * context); }; @@ -157,6 +162,14 @@ static inline bool hasMKL() { return globalContext().hasMKL(); } +static inline bool hasLAPACK() { + return globalContext().hasLAPACK(); +} + +static inline bool hasMAGMA() { + return globalContext().hasMAGMA(); +} + static inline int64_t current_device() { return globalContext().current_device(); } diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index 7adddfca27c9eb..b51d80d22d350f 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include +#include #include #include #include diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/Formatting.cpp index ef04cc4bdfd975..dcdf7653f2308b 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/Formatting.cpp @@ -1,6 +1,5 @@ #include "ATen/Formatting.h" #include "ATen/Tensor.h" -#include "ATen/Context.h" #include "ATen/TensorMethods.h" #include diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index 8db0b231bf53f9..d797618b285e6a 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -26,8 +26,8 @@ struct AT_API Storage { template T* unsafe_data() const { return storage_impl_->unsafe_data(); } - size_t elementSize() const { return storage_impl_->elementSize(); } - ptrdiff_t size() const { return storage_impl_->size(); } + size_t elementSize() const { return storage_impl_->itemsize(); } + ptrdiff_t size() const { return storage_impl_->numel(); } bool resizable() const { return storage_impl_->resizable(); } // get() use here is to get const-correctness void* data() const { return storage_impl_.get()->data(); } diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index 233540bfa06f28..0ed836b9b3010a 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -1,31 +1,29 @@ -#include #include namespace at { StorageImpl::StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable) : data_type_(data_type), data_ptr_(std::move(data_ptr)), - 
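The new Context methods surface build-time capabilities: hasLAPACK() is decided by the USE_LAPACK define pulled in through TH/TH.h, while hasMAGMA() is forwarded to the CUDA hooks so a CPU-only build can still answer the query. A minimal caller-side sketch, assuming the usual ATen/ATen.h umbrella header is available:

#include <cstdio>
#include <ATen/ATen.h>  // assumed umbrella header; the free functions are declared in ATen/Context.h

int main() {
  // Each call reads a flag resolved at build time (or via the CUDA hooks for MAGMA).
  std::printf("MKL:    %d\n", (int)at::hasMKL());
  std::printf("LAPACK: %d\n", (int)at::hasLAPACK());
  std::printf("CUDA:   %d\n", (int)at::hasCUDA());
  std::printf("MAGMA:  %d\n", (int)at::hasMAGMA());  // false unless the CUDA hooks report USE_MAGMA
  return 0;
}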
size_(size), + numel_(numel), resizable_(resizable), - allocator_(allocator), - finalizer_(nullptr) {} + allocator_(allocator) {} StorageImpl::StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::Allocator* allocator, bool resizable) : StorageImpl( data_type, - size, + numel, allocator->allocate( - at::elementSize(dataTypeToScalarType(data_type)) * size), + at::elementSize(dataTypeToScalarType(data_type)) * numel), allocator, resizable) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index a9394d53935636..35639478df664e 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -3,7 +3,6 @@ #include #include #include -#include #include @@ -21,16 +20,16 @@ struct Type; struct AT_API StorageImpl : public c10::intrusive_ptr_target { public: StorageImpl() = delete; - virtual ~StorageImpl() {}; + ~StorageImpl() {}; StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable); StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::Allocator* allocator, bool resizable); StorageImpl(StorageImpl&) = delete; @@ -44,7 +43,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { template inline T* data() const { auto data_type_T = - at::scalarTypeToDataType(at::CTypeToScalarType>::to()); + at::scalarTypeToDataType(at::CTypeToScalarType::to()); if (dtype() != data_type_T) { AT_ERROR( "Attempt to access StorageImpl having data type ", @@ -61,27 +60,22 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { } void release_resources() override { - if (finalizer_) { - (*finalizer_)(); - } - finalizer_ = nullptr; data_ptr_.clear(); } void operator=(const StorageImpl&) = delete; - size_t elementSize() const { + size_t itemsize() const { return at::elementSize(dataTypeToScalarType(data_type_)); } Type& type(); - // TODO: Rename to size() and size to size_ - ptrdiff_t size() const { - return size_; + int64_t numel() const { + return numel_; }; - void set_size(ptrdiff_t size) { - size_ = size; + void set_numel(int64_t numel) { + numel_ = numel; }; bool resizable() const { return resizable_; @@ -132,9 +126,8 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { private: at::DataType data_type_; at::DataPtr data_ptr_; - ptrdiff_t size_; + int64_t numel_; bool resizable_; at::Allocator* allocator_; - std::unique_ptr finalizer_; }; } // namespace at diff --git a/aten/src/ATen/TensorBase.h b/aten/src/ATen/TensorBase.h deleted file mode 100644 index 1bda3ddfa14915..00000000000000 --- a/aten/src/ATen/TensorBase.h +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include "ATen/TensorImpl.h" -#include "ATen/UndefinedTensor.h" -#include "ATen/core/Error.h" - -namespace at { namespace detail { - -// TensorBase is the base class for Tensor. -// TODO: Eliminate this, once we remove TensorBase from Scalar. 
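The rename from size()/elementSize() to numel()/itemsize() makes the units explicit: numel() counts elements of the storage's data type, itemsize() is the per-element byte width, and their product is the allocation size (exactly what the StorageImpl constructor above asks the allocator for). A hypothetical helper, not part of the diff, that a caller could write against at::Storage, whose elementSize()/size() accessors still forward to the renamed StorageImpl methods:

#include <cstddef>
#include <cstdint>
#include <ATen/Storage.h>  // assumed include path for at::Storage

// Illustrative only: total bytes held by a storage after the rename.
size_t storage_nbytes(const at::Storage& storage) {
  const int64_t numel = storage.size();          // forwards to StorageImpl::numel()
  const size_t itemsize = storage.elementSize(); // forwards to StorageImpl::itemsize()
  return static_cast<size_t>(numel) * itemsize;
}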
At -// the moment it's only used to break an include cycle for Scalar -struct TensorBase { - TensorBase() {} - TensorBase(TensorImpl * tensor_impl, bool retain) : tensor_impl_(c10::intrusive_ptr::reclaim(tensor_impl)) { - if (tensor_impl == nullptr) { - throw std::runtime_error("TensorBaseImpl with nullptr not supported"); - } - if (retain && tensor_impl != UndefinedTensor::singleton()) { - c10::raw::intrusive_ptr::incref(tensor_impl); - } - } - TensorBase(c10::intrusive_ptr&& ptr) : tensor_impl_(std::move(ptr)) {} - TensorBase(const c10::intrusive_ptr& ptr) : tensor_impl_(ptr) {} - - int64_t dim() const { - return tensor_impl_->dim(); - } - - TensorImpl * unsafeGetTensorImpl() const { - return tensor_impl_.get(); - } - TensorImpl * unsafeReleaseTensorImpl() { - return tensor_impl_.release(); - } - const c10::intrusive_ptr& getIntrusivePtr() const { - return tensor_impl_; - } - - bool defined() const { - return tensor_impl_; - } - - void reset() { - tensor_impl_.reset(); - } - - friend struct WeakTensor; - -protected: - c10::intrusive_ptr tensor_impl_; -}; - -}} // namespace at::detail diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index 30b34cabec769f..8976acb6a40904 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -3,8 +3,6 @@ #include #include -#include "ATen/Retainable.h" -#include "ATen/StorageImpl.h" #include "ATen/Storage.h" #include "ATen/core/optional.h" #include "ATen/core/TensorTypeId.h" diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h index c8717689833408..a598290485196d 100644 --- a/aten/src/ATen/TensorOptions.h +++ b/aten/src/ATen/TensorOptions.h @@ -2,10 +2,10 @@ #include #include -#include +#include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index 79f58479e90b52..f50a4e71da9cae 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -1,5 +1,4 @@ #include "ATen/UndefinedTensor.h" -#include "ATen/Context.h" #include "ATen/core/Error.h" namespace at { diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index 60d9c884b8aef2..2bc3965c6d33ae 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -3,8 +3,8 @@ namespace at { -UndefinedType::UndefinedType(Context* context) - : Type(context, UndefinedTensorId(), /*is_variable=*/false, /*is_undefined=*/true) {} +UndefinedType::UndefinedType() + : Type(UndefinedTensorId(), /*is_variable=*/false, /*is_undefined=*/true) {} ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 9ca00cfb516ff7..d216e3131dd693 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -1,7 +1,6 @@ #pragma once #include "ATen/Type.h" -#include "ATen/Context.h" #include "ATen/CheckGenerator.h" #ifdef _MSC_VER @@ -13,7 +12,7 @@ namespace at { struct UndefinedType final : public Type { - explicit UndefinedType(Context* context); + explicit UndefinedType(); virtual ScalarType scalarType() const override; virtual Backend backend() const override; virtual bool is_cuda() const override; diff --git a/aten/src/ATen/SparseTensorRef.h b/aten/src/ATen/core/SparseTensorRef.h similarity index 100% rename from aten/src/ATen/SparseTensorRef.h rename to aten/src/ATen/core/SparseTensorRef.h diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 
7d73fafc994da5..570a375e3888a3 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -69,7 +69,7 @@ DynamicCUDAInterfaceSetter _; // let's not if we don't need to!) std::unique_ptr CUDAHooks::initCUDA() const { THCState* thc_state = THCState_alloc(); - + THCudaInit(thc_state); return std::unique_ptr( thc_state, [](THCState* p) { @@ -92,6 +92,14 @@ bool CUDAHooks::hasCUDA() const { return true; } +bool CUDAHooks::hasMAGMA() const { +#ifdef USE_MAGMA + return true; +#else + return false; +#endif +} + bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 766ab62b8ef79f..491adfc4d73f1a 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -13,6 +13,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { std::unique_ptr initCUDA() const override; std::unique_ptr initCUDAGenerator(Context*) const override; bool hasCUDA() const override; + bool hasMAGMA() const override; bool hasCuDNN() const override; int64_t current_device() const override; Allocator* getPinnedMemoryAllocator() const override; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 6b2e87c4f762af..cccf6dc28453dc 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -65,6 +65,10 @@ struct AT_API CUDAHooksInterface { return false; } + virtual bool hasMAGMA() const { + return false; + } + virtual bool hasCuDNN() const { return false; } diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 0f859edd3ede3a..f7a4deb58dc941 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -109,6 +109,9 @@ def check_all_files_written(self): TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") TYPE_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.cpp") +REGISTER_CPU_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCPU.h") +REGISTER_CPU_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCPU.cpp") + REGISTER_CUDA_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.h") REGISTER_CUDA_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.cpp") @@ -122,7 +125,7 @@ def check_all_files_written(self): TYPE_REGISTER = CodeTemplate("""\ context->type_registry[static_cast(Backend::${backend})] [static_cast(ScalarType::${scalar_type})] - .reset(new ${type_name}(context)); + .reset(new ${type_name}()); detail::getVariableHooks().registerVariableTypeFor(context, Backend::${backend}, ScalarType::${scalar_type}); """) @@ -280,19 +283,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if scalar_name == "Half": env['SparseTensor'] = 'Tensor' if backend == "CUDA": - env['to_th_type'] = 'HalfFix<__half,Half>' - env['to_at_type'] = 'HalfFix' env['AS_REAL'] = 'convert' - env['THScalarType'] = 'half' - else: - env['to_th_type'] = 'HalfFix' - env['to_at_type'] = 'HalfFix' - elif scalar_name == 'Long': - env['to_th_type'] = 'long' - env['to_at_type'] = 'int64_t' - else: - env['to_th_type'] = '' - env['to_at_type'] = '' declarations, definitions = function_wrapper.create_derived( env, declarations) @@ -340,7 +331,8 @@ def iterate_types(): def declare_outputs(): files = ['Declarations.yaml', 'Type.h', 'Type.cpp', 'Tensor.h', 'TensorMethods.h', 'Functions.h', - 'CPUCopy.cpp', 'NativeFunctions.h'] + 'CPUCopy.cpp', 'NativeFunctions.h', + 'RegisterCPU.cpp', 'RegisterCPU.h'] for f in files: 
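hasMAGMA() follows the existing hooks pattern: CUDAHooksInterface provides a virtual default that returns false, and the CUDA build supplies a CUDAHooks override that answers from USE_MAGMA, so CPU-only builds never touch MAGMA symbols. A stripped-down sketch of that pattern with simplified stand-in names (the real classes are the ones in the headers above):

#include <cstdio>

// Minimal stand-ins for CUDAHooksInterface / CUDAHooks from the diff.
struct HooksInterface {
  virtual ~HooksInterface() = default;
  virtual bool hasMAGMA() const { return false; }  // safe default for CPU-only builds
};

struct CudaHooks : HooksInterface {
  bool hasMAGMA() const override {
#ifdef USE_MAGMA
    return true;
#else
    return false;
#endif
  }
};

int main() {
  CudaHooks cuda;                  // in ATen this object only exists when CUDA is compiled in
  const HooksInterface& hooks = cuda;
  std::printf("MAGMA available: %d\n", (int)hooks.hasMAGMA());
}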
file_manager.will_write(f) cuda_files = ['CUDACopy.cpp', 'RegisterCUDA.cpp', 'RegisterCUDA.h'] @@ -409,6 +401,9 @@ def generate_outputs(): file_manager.write('Type.h', TYPE_H, top_env) file_manager.write('Type.cpp', TYPE_CPP, top_env) + file_manager.write('RegisterCPU.h', REGISTER_CPU_H, top_env) + file_manager.write('RegisterCPU.cpp', REGISTER_CPU_CPP, top_env) + cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 5b73a09ad9b004..07d7e46ff79a56 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index d9bd94e1f7810b..d5ff300c0dd9e2 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -47,6 +47,82 @@ std::tuple _unique_cpu_template( } return std::make_tuple(output, inverse_indices); } + +template +ForwardIt _unique_dim_cpu_impl(ForwardIt first, ForwardIt last, + std::vector& indices, Tensor inverse_indices_vec) { + if (first == last) { + return last; + } + // save to calculate distance to iterators + ForwardIt begin = first; + + // set first inverse index + inverse_indices_vec[indices[0]] = 0; + + ForwardIt result = first; + while (++first != last) { + if (!at::equal(*result, *first) && ++result != first) { + *result = std::move(*first); + } + int64_t idx_result = std::distance(begin, result); + int64_t idx_first = std::distance(begin, first); + inverse_indices_vec[indices[idx_first]] = idx_result; + } + + return ++result; + } + +template +std::tuple _unique_dim_cpu_template( + const Tensor& self, + const int64_t dim, + const bool return_inverse) { + // reshape tensor as [dim, -1] + Tensor input_flat = self.transpose(dim, 0); + auto orig_sizes = input_flat.sizes().vec(); + input_flat = input_flat.contiguous().view({input_flat.size(0), -1}); + + std::vector indices(input_flat.size(0)); + std::iota(indices.begin(), indices.end(), 0); + int64_t numel = input_flat.size(1); + scalar_t* input_flat_ptr = ((scalar_t*)input_flat.data_ptr()); + + // sort indices using data + std::sort(indices.begin(), indices.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_flat_ptr[i + a * numel]; + scalar_t rhs = input_flat_ptr[i + b * numel]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + Tensor input_sorted = at::empty(input_flat.sizes(), input_flat.type()); + for (int i = 0; i < indices.size(); ++i) { + input_sorted[i] = input_flat[indices[i]]; + } + + Tensor inverse_indices = at::empty(indices.size(), self.type().toScalarType(kLong)); + std::vector input_unbind = at::unbind(input_sorted, 0); + auto last = _unique_dim_cpu_impl( + input_unbind.begin(), input_unbind.end(), indices, inverse_indices); + input_unbind.erase(last, input_unbind.end()); + + // reshape back + auto output = at::stack(input_unbind, 0); + auto new_sizes = std::vector(orig_sizes); + new_sizes[0] = -1; + output = output.view(new_sizes); + output = output.transpose(0, dim); + + return std::make_tuple(output, inverse_indices); +} } // namespace std::tuple @@ -56,5 +132,13 @@ _unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) { }); } +std::tuple +_unique_dim_cpu(const 
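The CPU unique-by-dimension path treats each slice along dim as a row of a [size(dim), -1] matrix: row indices are sorted lexicographically by content, adjacent duplicate rows are collapsed (the std::unique-style _unique_dim_cpu_impl above), and inverse_indices maps every original row to its surviving representative. A self-contained sketch of the same idea on plain vectors of rows, illustrative only since the real code operates on at::Tensor slices:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// rows[i] is one slice along `dim`, already flattened. Returns the unique rows (sorted)
// and fills inverse[i] with the position of rows[i] in the unique set.
std::vector<std::vector<float>> unique_rows(const std::vector<std::vector<float>>& rows,
                                            std::vector<int64_t>& inverse) {
  std::vector<int64_t> order(rows.size());
  std::iota(order.begin(), order.end(), 0);
  // Lexicographic sort of row indices by row content, mirroring the std::sort lambda above.
  std::sort(order.begin(), order.end(), [&](int64_t a, int64_t b) {
    return rows[a] < rows[b];  // std::vector compares lexicographically
  });

  std::vector<std::vector<float>> out;
  inverse.assign(rows.size(), 0);
  for (int64_t idx : order) {
    if (out.empty() || out.back() != rows[idx]) {
      out.push_back(rows[idx]);                           // new unique row
    }
    inverse[idx] = static_cast<int64_t>(out.size()) - 1;  // original row -> unique slot
  }
  return out;
}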
Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) { + return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] { + // The current implementation using `dim` always sorts due to unhashable tensors + return _unique_dim_cpu_template(self, dim, return_inverse); + }); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index 0692dd0fea2901..bc37e83990e192 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -48,7 +48,7 @@ void magmaGesvBatched( } static magma_queue_t createMagmaQueue(const Tensor& tensor) { - auto& context = tensor.type().get_context(); + auto& context = at::globalContext(); magma_queue_t magma_queue; magma_queue_create_from_cuda( tensor.get_device(), diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index f2e13b4c708b62..c29337f90f1347 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -69,6 +69,92 @@ template return std::tuple(output, inverse_indices); } + +template + std::tuple _unique_dim_cuda_template( + const Tensor& self, + const int64_t dim, + const bool return_inverse) { + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + Tensor input_flat = self.transpose(dim, 0); + auto orig_sizes = input_flat.sizes().vec(); + input_flat = input_flat.contiguous().view({input_flat.size(0), -1}); + + scalar_t* input_flat_ptr = input_flat.data(); + + Tensor indices = at::arange(0, input_flat.size(0), self.type().toScalarType(kLong)); + int64_t* indices_ptr = indices.data(); + int64_t numel = input_flat.size(1); + + // sort indices using data + thrust::sort(policy, indices_ptr, indices_ptr + indices.numel(), + [=] __device__ (int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_flat_ptr[i + a * numel]; + scalar_t rhs = input_flat_ptr[i + b * numel]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + Tensor input_sorted = input_flat.index_select(0, indices); + + // get unique tensors + scalar_t* input_sorted_ptr = input_sorted.data(); + Tensor input_sorted_indices = at::arange(0, input_sorted.size(0), self.type().toScalarType(kLong)); + int64_t* input_sorted_indices_ptr = input_sorted_indices.data(); + auto last = thrust::unique(policy, input_sorted_indices_ptr, input_sorted_indices_ptr + input_sorted_indices.numel(), + [=] __device__ (int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_sorted_ptr[i + a * numel]; + scalar_t rhs = input_sorted_ptr[i + b * numel]; + if (lhs != rhs) { + return false; + } + } + return true; + }); + input_sorted_indices.resize_(last - input_sorted_indices_ptr); + Tensor output = input_sorted.index_select(0, input_sorted_indices); + + // reshape back + auto new_sizes = std::vector(orig_sizes); + new_sizes[0] = -1; + output = output.view(new_sizes); + output = output.transpose(0, dim); + + // calculate inverse indices + Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong)); + if (return_inverse) { + int64_t size = self.size(dim); + inverse_indices.resize_(size); + Tensor mask = at::empty(input_sorted.size(0), self.type().toScalarType(kLong)); + mask[0] = 1; + for (int i = 0; i < input_sorted.size(0) - 1; ++i) { + if (!at::equal(input_sorted[i], 
input_sorted[i+1])) { + mask[i+1] = 1; + } else { + mask[i+1] = 0; + } + } + + Tensor imask = at::cumsum(mask, 0) - 1; + for (int i = 0; i < indices.size(0); ++i) { + inverse_indices[indices[i]] = imask[i]; + } + } + + THCudaCheck(cudaGetLastError()); + return std::tuple(output, inverse_indices); + } } // namespace #endif @@ -86,5 +172,16 @@ _unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) { #endif } +std::tuple +_unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) { + #ifndef __HIP_PLATFORM_HCC__ + return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] { + return _unique_dim_cuda_template(self, dim, return_inverse); + }); + #else + AT_ERROR("unique_dim_cuda: HIP not supported"); + #endif +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 466fe6c3134e84..cb194cd0c7bdee 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1748,6 +1748,11 @@ CPU: _unique_cpu CUDA: _unique_cuda +- func: _unique_dim(Tensor self, int64_t dim, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor) + dispatch: + CPU: _unique_dim_cpu + CUDA: _unique_dim_cuda + - func: _unsafe_view(Tensor self, IntList size) -> Tensor variants: function diff --git a/aten/src/ATen/templates/RegisterCPU.cpp b/aten/src/ATen/templates/RegisterCPU.cpp new file mode 100644 index 00000000000000..0c1eeb4818fbbc --- /dev/null +++ b/aten/src/ATen/templates/RegisterCPU.cpp @@ -0,0 +1,20 @@ +#include + +// ${generated_comment} + +#include +#include +#include +#include + +${cpu_type_headers} + +namespace at { + +void register_cpu_types(Context * context) { + ${cpu_type_registrations} + context->type_registry[static_cast(Backend::Undefined)] + [static_cast(ScalarType::Undefined)].reset(new UndefinedType()); +} + +} // namespace at diff --git a/aten/src/ATen/templates/RegisterCPU.h b/aten/src/ATen/templates/RegisterCPU.h new file mode 100644 index 00000000000000..b923c180aac805 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCPU.h @@ -0,0 +1,10 @@ +#pragma once + +// ${generated_comment} + +namespace at { + +class Context; +void register_cpu_types(Context * context); + +} // namespace at diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 4a17004bb5ff8c..2ef9dbf398fa2f 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -27,8 +27,8 @@ namespace at { -${Type}::${Type}(Context* context) - : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} +${Type}::${Type}() + : Type(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } @@ -58,7 +58,7 @@ Storage ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { AT_ERROR("unsafeTensorFromTH not supported on sparse"); } std::unique_ptr ${Type}::generator() const { - return std::unique_ptr(new ${Generator}(context)); + return std::unique_ptr(new ${Generator}(&at::globalContext())); } const char * ${Type}::toString() const { diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 28e8e5381f2933..4d8bf60522f7db 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -2,16 +2,17 @@ // ${generated_comment} -#include "ATen/Device.h" 
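With the native_functions.yaml entry in place, both the CPU and CUDA kernels are reachable through the generated at::_unique_dim overload. A hedged usage sketch; the signature follows the declaration above, while the tensor factory call and header are the standard ATen ones and assumed available:

#include <ATen/ATen.h>  // assumed umbrella header exposing the generated at::_unique_dim
#include <tuple>

void demo() {
  // Four identical rows; deduplicating along dim 0 should leave a single row.
  at::Tensor t = at::ones({4, 3});
  at::Tensor output, inverse;
  std::tie(output, inverse) =
      at::_unique_dim(t, /*dim=*/0, /*sorted=*/true, /*return_inverse=*/true);
  // output  -> the unique slices along dim 0
  // inverse -> for each original slice, its index into `output`
}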
+#include "ATen/core/Device.h" #include "ATen/core/Layout.h" #include "ATen/Scalar.h" -#include "ATen/ScalarType.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/TensorAccessor.h" -#include "ATen/TensorBase.h" #include "ATen/TensorImpl.h" #include "ATen/core/optional.h" +#include "ATen/UndefinedTensor.h" +#include "ATen/core/Error.h" namespace at { struct Generator; @@ -38,16 +39,48 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. -struct AT_API Tensor : public detail::TensorBase { - using TensorBase = detail::TensorBase; - Tensor() : TensorBase() {} - Tensor(TensorImpl * self, bool retain) : TensorBase(self, retain) {} - Tensor(const c10::intrusive_ptr& ptr) : TensorBase(ptr) {} - Tensor(c10::intrusive_ptr&& ptr) : TensorBase(std::move(ptr)) {} +struct AT_API Tensor { + Tensor(){}; + Tensor(TensorImpl* tensor_impl, bool retain) + : tensor_impl_(c10::intrusive_ptr::reclaim( + tensor_impl)) { + if (tensor_impl == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not supported"); + } + if (retain && tensor_impl != UndefinedTensor::singleton()) { + c10::raw::intrusive_ptr::incref(tensor_impl); + } + } + Tensor(const c10::intrusive_ptr& ptr) + : tensor_impl_(std::move(ptr)) {} + Tensor(c10::intrusive_ptr&& ptr) + : tensor_impl_(ptr) {} Tensor(const Tensor&) = default; Tensor(Tensor&&) = default; + int64_t dim() const { + return tensor_impl_->dim(); + } + + TensorImpl * unsafeGetTensorImpl() const { + return tensor_impl_.get(); + } + TensorImpl * unsafeReleaseTensorImpl() { + return tensor_impl_.release(); + } + const c10::intrusive_ptr& getIntrusivePtr() const { + return tensor_impl_; + } + + bool defined() const { + return tensor_impl_; + } + + void reset() { + tensor_impl_.reset(); + } + // The following overloads are very intruiging. 
Consider the following // program: // @@ -242,6 +275,9 @@ struct AT_API Tensor : public detail::TensorBase { } friend struct WeakTensor; + +protected: + c10::intrusive_ptr tensor_impl_; }; struct AT_API WeakTensor { diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 214a5d18316588..e52c597b99eeb7 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -4,7 +4,7 @@ #include "ATen/Tensor.h" #include "ATen/Scalar.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Type.h" namespace at { diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp index 40621a9be6e08b..ff154971e7bffb 100644 --- a/aten/src/ATen/templates/Type.cpp +++ b/aten/src/ATen/templates/Type.cpp @@ -5,26 +5,14 @@ #include "ATen/ExpandUtils.h" #include "ATen/NativeFunctions.h" #include "ATen/Scalar.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorOptions.h" -#include "ATen/UndefinedType.h" #include "ATen/DeviceGuard.h" -#include - -#include -${cpu_type_headers} - namespace at { -void Type::registerCPU(Context * context) { - ${cpu_type_registrations} - context->type_registry[static_cast(Backend::Undefined)] - [static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); -} - Tensor & Type::copy_(Tensor & self, const Tensor & src, bool non_blocking) const { Tensor b_src; std::tie(b_src) = expand_inplace(self, src, "copy"); @@ -50,10 +38,10 @@ Tensor Type::copy(const Tensor & src, bool non_blocking) const { } Type & Type::toBackend(Backend b) const { - return context->getType(b,scalarType()); + return at::globalContext().getType(b,scalarType()); } Type & Type::toScalarType(ScalarType s) const { - return context->getType(backend(),s); + return at::globalContext().getType(backend(),s); } static std::vector defaultStrides(IntList sizes) { std::vector strides(sizes.size()); diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index d4972d87a6dfd9..10c52ac14b6975 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -3,13 +3,13 @@ // ${generated_comment} #include "ATen/core/ATenGeneral.h" -#include "ATen/Allocator.h" +#include "ATen/core/Allocator.h" #include "ATen/core/Deprecated.h" #include "ATen/core/Generator.h" #include "ATen/core/Layout.h" #include "ATen/Scalar.h" #include "ATen/core/ScalarType.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" @@ -45,8 +45,8 @@ enum class TypeID { }; struct AT_API Type { - explicit Type(Context* context, TensorTypeId type_id, bool is_variable, bool is_undefined) - : context(context), type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} + explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) + : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} virtual ~Type() {} virtual ScalarType scalarType() const = 0; virtual Backend backend() const = 0; @@ -56,7 +56,6 @@ struct AT_API Type { virtual bool is_distributed() const = 0; bool is_variable() const noexcept { return is_variable_; } bool is_undefined() const noexcept { return is_undefined_; } - static void registerCPU(Context * context); virtual Storage storage(bool resizable = false) const = 0; virtual Storage storage(size_t size, bool resizable = 
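Dropping the Context* member from Type means every backend/dtype hop now goes through the at::globalContext() singleton, as the new toBackend()/toScalarType() bodies show; the registry they consult is a two-dimensional table indexed by Backend and ScalarType that register_cpu_types()/register_cuda_types() fill in. A hypothetical, trimmed-down mirror of that lookup shape (the real table lives on at::Context and holds the generated Type subclasses):

#include <memory>

struct Type { virtual ~Type() = default; };  // stand-in for at::Type

enum class Backend { CPU, CUDA, Undefined, NumOptions };
enum class ScalarType { Float, Long, Undefined, NumOptions };

struct MiniContext {
  std::unique_ptr<Type> type_registry[static_cast<int>(Backend::NumOptions)]
                                     [static_cast<int>(ScalarType::NumOptions)];

  Type& getType(Backend b, ScalarType s) {
    // Type::toBackend()/toScalarType() now route here via at::globalContext()
    // instead of a per-Type Context pointer; the real Context also lazily
    // initializes CUDA and errors on unregistered combinations.
    return *type_registry[static_cast<int>(b)][static_cast<int>(s)];
  }
};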
false) const = 0; virtual Storage storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; @@ -80,8 +79,6 @@ struct AT_API Type { Type & cuda() const { return this->toBackend(at::backendToCUDA(this->backend())); } - Context& get_context() const { return *context; } - // contiguous IDs for all types in the system // for external dispatch virtual TypeID ID() const = 0; @@ -112,7 +109,6 @@ struct AT_API Type { // virtual Tensor * add(Tensor & a, Tensor & b) = 0; ${type_method_declarations} protected: - Context* context; TensorTypeId type_id_; bool is_variable_; bool is_undefined_; diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index fbafed82b57e02..4335a8f2209a20 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -38,8 +38,8 @@ static int getPointerDevice(void* ptr) { } #endif -${Type}::${Type}(Context* context) - : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} +${Type}::${Type}() + : Type(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } @@ -99,7 +99,7 @@ Storage ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { return Storage((${THStorage}*) th_pointer); } std::unique_ptr ${Type}::generator() const { - return std::unique_ptr(new ${Generator}(context)); + return std::unique_ptr(new ${Generator}(&at::globalContext())); } const char * ${Type}::toString() const { diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index e8613b62a333be..ec08e1a336daf6 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -16,7 +16,7 @@ namespace at { struct ${Type} final : public Type { - explicit ${Type}(Context* context); + explicit ${Type}(); virtual ScalarType scalarType() const override; virtual Backend backend() const override; virtual bool is_cuda() const override; diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index ab9f5343eddad9..9fe22beb0dc54e 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -102,7 +102,6 @@ INSTALL(FILES THTensor.hpp THStorageFunctions.hpp THGenerator.hpp - THTypeConversion.hpp DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") INSTALL(FILES diff --git a/aten/src/TH/THFile.cpp b/aten/src/TH/THFile.cpp index c8924b54f4bf70..4a2cb18b92e07e 100644 --- a/aten/src/TH/THFile.cpp +++ b/aten/src/TH/THFile.cpp @@ -140,12 +140,12 @@ IMPLEMENT_THFILE_SCALAR(Half, THHalf) #define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \ size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ { \ - return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size()); \ + return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->numel()); \ } \ \ size_t THFile_write##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ { \ - return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size()); \ + return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->numel()); \ } IMPLEMENT_THFILE_STORAGE(Byte, uint8_t) diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index 5ff85eb2c8f40b..fb68639ec44752 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -2,40 +2,22 @@ #define TH_HALF_H #include -#include -/* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ 
-#if defined(__GNUC__) -#define __thalign__(n) __attribute__((aligned(n))) -#elif defined(_WIN32) -#define __thalign__(n) __declspec(align(n)) -#else -#define __thalign__(n) +#ifdef __cplusplus +#include #endif -typedef struct __thalign__(2){ - unsigned short x; -} __THHalf; - -typedef struct __thalign__(4) { - unsigned int x; -} __THHalf2; - -typedef __THHalf THHalf; -typedef __THHalf2 THHalf2; +#ifdef __cplusplus +#define THHalf at::Half +#else +typedef struct at_Half at_Half; +#define THHalf at_Half +#endif TH_API void TH_float2halfbits(float*, unsigned short*); TH_API void TH_halfbits2float(unsigned short*, float*); TH_API THHalf TH_float2half(float); -TH_API float TH_half2float(THHalf); - -#ifndef TH_HALF_BITS_TO_LITERAL -# define TH_HALF_BITS_TO_LITERAL(n) { n } -#endif - -#define TH_HALF_ZERO 0x0U -#define TH_HALF_INF 0x7C00U +TH_API float TH_half2float(THHalf); -#undef __thalign__ #endif diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index 011c1d1f54aaee..3f2187b68f74ea 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -56,7 +56,7 @@ static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) return; else { - if(size < self->storage->size()) /* note the "<" and not "<=" */ + if(size < self->storage->numel()) /* note the "<" and not "<=" */ { self->size = size; THCharStorage_data(self->storage)[self->size] = '\0'; @@ -64,10 +64,10 @@ static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) } } - missingSpace = size-self->storage->size()+1; /* +1 for the '\0' */ - THCharStorage_resize(self->storage, (self->storage->size()/2 > missingSpace ? - self->storage->size() + (self->storage->size()/2) - : self->storage->size() + missingSpace)); + missingSpace = size-self->storage->numel()+1; /* +1 for the '\0' */ + THCharStorage_resize(self->storage, (self->storage->numel()/2 > missingSpace ? + self->storage->numel() + (self->storage->numel()/2) + : self->storage->numel() + missingSpace)); } static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) @@ -188,12 +188,12 @@ static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) while (1) \ { \ ASCII_WRITE_ELEM; \ - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size()-mfself->position) ) \ + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->numel()-mfself->position) ) \ { \ mfself->position += nByteWritten; \ break; \ } \ - THMemoryFile_grow(mfself, mfself->storage->size() + (mfself->storage->size()/2) + 2); \ + THMemoryFile_grow(mfself, mfself->storage->numel() + (mfself->storage->numel()/2) + 2); \ } \ if(mfself->file.isAutoSpacing) \ { \ @@ -297,7 +297,7 @@ static void THMemoryFile_free(THFile *self) /* READ_WRITE_METHODS(bool, Bool, */ /* int value = 0; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &value, &nByteRead); data[i] = (value ? 1 : 0), */ -/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%d", value), */ +/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%d", value), */ /* 1) */ READ_WRITE_METHODS(uint8_t, Byte, @@ -307,7 +307,7 @@ READ_WRITE_METHODS(uint8_t, Byte, nread = ret; \ i = n-1; \ memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size()-mfself->position ? 
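After this change THHalf is simply at::Half in C++ translation units (and an opaque at_Half struct in C), with only the TH_float2half/TH_half2float conversion entry points kept. A small round-trip sketch against those declared helpers, assuming TH is linked and the header is reachable as TH/THHalf.h:

#include <cstdio>
#include <TH/THHalf.h>  // THHalf is at::Half when this is compiled as C++

int main() {
  float in = 0.333f;
  THHalf h = TH_float2half(in);   // declared in THHalf.h
  float out = TH_half2float(h);   // back to float, with half-precision rounding
  std::printf("%f -> %f\n", in, out);
  return 0;
}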
n : -1); \ + nByteWritten = (n < mfself->storage->numel()-mfself->position ? n : -1); \ i = n-1; \ if(nByteWritten > -1) memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), @@ -322,7 +322,7 @@ READ_WRITE_METHODS(int8_t, Char, nread = ret; \ i = n-1; \ memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size()-mfself->position ? n : -1); \ + nByteWritten = (n < mfself->storage->numel()-mfself->position ? n : -1); \ i = n-1; \ if(nByteWritten > -1) memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), @@ -330,29 +330,29 @@ READ_WRITE_METHODS(int8_t, Char, READ_WRITE_METHODS(int16_t, Short, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%hd%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%hd", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%hd", data[i]), 1) READ_WRITE_METHODS(int32_t, Int, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%d", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%d", data[i]), 1) READ_WRITE_METHODS(float, Float, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.9g", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", data[i]), 1) READ_WRITE_METHODS(THHalf, Half, int nByteRead_; float buf; \ int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.9g", TH_half2float(data[i])), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", TH_half2float(data[i])), 1) READ_WRITE_METHODS(double, Double, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.17g", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.17g", data[i]), 1) static ssize_t THMemoryFile_readLong(THFile *self, int64_t *data, ssize_t n) @@ -491,13 +491,13 @@ static ssize_t THMemoryFile_writeLong(THFile *self, int64_t *data, ssize_t n) ssize_t nByteWritten; while 
(1) { - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%" PRId64, data[i]); - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size()-mfself->position) ) + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%" PRId64, data[i]); + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->numel()-mfself->position) ) { mfself->position += nByteWritten; break; } - THMemoryFile_grow(mfself, mfself->storage->size() + (mfself->storage->size()/2) + 2); + THMemoryFile_grow(mfself, mfself->storage->numel() + (mfself->storage->numel()/2) + 2); } if(mfself->file.isAutoSpacing) { @@ -654,7 +654,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) if(storage) { - THArgCheck(THCharStorage_data(storage)[storage->size()-1] == '\0', 1, "provided CharStorage must be terminated by 0"); + THArgCheck(THCharStorage_data(storage)[storage->numel()-1] == '\0', 1, "provided CharStorage must be terminated by 0"); THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); THCharStorage_retain(storage); } @@ -668,7 +668,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) mfself = static_cast(THAlloc(sizeof(THMemoryFile))); mfself->storage = storage; - mfself->size = (storage ? storage->size()-1 : 0); + mfself->size = (storage ? storage->numel()-1 : 0); mfself->position = 0; mfself->longSize = 0; diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index b0e4abe9329db7..a5319e67dabe61 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -34,7 +34,7 @@ void THStorage_free(THStorage* storage) { ptrdiff_t THStorage_size(const THStorage *self) { - return self->size(); + return self->numel(); } void THStorage_retain(THStorage *storage) @@ -49,21 +49,21 @@ void THStorage_resize(THStorage* storage, ptrdiff_t size) { /* case when the allocator does not have a realloc defined */ at::DataPtr new_data; if (size != 0) { - new_data = storage->allocator()->allocate(storage->elementSize() * size); + new_data = storage->allocator()->allocate(storage->itemsize() * size); } at::DataPtr old_data = storage->set_data_ptr(std::move(new_data)); - ptrdiff_t old_size = storage->size(); - storage->set_size(size); + ptrdiff_t old_size = storage->numel(); + storage->set_numel(size); if (old_data != nullptr) { ptrdiff_t copy_size = old_size; - if (storage->size() < copy_size) { - copy_size = storage->size(); + if (storage->numel() < copy_size) { + copy_size = storage->numel(); } if (copy_size > 0) { memcpy( storage->data(), old_data.get(), - storage->elementSize() * copy_size); + storage->itemsize() * copy_size); } } } else { diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 9fe0db5e5497f9..362fa6e2c83de5 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -8,7 +8,6 @@ #include #include -#include "THTypeConversion.hpp" #include // Note [Weak references for intrusive refcounting] diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 1b1f493ac4e289..0c731779b95685 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -125,7 +125,7 @@ void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, cons if(!THTensor_getStoragePtr(self)) { THTensor_stealAndSetStoragePtr(self, 
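THStorage_resize keeps the same shape after the rename: allocate itemsize() times the new numel through the storage's allocator, swap the data pointer, update numel, and copy back min(old numel, new numel) elements if there was old data. A freestanding sketch of that resize policy; illustrative only, since the real code works with at::DataPtr and the storage's allocator rather than malloc/free:

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Hypothetical mirror of the resize-with-copy policy in THStorage_resize above.
struct RawStorage {
  void* data = nullptr;
  int64_t numel = 0;
  size_t itemsize = sizeof(float);
};

void resize_storage(RawStorage& s, int64_t new_numel) {
  void* new_data = new_numel != 0 ? std::malloc(s.itemsize * new_numel) : nullptr;
  if (s.data && new_data) {
    const int64_t copy_numel = std::min(s.numel, new_numel);  // keep the overlapping prefix
    std::memcpy(new_data, s.data, s.itemsize * copy_numel);
  }
  std::free(s.data);  // the real code releases the old at::DataPtr instead
  s.data = new_data;
  s.numel = new_numel;
}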
THStorage_new(self->scalar_type())); } - if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size()) { + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THStorage_resize(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } diff --git a/aten/src/TH/THTypeConversion.hpp b/aten/src/TH/THTypeConversion.hpp deleted file mode 100644 index d40169e7180e58..00000000000000 --- a/aten/src/TH/THTypeConversion.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include -#include "THHalf.h" - -// Type traits to convert types to TH-specific types. Used primarily to -// convert at::Half to TH's half type. This makes the conversion explicit. -// FIXME: we should just use the same type - -namespace th { - -template -struct FromTypeConversion { - using type = T; -}; - -template <> -struct FromTypeConversion { - using type = at::Half; -}; - -template -using from_type = typename FromTypeConversion::type; -} diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 21431ef778d5a0..992cbd5bb7509f 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,13 +21,13 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType>::to()); + return THStorage_new(at::CTypeToScalarType::to()); } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, getTHDefaultAllocator(), true).release(); @@ -38,7 +38,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, allocator, true).release(); @@ -48,7 +48,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - auto scalar_type = at::CTypeToScalarType>::to(); + auto scalar_type = at::CTypeToScalarType::to(); size_t actual_size = -1; THStorage* storage = c10::make_intrusive( at::scalarTypeToDataType(scalar_type), @@ -59,7 +59,7 @@ THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int false).release(); if (size <= 0) { - storage->set_size(actual_size / at::elementSize(scalar_type)); + storage->set_numel(actual_size / at::elementSize(scalar_type)); } return storage; @@ -116,7 +116,7 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, std::move(data), allocator, @@ -132,19 +132,19 @@ void THStorage_(resize)(THStorage *storage, ptrdiff_t size) void THStorage_(fill)(THStorage *storage, real value) { ptrdiff_t i; - for(i = 0; i < storage->size(); i++) + for(i = 0; i < storage->numel(); i++) THStorage_(data)(storage)[i] = value; } void THStorage_(set)(THStorage *self, ptrdiff_t idx, real value) { - THArgCheck((idx >= 0) && (idx < self->size()), 2, "out of bounds"); + THArgCheck((idx >= 0) && (idx < self->numel()), 2, "out of bounds"); THStorage_(data)(self)[idx] = value; } real THStorage_(get)(const THStorage *self, ptrdiff_t idx) { - THArgCheck((idx >= 0) && (idx < 
self->size()), 2, "out of bounds"); + THArgCheck((idx >= 0) && (idx < self->numel()), 2, "out of bounds"); return THStorage_(data)(self)[idx]; } diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index 0cde162d4c2843..442f7dbde2925d 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -6,13 +6,13 @@ void THStorage_(rawCopy)(THStorage *storage, real *src) { ptrdiff_t i; real *data = THStorage_(data)(storage); - for(i = 0; i < storage->size(); i++) + for(i = 0; i < storage->numel(); i++) data[i] = src[i]; } void THStorage_(copy)(THStorage *storage, THStorage *src) { - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); THStorage_(rawCopy)(storage, THStorage_(data)(src)); } @@ -25,40 +25,40 @@ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = static_cast(src_data[i]); \ } #define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = (real)TH_half2float(src_data[i]); \ } #define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = TH_float2half((float)(src_data[i])); \ } #define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = static_cast(src_data[i]); \ } diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index a9a1790c58c830..96e3938e20b0f9 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -20,17 +20,17 @@ void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) if (!self->resizable()) THError("Trying to resize storage that is not resizable"); - size_t elementSize = self->elementSize(); + size_t itemsize = self->itemsize(); if(size == 0) { self->set_data_ptr(at::DataPtr(nullptr, at::Device(at::DeviceType::CUDA, device))); - self->set_size(0); + self->set_numel(0); } else { at::DataPtr data = - self->allocator()->allocate(size * elementSize); + self->allocator()->allocate(size * itemsize); if (self->data_ptr()) { // Enable p2p access when the memcpy is across devices @@ 
-38,14 +38,14 @@ void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) THCudaCheck(cudaMemcpyAsync(data.get(), self->data(), - THMin(self->size(), size) * elementSize, + THMin(self->numel(), size) * itemsize, cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); } // Destructively overwrite data_ptr self->set_data_ptr(std::move(data)); - self->set_size(size); + self->set_numel(size); } } diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 3826ea57fc5da3..de787bd380b6e6 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -148,7 +148,7 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, const if(!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size()) { + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THCStorage_resize(state, THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index aef30d62517061..feb2e94959abf2 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -21,7 +21,7 @@ int THCStorage_(elementSize)(THCState *state) void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value) { - THArgCheck((index >= 0) && (index < self->size()), 2, "index out of bounds"); + THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(real), cudaMemcpyHostToDevice, @@ -31,7 +31,7 @@ void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real v real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) { - THArgCheck((index >= 0) && (index < self->size()), 2, "index out of bounds"); + THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); real value; cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(real), diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index a6b3bf557e2f63..95f2bc7163d46f 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -10,7 +10,7 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, real value) #if CUDA_VERSION >= 7000 thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), #endif - self_data, self_data+self->size(), value); + self_data, self_data+self->numel(), value); } void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) diff --git a/aten/src/THC/generic/THCStorageCopy.cpp b/aten/src/THC/generic/THCStorageCopy.cpp index 9194ab7d3c80d4..546777baaf98c7 100644 --- a/aten/src/THC/generic/THCStorageCopy.cpp +++ b/aten/src/THC/generic/THCStorageCopy.cpp @@ -4,11 +4,11 @@ void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src) { - THArgCheck(self->size() == src->size(), 2, "size does not match"); + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), THStorage_(data)(src), - self->size() * sizeof(real), + self->numel() * sizeof(real), cudaMemcpyHostToDevice, stream)); THCudaCheck(cudaStreamSynchronize(stream)); 
@@ -18,9 +18,9 @@ void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *s void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ { \ THCTensor* selfTensor = \ - THCTensor_(newWithStorage1d)(state, self, 0, self->size(), 1); \ + THCTensor_(newWithStorage1d)(state, self, 0, self->numel(), 1); \ struct TH##TYPEC##Tensor* srcTensor = \ - TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->size(), 1); \ + TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->numel(), 1); \ THCTensor_(copy##TYPEC)(state, selfTensor, srcTensor); \ TH##TYPEC##Tensor_free(srcTensor); \ THCTensor_(free)(state, selfTensor); \ @@ -36,11 +36,11 @@ TH_CUDA_STORAGE_IMPLEMENT_COPY(Double) void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src) { - THArgCheck(self->size() == src->size(), 2, "size does not match"); + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THStorage_(data)(self), THCStorage_(data)(state, src), - self->size() * sizeof(real), + self->numel() * sizeof(real), cudaMemcpyDeviceToHost, stream)); THCudaCheck(cudaStreamSynchronize(stream)); @@ -50,9 +50,9 @@ void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *s void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ { \ TH##TYPEC##Tensor* selfTensor = \ - TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->size(), 1); \ + TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->numel(), 1); \ struct THCTensor* srcTensor = \ - THCTensor_(newWithStorage1d)(state, src, 0, src->size(), 1); \ + THCTensor_(newWithStorage1d)(state, src, 0, src->numel(), 1); \ TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(state, selfTensor, srcTensor); \ THCTensor_(free)(state, srcTensor); \ TH##TYPEC##Tensor_free(selfTensor); \ diff --git a/aten/src/THC/generic/THCStorageCopy.cu b/aten/src/THC/generic/THCStorageCopy.cu index bea4fe699623fb..962167c73b82c8 100644 --- a/aten/src/THC/generic/THCStorageCopy.cu +++ b/aten/src/THC/generic/THCStorageCopy.cu @@ -4,17 +4,17 @@ void THCStorage_(rawCopy)(THCState *state, THCStorage *self, real *src) { - THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->size() * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->numel() * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); } // conversions are delegated to THCTensor implementation #define THC_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src) \ { \ - THArgCheck(self->size() == src->size(), 2, "size does not match"); \ - THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->size(), 1); \ + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); \ + THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->numel(), 1); \ struct THCuda##TYPECUDA##Tensor* srcTensor = \ - THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->size(), 1); \ + THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->numel(), 1); \ THCTensor_(copyCuda##TYPEC)(state, selfTensor, srcTensor); \ THCuda##TYPECUDA##Tensor_free(state, srcTensor); \ THCTensor_(free)(state, selfTensor); \ diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu 
b/aten/src/THC/generic/THCTensorMathMagma.cu index aee04a8e22a4e4..3b63c3ae1c7b2f 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -235,7 +235,13 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T else if (info < 0) THError("MAGMA syev : Argument %d : illegal value", -info); } - THCTensor_(freeCopyTo)(state, input, rv_); + if (jobzs[0] == 'N') { + // If eigenvector is not needed, fill the result with zeros. + THCTensor_(zero)(state, rv_); + THCTensor_(free)(state, input); + } else { + THCTensor_(freeCopyTo)(state, input, rv_); + } #else THError(NoMagma(syev)); #endif diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index 7a441dc1c5c2ee..daf3ccac90eecf 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -14,8 +14,9 @@ * limitations under the License. */ -#include #include +#include +#include #include #include "binaries/benchmark_helper.h" @@ -309,3 +310,88 @@ void writeOutput( } } } + +int benchmark( + int argc, + char* argv[], + const string& FLAGS_backend, + const string& FLAGS_init_net, + const string& FLAGS_input, + const string& FLAGS_input_dims, + const string& FLAGS_input_file, + const string& FLAGS_input_type, + int FLAGS_iter, + const string& FLAGS_net, + const string& FLAGS_output, + const string& FLAGS_output_folder, + bool FLAGS_run_individual, + int FLAGS_sleep_before_run, + bool FLAGS_text_output, + int FLAGS_warmup, + bool FLAGS_wipe_cache) { + caffe2::GlobalInit(&argc, &argv); + // Check arguments to be correct + { + // Need to check whether file exists, as the file reader does not assert if + // file does not exist + std::ifstream net_file(FLAGS_net); + CAFFE_ENFORCE(net_file.good()); + + std::ifstream init_net_file(FLAGS_init_net); + CAFFE_ENFORCE(init_net_file.good()); + + if (FLAGS_input_file.size() > 0) { + vector input_files = caffe2::split(',', FLAGS_input_file); + for (auto input_file : input_files) { + std::ifstream ifile(input_file); + CAFFE_ENFORCE(ifile.good()); + } + } + } + + observerConfig(); + caffe2::ShowLogInfoToStderr(); + + auto workspace = std::make_shared(new caffe2::Workspace()); + bool run_on_gpu = backendCudaSet(FLAGS_backend); + // Run initialization network. + caffe2::NetDef init_net_def; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def)); + setOperatorEngine(&init_net_def, FLAGS_backend); + CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); + + // Run main network. 
+ caffe2::NetDef net_def; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def)); + setOperatorEngine(&net_def, FLAGS_backend); + + map tensor_protos_map; + + loadInput( + workspace, + run_on_gpu, + tensor_protos_map, + FLAGS_input, + FLAGS_input_file, + FLAGS_input_dims, + FLAGS_input_type); + + runNetwork( + workspace, + net_def, + tensor_protos_map, + FLAGS_wipe_cache, + FLAGS_run_individual, + FLAGS_warmup, + FLAGS_iter, + FLAGS_sleep_before_run); + + writeOutput( + workspace, + run_on_gpu, + FLAGS_output, + FLAGS_output_folder, + FLAGS_text_output); + + return 0; +} diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h index df23ed8651118a..5bf79182dab7e1 100644 --- a/binaries/benchmark_helper.h +++ b/binaries/benchmark_helper.h @@ -98,3 +98,21 @@ void runNetwork( const int, const int, const int); +int benchmark( + int argc, + char* argv[], + const string& FLAGS_backend, + const string& FLAGS_init_net, + const string& FLAGS_input, + const string& FLAGS_input_dims, + const string& FLAGS_input_file, + const string& FLAGS_input_type, + int FLAGS_iter, + const string& FLAGS_net, + const string& FLAGS_output, + const string& FLAGS_output_folder, + bool FLAGS_run_individual, + int FLAGS_sleep_before_run, + bool FLAGS_text_output, + int FLAGS_warmup, + bool FLAGS_wipe_cache); diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc index c5a93ae7cbae33..38badccfa1e4bb 100644 --- a/binaries/caffe2_benchmark.cc +++ b/binaries/caffe2_benchmark.cc @@ -77,51 +77,22 @@ CAFFE2_DEFINE_bool( "Whether to evict the cache before running network."); int main(int argc, char** argv) { - caffe2::GlobalInit(&argc, &argv); - - observerConfig(); - caffe2::ShowLogInfoToStderr(); - - auto workspace = make_shared(new caffe2::Workspace()); - bool run_on_gpu = backendCudaSet(caffe2::FLAGS_backend); - // Run initialization network. - caffe2::NetDef init_net_def; - CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def)); - setOperatorEngine(&init_net_def, caffe2::FLAGS_backend); - CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); - - // Run main network. - caffe2::NetDef net_def; - CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def)); - setOperatorEngine(&net_def, caffe2::FLAGS_backend); - - map tensor_protos_map; - - loadInput( - workspace, - run_on_gpu, - tensor_protos_map, + benchmark( + argc, + argv, + caffe2::FLAGS_backend, + caffe2::FLAGS_init_net, caffe2::FLAGS_input, - caffe2::FLAGS_input_file, caffe2::FLAGS_input_dims, - caffe2::FLAGS_input_type); - - runNetwork( - workspace, - net_def, - tensor_protos_map, - caffe2::FLAGS_wipe_cache, - caffe2::FLAGS_run_individual, - caffe2::FLAGS_warmup, + caffe2::FLAGS_input_file, + caffe2::FLAGS_input_type, caffe2::FLAGS_iter, - caffe2::FLAGS_sleep_before_run); - - writeOutput( - workspace, - run_on_gpu, + caffe2::FLAGS_net, caffe2::FLAGS_output, caffe2::FLAGS_output_folder, - caffe2::FLAGS_text_output); - - return 0; + caffe2::FLAGS_run_individual, + caffe2::FLAGS_sleep_before_run, + caffe2::FLAGS_text_output, + caffe2::FLAGS_warmup, + caffe2::FLAGS_wipe_cache); } diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc index ec7518630c9525..a394f91c729b87 100644 --- a/caffe2/core/logging.cc +++ b/caffe2/core/logging.cc @@ -236,6 +236,12 @@ MessageLogger::~MessageLogger() { if (severity_ >= FLAGS_caffe2_log_level) { // If not building on Android, log all output to std::cerr. 
std::cerr << stream_.str(); + // Simulating the glog default behavior: if the severity is above INFO, + // we flush the stream so that the output appears immediately on std::cerr. + // This is expected in some of our tests. + if (severity_ > INFO) { + std::cerr << std::flush; + } } #endif // ANDROID if (severity_ == FATAL) { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h index 8560ff82374d9a..8c24a2e2cb1076 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h @@ -11,15 +11,15 @@ namespace repr { class CAFFE2_API Value { public: enum class ValueKind { Value, Instruction, Data }; - Value(ValueKind K) : Kind(K) {} - Value() : Kind(ValueKind::Value) {} + Value(ValueKind K) : kind_(K) {} + Value() : kind_(ValueKind::Value) {} ValueKind getKind() const { - return Kind; + return kind_; } virtual ~Value() = default; private: - const ValueKind Kind; + const ValueKind kind_; }; class CAFFE2_API Data : public Value { @@ -30,15 +30,15 @@ class CAFFE2_API Data : public Value { } virtual ~Data() = default; size_t getVersion() const { - return Version; + return version_; } void setVersion(size_t version) { - Version = version; + version_ = version; } private: - size_t Version = 0; + size_t version_ = 0; }; class CAFFE2_API Instruction : public Value { @@ -52,18 +52,18 @@ class CAFFE2_API Instruction : public Value { TerminatorEnd, Phi }; - Instruction() : Value(ValueKind::Instruction), Op(Opcode::Generic) {} - Instruction(Opcode op) : Value(ValueKind::Instruction), Op(op) {} + Instruction() : Value(ValueKind::Instruction), op_(Opcode::Generic) {} + Instruction(Opcode op) : Value(ValueKind::Instruction), op_(op) {} CAFFE2_API static bool classof(const Value* V) { return V->getKind() == ValueKind::Instruction; } virtual ~Instruction() = default; Opcode getOpcode() const { - return Op; + return op_; } private: - Opcode Op; + Opcode op_; }; class CAFFE2_API Terminator : public Instruction { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h index 835f187febf15d..1934b1f1b7bad4 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h @@ -19,45 +19,45 @@ class CAFFE2_API BasicBlock { using NodeRef = typename Subgraph::NodeRef; BasicBlock() {} ~BasicBlock() { - for (auto pair : callbacks) { + for (auto pair : callbacks_) { pair.first->deleteDestructorCallback(pair.second); } } void trackNode(NodeRef node) { - callbacks[node] = node->registerDestructorCallback([&](NodeRef n) { + callbacks_[node] = node->registerDestructorCallback([&](NodeRef n) { assert( hasInstruction(n) && "Destructor callback invoked on untracked node in BasicBlock."); deleteInstruction(n); }); - Nodes.addNode(node); + nodes_.addNode(node); } void untrackNode(NodeRef node) { - callbacks.erase(node); - Nodes.removeNode(node); + callbacks_.erase(node); + nodes_.removeNode(node); } void pushInstructionNode(NodeRef node) { assert( isa(node->data()) && "Cannot push non-instruction node to basic block."); - Instructions.emplace_back(node); + instructions_.emplace_back(node); trackNode(node); } const std::vector& getInstructions() { - return Instructions; + return instructions_; } bool hasInstruction(NodeRef instr) const { - return 
Nodes.hasNode(instr); + return nodes_.hasNode(instr); } void insertInstructionBefore(NodeRef newInstr, NodeRef instr) { auto it = - std::find(std::begin(Instructions), std::end(Instructions), instr); - Instructions.insert(it, newInstr); + std::find(std::begin(instructions_), std::end(instructions_), instr); + instructions_.insert(it, newInstr); trackNode(newInstr); } @@ -65,28 +65,28 @@ class CAFFE2_API BasicBlock { assert(hasInstruction(instr1) && "Instruction not in basic block."); assert(hasInstruction(instr2) && "Instruction not in basic block."); auto it1 = - std::find(std::begin(Instructions), std::end(Instructions), instr1); + std::find(std::begin(instructions_), std::end(instructions_), instr1); auto it2 = - std::find(std::begin(Instructions), std::end(Instructions), instr2); - Instructions.erase(it1); - Instructions.insert(it2, instr1); + std::find(std::begin(instructions_), std::end(instructions_), instr2); + instructions_.erase(it1); + instructions_.insert(it2, instr1); } void deleteInstruction(NodeRef instr) { assert(hasInstruction(instr) && "Instruction not in basic block."); - Instructions.erase( - std::remove(Instructions.begin(), Instructions.end(), instr), - Instructions.end()); + instructions_.erase( + std::remove(instructions_.begin(), instructions_.end(), instr), + instructions_.end()); untrackNode(instr); } private: - Subgraph Nodes; - std::vector Instructions; + Subgraph nodes_; + std::vector instructions_; // Because we reference a dataflow graph, we need to register callbacks // for when the dataflow graph is modified. std::unordered_map>::Callback*> - callbacks; + callbacks_; }; using Program = Graph; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 1f7e2c27906c99..b1e9283bc9ccee 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -45,19 +45,19 @@ class CAFFE2_API Annotation { public: enum class AnnotationKind { Generic, Caffe2 }; - Annotation(AnnotationKind K) : Kind(K) {} - Annotation() : Kind(AnnotationKind::Generic) {} + Annotation(AnnotationKind kind) : kind_(kind) {} + Annotation() : kind_(AnnotationKind::Generic) {} virtual ~Annotation() {} AnnotationKind getKind() const { - return Kind; + return kind_; } Annotation(const Annotation&) = delete; Annotation& operator=(Annotation&) = delete; private: - const AnnotationKind Kind; + const AnnotationKind kind_; }; class CAFFE2_API NeuralNetOperator : public Instruction { @@ -75,36 +75,38 @@ class CAFFE2_API NeuralNetOperator : public Instruction { enum class NNLayout { Undefined, NCHW, NHWC }; NeuralNetOperator(NNKind K, Opcode I, NNLayout L) - : Instruction(I), Kind(K), Layout(L) {} + : Instruction(I), kind_(K), layout_(L) {} NeuralNetOperator(NNKind K, Opcode I) - : Instruction(I), Kind(K), Layout(NNLayout::Undefined) {} - NeuralNetOperator(NNKind K, NNLayout L) : Instruction(), Kind(K), Layout(L) {} + : Instruction(I), kind_(K), layout_(NNLayout::Undefined) {} + NeuralNetOperator(NNKind K, NNLayout L) + : Instruction(), kind_(K), layout_(L) {} NeuralNetOperator(NNKind K) - : Instruction(), Kind(K), Layout(NNLayout::Undefined) {} + : Instruction(), kind_(K), layout_(NNLayout::Undefined) {} NeuralNetOperator() - : Instruction(), Kind(NNKind::Undefined), Layout(NNLayout::Undefined) {} + : Instruction(), kind_(NNKind::Undefined), layout_(NNLayout::Undefined) {} NNKind getKind() const { - return 
Kind; + return kind_; } void setLayout(NNLayout L) { - Layout = L; + layout_ = L; } NNLayout getLayout() const { - return Layout; + return layout_; } void setAnnotation(std::unique_ptr extraAnnotation) { - ExtraAnnotation = std::move(extraAnnotation); + extraAnnotation_ = std::move(extraAnnotation); } const Annotation* getAnnotation() const { - return ExtraAnnotation.get(); + return extraAnnotation_.get(); } + Annotation* getMutableAnnotation() { - return ExtraAnnotation.get(); + return extraAnnotation_.get(); } const std::string getName() const; @@ -128,9 +130,9 @@ class CAFFE2_API NeuralNetOperator : public Instruction { NeuralNetOperator& operator=(NeuralNetOperator&) = delete; private: - const NNKind Kind; - NNLayout Layout; // Mutable attribute, much like a type cast - std::unique_ptr ExtraAnnotation; + const NNKind kind_; + NNLayout layout_; // Mutable attribute, much like a type cast + std::unique_ptr extraAnnotation_; }; class CAFFE2_API NeuralNetData : public Data { @@ -138,12 +140,12 @@ class CAFFE2_API NeuralNetData : public Data { /// Discriminator for LLVM-style RTTI (isa<>) enum class NNDataKind { Generic, Tensor }; - NeuralNetData(NNDataKind kind) : Kind(kind) {} + NeuralNetData(NNDataKind kind) : kind_(kind) {} - NeuralNetData() : Kind(NNDataKind::Generic) {} + NeuralNetData() : kind_(NNDataKind::Generic) {} NNDataKind getKind() const { - return Kind; + return kind_; } virtual NeuralNetData* clone() = 0; @@ -153,8 +155,8 @@ class CAFFE2_API NeuralNetData : public Data { virtual ~NeuralNetData() = 0; private: - NNDataKind Kind; - size_t Version = 0; + NNDataKind kind_; + size_t version_ = 0; }; class CAFFE2_API Tensor : public NeuralNetData { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h b/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h index cef1bdec522a56..91e4c2f6e01e87 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h @@ -71,13 +71,13 @@ class Notifier { Notifier() {} Callback* registerDestructorCallback(Callback fn) { - DtorCallbacks.emplace_back(fn); - return &DtorCallbacks.back(); + dtorCallbacks_.emplace_back(fn); + return &dtorCallbacks_.back(); } Callback* registerNotificationCallback(Callback fn) { - NotifCallbacks.emplace_back(fn); - return &NotifCallbacks.back(); + notifCallbacks_.emplace_back(fn); + return ¬ifCallbacks_.back(); } void deleteCallback(std::list& callbackList, Callback* toDelete) { @@ -90,11 +90,11 @@ class Notifier { } void deleteDestructorCallback(Callback* c) { - deleteCallback(DtorCallbacks, c); + deleteCallback(dtorCallbacks_, c); } void deleteNotificationCallback(Callback* c) { - deleteCallback(NotifCallbacks, c); + deleteCallback(notifCallbacks_, c); } /// \brief Notifies all listeners (`registerNotificationCallback` @@ -102,20 +102,20 @@ class Notifier { /// is encoded in the state of the derived class, thus only passing /// a pointer of type T* to the callback. 
void notify() { - for (auto callback : NotifCallbacks) { + for (auto callback : notifCallbacks_) { callback(reinterpret_cast(this)); } } virtual ~Notifier() { - for (auto callback : DtorCallbacks) { + for (auto callback : dtorCallbacks_) { callback(reinterpret_cast(this)); } } private: - std::list DtorCallbacks; - std::list NotifCallbacks; + std::list dtorCallbacks_; + std::list notifCallbacks_; }; #endif /* NOM_SUPPORT_COMMON_H */ diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 23740cfc5772e5..f1934f5ddbc28d 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -25,7 +25,7 @@ inline vector ToVectorTIndex(const std::vector& src) { } /** - * Return product of all dimensions starting from K + * Return product of all dimensions starting from k */ inline TIndex size_from_dim_(int k, const vector& dims) { TIndex r = 1; @@ -35,7 +35,7 @@ inline TIndex size_from_dim_(int k, const vector& dims) { return r; } -// Product of all dims up to +// Product of all dims up to k (not including dims[k]) inline TIndex size_to_dim_(int k, const vector& dims) { CAFFE_ENFORCE((unsigned)k <= dims.size()); TIndex r = 1; @@ -61,6 +61,7 @@ inline TIndex size_between_dim_(int k, int l, const vector& dims) { return r; } +// Wrap around axis_index if it is negative, s.t., -1 is the last dim inline int canonical_axis_index_(int axis_index, int ndims) { CAFFE_ENFORCE_GE(axis_index, -ndims); CAFFE_ENFORCE_LT(axis_index, ndims); @@ -274,9 +275,6 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER( num, 0, "`num` must be non-negative for Extend"); - CAFFE_ENFORCE( - storage_.use_count() == 1, - "Can't call Extend on shared storage, please call Resize instead"); auto newDims = dims_; newDims[0] += num; if (!storage_->data()) { diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index eb2d5b6acf1a61..25d4e16d2f9e7a 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -25,13 +25,21 @@ class IDEEPConcatOp final : public IDEEPOperator { virtual ~IDEEPConcatOp() {} bool RunOnDevice() override { - const auto& input_zero = Input(INPUT0); auto* output = Output(OUTPUT); TensorCPU* axis_info = OperatorBase::Output(AXIS_INFO, CPU); vector inputs; for (int i = 0; i < InputSize(); ++i) { - inputs.emplace_back(Input(i)); + if (OperatorBase::InputBlob(i).template IsType()) { + inputs.emplace_back(Input(i)); + } else { + CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsType(CPU), + "Expect cpu tensor if not itensor"); + auto& tensor_cpu = OperatorBase::Input(i, CPU); + CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || + tensor_cpu.size_from_dim(0) == 0, + "Expect zero dim tensor"); + } } auto axis_vdata = ideep::concat::compute(inputs, axis_, add_axis_, *output); diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc index 8251b386eeb3c7..75895c5d844345 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.cc +++ b/caffe2/ideep/operators/operator_fallback_ideep.cc @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -116,6 +118,12 @@ REGISTER_IDEEP_OPERATOR( REGISTER_IDEEP_OPERATOR( BBoxTransform, IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR( + AffineChannel, + IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR( + StopGradient, + IDEEPFallbackOp>); REGISTER_IDEEP_OPERATOR( PadImage, diff --git 
a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index ae4f903c23c2fc..31df729a217850 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -53,6 +53,8 @@ class IDEEPFallbackOp final : public IDEEPOperator { // then forward output blobs to local workspace. std::unordered_map forwarded_output_blobs; for (int i = 0; i < base_def_.output_size(); i++) { + // For in-place case, the in/output tensor for local_ws must be + // re-created, instead of forwarding from current workspace. string parent_name(base_def_.output(i)); if (!SkipOutputCopy::Contains(i)) { parent_name += "_cpu_output_blob_" + base_def_.type(); @@ -60,6 +62,13 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_output_blobs_.push_back(ws->CreateBlob(parent_name)); CHECK_NOTNULL(local_output_blobs_.back()); forwarded_output_blobs[base_def_.output(i)] = parent_name; + output_inplace_.push_back(false); + for (const string &input_name : base_def_.input()) { + if (input_name == base_def_.output(i)) { + output_inplace_[i] = true; + break; + } + } } local_ws_.reset(new Workspace(ws, forwarded_output_blobs)); // Set up the symbols for the local workspace. @@ -67,31 +76,26 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_input_blobs_.push_back(local_ws_->CreateBlob(name)); CHECK_NOTNULL(local_input_blobs_.back()); } + input_share_.resize(local_input_blobs_.size(), false); base_op_.reset(new CPUOp(base_def_, local_ws_.get())); } bool RunOnDevice() override { for (int i = 0; i < InputSize(); ++i) { - if (InputIsType(i) && Input(i).get_data_type() == itensor::data_type::f32) { + if (InputIsType(i) && + Input(i).get_data_type() == itensor::data_type::f32) { auto& input = Input(i); - auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); - dtensor->Resize(input.get_dims()); - if (input.is_public_format()) { - dtensor->ShareExternalPointer(static_cast(input.get_data_handle())); - } else { - input.reorder_to(dtensor->template mutable_data()); + if (input_share_[i]) { + local_input_blobs_[i]->Reset(); } - } else if ( - InputIsType(i) && - Input(i).get_data_type() == itensor::data_type::s32) { - auto& input = Input(i); + input_share_[i] = false; auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); dtensor->Resize(input.get_dims()); if (input.is_public_format()) { dtensor->ShareExternalPointer( - static_cast(input.get_data_handle())); + static_cast(input.get_data_handle())); } else { - input.reorder_to(dtensor->template mutable_data()); + input.reorder_to(dtensor->template mutable_data()); } } else { VLOG(1) << "Input " << i << " is not ideep::tensor. Skipping copy."; @@ -99,8 +103,9 @@ class IDEEPFallbackOp final : public IDEEPOperator { // local_input_blobs will only be used as const blob input for the // base op so we are still fine. 
local_input_blobs_[i]->ShareExternal( - const_cast(OperatorBase::Inputs()[i]->GetRaw()), + const_cast(OperatorBase::Inputs()[i]->GetRaw()), OperatorBase::Inputs()[i]->meta()); + input_share_[i] = true; } } @@ -120,21 +125,16 @@ class IDEEPFallbackOp final : public IDEEPOperator { "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); - auto src_dims = src.dims(); - if (src.ndim() == 0) { - VLOG(1) << "Copy output: index " << i << " skipped."; + if (src.template IsType() && + src.dims().size() != 0 && src.size_from_dim(0) != 0 && + base_op_->type() != "Python") { Blob* dst = OperatorBase::OutputBlob(i); - dst->Reset(new Tensor(CPU)); - auto dtensor = dst->GetMutableTensor(CPU); - dtensor->Resize(src_dims); - dtensor->ShareData(src); - continue; - } - - if (src.template IsType()) { - Blob* dst = OperatorBase::OutputBlob(i); - if (!dst->template IsType()) { + // The output tensor must be ideep tensor with public format. + // If reusing ideep tensor with non-public format, the tensor buffer + // will be interpreted incorrectly. + if (!dst->template IsType() || + !dst->template Get().is_public_format()) { dst->Reset(new itensor()); } @@ -143,7 +143,12 @@ class IDEEPFallbackOp final : public IDEEPOperator { if (dtensor->get_dims() != dst_dims) { dtensor->resize(dst_dims, itensor::data_type::f32); } - dtensor->set_data_handle(const_cast(src.raw_data())); + if (output_inplace_[i]) { + dtensor->reorder_from(dst_dims, itensor::data_type::f32, + const_cast(src.raw_data())); + } else { + dtensor->set_data_handle(const_cast(src.raw_data())); + } } else { VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor"; Blob* dst = OperatorBase::OutputBlob(i); @@ -159,6 +164,8 @@ class IDEEPFallbackOp final : public IDEEPOperator { protected: vector local_input_blobs_; vector local_output_blobs_; + vector output_inplace_; + vector input_share_; std::unique_ptr base_op_; std::unique_ptr local_ws_; OperatorDef base_def_; diff --git a/caffe2/mpi/mpi_common.h b/caffe2/mpi/mpi_common.h index 3e1e7a5625bd2e..b283a0aea382c3 100644 --- a/caffe2/mpi/mpi_common.h +++ b/caffe2/mpi/mpi_common.h @@ -4,6 +4,7 @@ #include #include +#include "caffe2/core/common.h" #include "caffe2/core/logging.h" namespace caffe2 { @@ -29,7 +30,7 @@ MPI_DATATYPE_WRAPPER(double, MPI_DOUBLE) #undef MPI_DATATYPE_WRAPPER // For all Caffe MPI calls, we will wrap it inside an MPI mutex lock guard. -std::mutex& MPIMutex(); +CAFFE2_API std::mutex& MPIMutex(); #define MPI_CHECK(condition) \ do { \ @@ -49,23 +50,23 @@ std::mutex& MPIMutex(); * @brief Gets the global MPI communicator used by Caffe2. In default, this * is MPI_COMM_WORLD unless you call SetGlobalMPIComm(). */ -MPI_Comm GlobalMPIComm(); +CAFFE2_API MPI_Comm GlobalMPIComm(); /** * @brief Sets the global MPI communicator. Caffe2 takes over the ownership * of the passed in communicator. */ -void SetGlobalMPIComm(MPI_Comm new_comm); +CAFFE2_API void SetGlobalMPIComm(MPI_Comm new_comm); /** * @brief A helper function to return the size of the given communicator. */ -int MPICommSize(MPI_Comm comm); +CAFFE2_API int MPICommSize(MPI_Comm comm); /** * @brief A helper function to return the rank of the given communicator. */ -int MPICommRank(MPI_Comm comm); +CAFFE2_API int MPICommRank(MPI_Comm comm); /** * @brief A simple wrapper over an MPI common world. 
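The mpi_common.h hunk only adds CAFFE2_API to declarations such as MPIMutex() and GlobalMPIComm() so they are exported from the Caffe2 shared library and reachable from other binaries. Export macros of this kind generally expand along the lines of the sketch below; the names (MYLIB_API, BUILDING_MYLIB, mylib_answer) are illustrative stand-ins, not the actual caffe2 macro definition:

```
// Typical export/import macro layout (illustrative only).
#if defined(_WIN32)
  #if defined(BUILDING_MYLIB)
    #define MYLIB_API __declspec(dllexport)  // building the DLL itself
  #else
    #define MYLIB_API __declspec(dllimport)  // consuming the DLL
  #endif
#else
  #define MYLIB_API __attribute__((visibility("default")))
#endif

// Annotated declarations stay visible even when the library is built with
// hidden default visibility, which is what the CAFFE2_API additions ensure.
MYLIB_API int mylib_answer();
```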
diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc index a8f4c91e7e5404..31256026028dfa 100644 --- a/caffe2/operators/concat_split_op.cc +++ b/caffe2/operators/concat_split_op.cc @@ -311,8 +311,8 @@ op = core.CreateOperator( axis=3 ) -workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW -workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW +workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5))) // NCHW +workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5))) // NCHW print("X1:", workspace.FetchBlob("X1")) print("X2:", workspace.FetchBlob("X2")) workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/conv_op.cc b/caffe2/operators/conv_op.cc index 082c94fb6c18fb..30fb79d3846942 100644 --- a/caffe2/operators/conv_op.cc +++ b/caffe2/operators/conv_op.cc @@ -42,24 +42,24 @@ op = core.CreateOperator( stride=2 ) -# Create X: (N,C,H,W) +// Create X: (N,C,H,W) data = np.random.randn(1,1,8,8).astype(np.float32) print("Data shape: ",data.shape) -# Create W: (M,C,Kh,Kw) +// Create W: (M,C,Kh,Kw) filters = np.random.randn(3,1,5,5).astype(np.float32) print("Filter shape: ",filters.shape) -# Create b: M +// Create b: M bias = np.array([1.,1.,1.]).astype(np.float32) print("Bias shape: ",bias.shape) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("filter", filters) workspace.FeedBlob("bias", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/conv_transpose_op.cc b/caffe2/operators/conv_transpose_op.cc index 57ec02b63ea0dd..7de16afaed9158 100644 --- a/caffe2/operators/conv_transpose_op.cc +++ b/caffe2/operators/conv_transpose_op.cc @@ -44,24 +44,24 @@ op = core.CreateOperator( strides=[2,2] ) -# Create X: (N,C,H,W) +// Create X: (N,C,H,W) data = np.random.randn(2,3,5,5).astype(np.float32) print("Data shape: ",data.shape) -# Create filter: (M,C,Kh,Kw) +// Create filter: (M,C,Kh,Kw) filters = np.random.randn(3,1,2,2).astype(np.float32) print("Filter shape: ",filters.shape) -# Create b: M +// Create b: M bias = np.array([1.]).astype(np.float32) print("Bias shape: ",bias.shape) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("filter", filters) workspace.FeedBlob("bias", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/counter_ops.cc b/caffe2/operators/counter_ops.cc index 15cdab5849cc1f..50e4b9448af310 100644 --- a/caffe2/operators/counter_ops.cc +++ b/caffe2/operators/counter_ops.cc @@ -58,22 +58,22 @@ resetcounter_op = core.CreateOperator( ) -# Create counter +// Create counter workspace.RunOperatorOnce(createcounter_op) print("'counter' pointer:", workspace.FetchBlob("counter")) -# Retrieve initial counter value +// Retrieve initial counter value workspace.RunOperatorOnce(retrievecount_op) print("Initial 'count':", workspace.FetchBlob("count")) -# Check if counter is done +// Check if counter is done workspace.RunOperatorOnce(checkcounterdone_op) print("Initial 'done' value:", workspace.FetchBlob("done")) -# Test CountUp operator +// Test CountUp operator print("\nTesting CountUp operator...") for i in range(5): workspace.RunOperatorOnce(countup_op) @@ -83,7 +83,7 @@ workspace.RunOperatorOnce(retrievecount_op) print("'count' value after CountUp test:", 
workspace.FetchBlob("count")) -# Test CountDown operator +// Test CountDown operator print("\nTesting CountDown operator...") for i in range(11): workspace.RunOperatorOnce(countdown_op) diff --git a/caffe2/operators/cross_entropy_op.cc b/caffe2/operators/cross_entropy_op.cc index 584b7abd5a183f..0473e7d4e435b3 100644 --- a/caffe2/operators/cross_entropy_op.cc +++ b/caffe2/operators/cross_entropy_op.cc @@ -401,22 +401,22 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) -# Create label: Sample 1-hot ground truth label vectors +// Create label: Sample 1-hot ground truth label vectors label = np.array([4,2]) print("label:\n",label) -# Feed X & label into workspace +// Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.int32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -635,22 +635,22 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) -# Create label: Sample 1-hot ground truth label vectors +// Create label: Sample 1-hot ground truth label vectors label = np.array([[0.,0.,0.,0.,1.],[0.,0.,1.,0.,0.]]) print("label:\n",label) -# Feed X & label into workspace +// Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc index d9abfa0e254336..9a38a4a77a0043 100644 --- a/caffe2/operators/distance_op.cc +++ b/caffe2/operators/distance_op.cc @@ -437,22 +437,22 @@ op = core.CreateOperator( ["Z"] ) -# Create X +// Create X X = 5*np.ones((1, 4)) print("X:\n",X) -# Create Y +// Create Y Y = np.ones((1, 4)) print("Y:\n",Y) -# Feed X & Y into workspace +// Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` @@ -645,22 +645,22 @@ op = core.CreateOperator( ["Z"] ) -# Create X +// Create X X = np.random.randn(3, 3) print("X:\n",X) -# Create Y +// Create Y Y = np.random.randn(3, 3) print("Y:\n",Y) -# Feed X & Y into workspace +// Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` diff --git a/caffe2/operators/elementwise_linear_op.cc b/caffe2/operators/elementwise_linear_op.cc index d68bfbc5a0eb93..371aae78a25201 100644 --- a/caffe2/operators/elementwise_linear_op.cc +++ b/caffe2/operators/elementwise_linear_op.cc @@ -112,28 +112,28 @@ op = core.CreateOperator( ["Y"] ) -# Create X +// Create X X = np.array([[1,2,3,4,5],[6,8,9,16,10]]) print("X:\n",X) -# Create w +// Create w w = np.array([1,1/2.,1/3.,1/4.,1/5.]) print("w:\n",w) -# Create b +// Create b b = np.array([1.,1.,1.,1.,1.]) print("b:\n",b) -# Feed X & w & b into workspace +// Feed X & 
w & b into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("w", w.astype(np.float32)) workspace.FeedBlob("b", b.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/elementwise_logical_ops.cc b/caffe2/operators/elementwise_logical_ops.cc index 5ddd4570356e9d..0e2da569dcb11f 100644 --- a/caffe2/operators/elementwise_logical_ops.cc +++ b/caffe2/operators/elementwise_logical_ops.cc @@ -63,7 +63,7 @@ op = core.CreateOperator( value=[0,2,4,6,8], ) -# Use a not-empty tensor +// Use a not-empty tensor workspace.FeedBlob("X", np.array([0,1,2,3,4,5,6,7,8]).astype(np.int32)) print("X:\n", workspace.FetchBlob("X")) @@ -75,7 +75,7 @@ print("Y: \n", workspace.FetchBlob("Y")) **Result** ``` -# value=[0,2,4,6,8] +// value=[0,2,4,6,8] X: [0 1 2 3 4 5 6 7 8] diff --git a/caffe2/operators/elementwise_sum_op.cc b/caffe2/operators/elementwise_sum_op.cc index 861f4f115c0a41..dee3671f5bdc4a 100644 --- a/caffe2/operators/elementwise_sum_op.cc +++ b/caffe2/operators/elementwise_sum_op.cc @@ -86,7 +86,7 @@ workspace.ResetWorkspace() op = core.CreateOperator( "Sum", ["A", "B"], - ["A"], # inplace + ["A"], // inplace ) workspace.FeedBlob("A", np.array([[1,2,5],[8,3,4]]).astype(np.float32)) diff --git a/caffe2/operators/filler_op.cc b/caffe2/operators/filler_op.cc index ff3eac217390a4..c5a121e3a222d6 100644 --- a/caffe2/operators/filler_op.cc +++ b/caffe2/operators/filler_op.cc @@ -298,11 +298,11 @@ op_2 = core.CreateOperator( input_as_shape=1 ) -# Test arg-based op +// Test arg-based op workspace.RunOperatorOnce(op_1) print("output (op_1):\n", workspace.FetchBlob("output")) -# Test input-based op +// Test input-based op workspace.ResetWorkspace() workspace.FeedBlob("shape", np.array([5,5])) workspace.FeedBlob("min", np.array(13.8, dtype=np.float32)) @@ -389,11 +389,11 @@ op_2 = core.CreateOperator( input_as_shape=1 ) -# Test arg-based op +// Test arg-based op workspace.RunOperatorOnce(op_1) print("output (op_1):\n", workspace.FetchBlob("output")) -# Test input-based op +// Test input-based op workspace.ResetWorkspace() workspace.FeedBlob("shape", np.array([5,5])) workspace.FeedBlob("min", np.array(13, dtype=np.int32)) diff --git a/caffe2/operators/fully_connected_op.cc b/caffe2/operators/fully_connected_op.cc index 6fe95eefbac476..e14fec6f8464b8 100644 --- a/caffe2/operators/fully_connected_op.cc +++ b/caffe2/operators/fully_connected_op.cc @@ -182,9 +182,9 @@ Github Links: ``` -# In this example, our batch size is 1 (M=1), the input observation will have -# 6 features (K=6), and the layer will have one hidden node (N=1). The -# expected output is Y=7. +// In this example, our batch size is 1 (M=1), the input observation will have +// 6 features (K=6), and the layer will have one hidden node (N=1). The +// expected output is Y=7. 
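+// Worked check of that expectation with the X, W, b defined below:
+//   Y = X.W^T + b = (1*1 + 2*(1/2) + 3*(1/3) + 4*(1/4) + 5*(1/5) + 6*(1/6)) + 1
+//     = (1 + 1 + 1 + 1 + 1 + 1) + 1 = 7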
workspace.ResetWorkspace() op = core.CreateOperator( @@ -193,23 +193,23 @@ op = core.CreateOperator( ["Y"] ) -# Create X: MxK +// Create X: MxK data = np.array([1,2,3,4,5,6]).astype(np.float32) data = data[np.newaxis,:] -# Create W: NxK +// Create W: NxK weights = np.array(np.array([1,1/2.,1/3.,1/4.,1/5.,1/6.])).astype(np.float32) weights = weights[np.newaxis,:] -# Create b: N +// Create b: N bias = np.array([1.]).astype(np.float32) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("W", weights) workspace.FeedBlob("b", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/gather_op.cc b/caffe2/operators/gather_op.cc index cee268ddafdcbd..34c42bfc983f84 100644 --- a/caffe2/operators/gather_op.cc +++ b/caffe2/operators/gather_op.cc @@ -37,7 +37,7 @@ print("DATA:\n",data) inds = np.array([[0, 1],[1, 2]]) print("INDICES:\n",inds) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("DATA", data.astype(np.float32)) workspace.FeedBlob("INDICES", inds.astype(np.int32)) diff --git a/caffe2/operators/hard_sigmoid_op.cu b/caffe2/operators/hard_sigmoid_op.cu new file mode 100644 index 00000000000000..ed3a4ec8286888 --- /dev/null +++ b/caffe2/operators/hard_sigmoid_op.cu @@ -0,0 +1,91 @@ +#include "caffe2/operators/hard_sigmoid_op.h" + +#include +#include + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +__global__ void HardSigmoidCUDAKernel( + const int N, + const T alpha, + const T beta, + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + Y[i] = max(T(0), min(T(1), alpha * __ldg(X + i) + beta)); +#else + Y[i] = max(T(0), min(T(1), alpha * X[i] + beta)); +#endif + } +} + +template +__global__ void HardSigmoidGradientCUDAKernel( + const int N, + const T alpha, + const T* dY, + const T* Y, + T* dX) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = (__ldg(Y + i) > T(0) && __ldg(Y + i) < T(1)) ? __ldg(dY + i) * alpha + : T(0); +#else + dX[i] = (Y[i] > T(0) && Y[i] < T(1)) ? 
dY[i] * alpha : T(0); +#endif + } +} + +} // namespace + +template <> +template +bool HardSigmoidFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + HardSigmoidCUDAKernel + <<cuda_stream()>>>(N, alpha, beta, X, Y); + return true; +} + +template <> +template +bool HardSigmoidGradientFunctor::Forward( + const std::vector& Y_dims, + const std::vector& /* dY_dims */, + const T* Y, + const T* dY, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies()); + HardSigmoidGradientCUDAKernel + <<cuda_stream()>>>(size, alpha, dY, Y, dX); + return true; +} + +REGISTER_CUDA_OPERATOR( + HardSigmoid, + UnaryElementwiseWithArgsOp< + TensorTypes, + CUDAContext, + HardSigmoidFunctor>); +REGISTER_CUDA_OPERATOR( + HardSigmoidGradient, + BinaryElementwiseWithArgsOp< + TensorTypes, + CUDAContext, + HardSigmoidGradientFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/local_response_normalization_op.cc b/caffe2/operators/local_response_normalization_op.cc index 1cba60e86d9787..81499b4a5d6abf 100644 --- a/caffe2/operators/local_response_normalization_op.cc +++ b/caffe2/operators/local_response_normalization_op.cc @@ -342,7 +342,7 @@ op = core.CreateOperator("LRN", order="NHWC" ) -workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/lp_pool_op.cc b/caffe2/operators/lp_pool_op.cc index f877786648350b..f39aaaa6397a3e 100644 --- a/caffe2/operators/lp_pool_op.cc +++ b/caffe2/operators/lp_pool_op.cc @@ -258,7 +258,7 @@ op = core.CreateOperator( p=2.0 ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/lpnorm_op.cc b/caffe2/operators/lpnorm_op.cc index 6af404d1153588..79c35cd83a2148 100644 --- a/caffe2/operators/lpnorm_op.cc +++ b/caffe2/operators/lpnorm_op.cc @@ -100,7 +100,7 @@ op = core.CreateOperator( X = np.array([5., 2.]) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/pool_op.cc b/caffe2/operators/pool_op.cc index eca7978e024aac..87d67b17e2b6ce 100644 --- a/caffe2/operators/pool_op.cc +++ b/caffe2/operators/pool_op.cc @@ -764,7 +764,7 @@ op = core.CreateOperator( stride=2, ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) @@ -832,7 +832,7 @@ op = core.CreateOperator( stride=2, ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/reduction_ops.cc b/caffe2/operators/reduction_ops.cc index 0d01d50ca000e3..95f15b56a720e9 100644 --- a/caffe2/operators/reduction_ops.cc +++ 
b/caffe2/operators/reduction_ops.cc @@ -139,17 +139,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X, simulating a batch of 2, 4x4 matricies +// Create X, simulating a batch of 2, 4x4 matricies X = np.random.randint(0,high=20,size=(2,4,4)) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -226,17 +226,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X, simulating a batch of 2, 4x4 matricies +// Create X, simulating a batch of 2, 4x4 matricies X = np.random.randint(0,high=20,size=(2,4,4)) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/relu_op.cc b/caffe2/operators/relu_op.cc index 03205241efc3e1..0f1abd82396156 100644 --- a/caffe2/operators/relu_op.cc +++ b/caffe2/operators/relu_op.cc @@ -105,7 +105,7 @@ op = core.CreateOperator( ["Y"] ) -workspace.FeedBlob("X", np.random.randn(4, 4).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(4, 4).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/sparse_to_dense_mask_op.cc b/caffe2/operators/sparse_to_dense_mask_op.cc index bea0b43d751ccf..d968112c9ecc2d 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.cc +++ b/caffe2/operators/sparse_to_dense_mask_op.cc @@ -48,8 +48,8 @@ vector and `values` tensor into a compacted tensor where the first dimension corresponds to each id provided in mask argument. Missing values are filled with the value of `default_value`. After running this op: - output[j, :] = values[i] # where mask[j] == indices[i] - output[j, ...] = default_value # when mask[j] doesn't appear in indices + output[j, :] = values[i] // where mask[j] == indices[i] + output[j, ...] = default_value // when mask[j] doesn't appear in indices If `lengths` is provided and not empty, and extra "batch" dimension is prepended to the output. diff --git a/caffe2/operators/sparse_to_dense_op.cc b/caffe2/operators/sparse_to_dense_op.cc index 4f6a49796df826..0c9519e6576122 100644 --- a/caffe2/operators/sparse_to_dense_op.cc +++ b/caffe2/operators/sparse_to_dense_op.cc @@ -23,7 +23,7 @@ representation. After running this op: - output[indices[i], :] += values[i] # sum over all indices[i] equal to the index + output[indices[i], :] += values[i] // sum over all indices[i] equal to the index output[j, ...] 
= 0 if j not in indices )DOC") .Input(0, "indices", "1-D int32/int64 tensor of concatenated ids of data") diff --git a/caffe2/operators/stats_ops.cc b/caffe2/operators/stats_ops.cc index 508dd1ae82060a..d07f9cace13636 100644 --- a/caffe2/operators/stats_ops.cc +++ b/caffe2/operators/stats_ops.cc @@ -290,7 +290,7 @@ timergetandend_op = core.CreateOperator( ["nanos"] ) -# Test TimerBegin/TimerGet/TimerEnd +// Test TimerBegin/TimerGet/TimerEnd workspace.RunOperatorOnce(timerbegin_op) print("timer:", workspace.FetchBlob("timer")) workspace.RunOperatorOnce(timerget_op) @@ -298,7 +298,7 @@ print("nanos:", workspace.FetchBlob("nanos")) workspace.RunOperatorOnce(timerend_op) -# Test TimerBegin/TimerGetAndEnd +// Test TimerBegin/TimerGetAndEnd workspace.RunOperatorOnce(timerbegin_op) print("timer:", workspace.FetchBlob("timer")) workspace.RunOperatorOnce(timergetandend_op) diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index cc7c037a6d332d..eb771974fbf397 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -103,17 +103,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([2,2,2,2,2,2,2,2,2,2]) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.int32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -508,14 +508,14 @@ op = core.CreateOperator( ["has_elements"], ) -# Use a not-empty tensor +// Use a not-empty tensor workspace.FeedBlob("tensor", np.random.randn(2, 2).astype(np.float32)) print("tensor:\n", workspace.FetchBlob("tensor")) workspace.RunOperatorOnce(op) print("has_elements: ", workspace.FetchBlob("has_elements"),"\n") -# Use an empty tensor +// Use an empty tensor workspace.FeedBlob("tensor", np.empty(0)) print("tensor:\n", workspace.FetchBlob("tensor")) diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 6a8d22253444a5..80e2308eabf3cd 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -322,14 +322,26 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_mappushInstructionNode(opNode); } - CAFFE_ENFORCE( - externalInputNames.size() == 0, - "Attempting to convert an ill-formed network: \ - external_input contains unused blobs"); + if (externalInputNames.size()) { + std::ostringstream os; + for (const auto& inputName : externalInputNames) { + os << "\"" << inputName << "\" "; + } + + CAFFE_ENFORCE( + externalInputNames.size() == 0, + "Attempting to convert an ill-formed network: external_input contains ", + externalInputNames.size(), + " unused blobs: ", + os.str()); + } for (const auto& outputName : net.external_output()) { CAFFE_ENFORCE( - blobMap.count(outputName), "NetDef has ill-formed external_output"); + blobMap.count(outputName), + "NetDef has ill-formed external_output: \"", + outputName, + "\""); module.outputs.insert(blobMap[outputName]); } diff --git a/caffe2/predictor/predictor_config.cc b/caffe2/predictor/predictor_config.cc index aabff0daffcd73..0ca120d0121da5 100644 --- a/caffe2/predictor/predictor_config.cc +++ b/caffe2/predictor/predictor_config.cc @@ -10,7 +10,7 @@ namespace { // We don't use the getNet() from predictor_utils.cc here because that file // has additional dependencies that we want to avoid bringing in, to keep the // binary size as small as possible. 
-const NetDef& getNet(const MetaNetDef& def, const std::string& name) { +static const NetDef& getNet(const MetaNetDef& def, const std::string& name) { for (const auto& n : def.nets()) { if (n.key() == name) { return n.value(); @@ -19,7 +19,7 @@ const NetDef& getNet(const MetaNetDef& def, const std::string& name) { CAFFE_THROW("Net not found: ", name); } -const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( +static const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( const MetaNetDef& def, const std::string& name) { for (const auto& b : def.blobs()) { @@ -30,26 +30,60 @@ const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( CAFFE_THROW("Blob not found: ", name); } +static std::string combine(const std::string& str, const std::string& name) { + if (name.empty()) { + return std::string(str); + } + return str + "_" + name; +} + +static std::string getNamedPredictNet(const string& name) { + return combine(PredictorConsts::default_instance().predict_net_type(), name); +} + +static std::string getNamedInitNet(const string& name) { + return combine( + PredictorConsts::default_instance().predict_init_net_type(), name); +} + +static std::string getNamedInputs(const string& name) { + return combine(PredictorConsts::default_instance().inputs_blob_type(), name); +} + +static std::string getNamedOutputs(const string& name) { + return combine(PredictorConsts::default_instance().outputs_blob_type(), name); +} + +static std::string getNamedParams(const string& name) { + return combine( + PredictorConsts::default_instance().parameters_blob_type(), name); +} + } // namespace -PredictorConfig -makePredictorConfig(const MetaNetDef& def, Workspace* parent, bool run_init) { - const auto& init_net = - getNet(def, PredictorConsts::default_instance().global_init_net_type()); - const auto& run_net = - getNet(def, PredictorConsts::default_instance().predict_net_type()); +PredictorConfig makePredictorConfig( + const MetaNetDef& def, + Workspace* parent, + bool run_init, + const std::string& net_name) { + const auto& init_net = getNet(def, getNamedInitNet(net_name)); + const auto& run_net = getNet(def, getNamedPredictNet(net_name)); auto config = makePredictorConfig(init_net, run_net, parent, run_init); - const auto& inputs = - getBlobs(def, PredictorConsts::default_instance().inputs_blob_type()); + const auto& inputs = getBlobs(def, getNamedInputs(net_name)); for (const auto& input : inputs) { config.input_names.emplace_back(input); } - const auto& outputs = - getBlobs(def, PredictorConsts::default_instance().outputs_blob_type()); + const auto& outputs = getBlobs(def, getNamedOutputs(net_name)); for (const auto& output : outputs) { config.output_names.emplace_back(output); } + + const auto& params = getBlobs(def, getNamedParams(net_name)); + for (const auto& param : params) { + config.parameter_names.emplace_back(param); + } + return config; } diff --git a/caffe2/predictor/predictor_config.h b/caffe2/predictor/predictor_config.h index eda1c9d03ca2ba..b1555addfa6f08 100644 --- a/caffe2/predictor/predictor_config.h +++ b/caffe2/predictor/predictor_config.h @@ -45,7 +45,8 @@ CAFFE2_API Workspace makeWorkspace(std::shared_ptr paramete CAFFE2_API PredictorConfig makePredictorConfig( const MetaNetDef& net, Workspace* parent = nullptr, - bool run_init = true); + bool run_init = true, + const std::string& net_name = ""); CAFFE2_API PredictorConfig makePredictorConfig( const NetDef& init_net, diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc 
index 40e4f720c61900..326265fc66d039 100644 --- a/caffe2/predictor/predictor_test.cc +++ b/caffe2/predictor/predictor_test.cc @@ -209,33 +209,4 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) { EXPECT_NEAR(output.front().data()[4], 0.1209, 1E-4); } -class PredictorMetaNetDefTest : public testing::Test { - public: - void SetUp() override { - DeviceOption op; - op.set_random_seed(1701); - ctx_ = caffe2::make_unique(op); - p_ = caffe2::make_unique( - makePredictorConfig(parseMetaNetDef(metaSpec))); - } - - std::unique_ptr ctx_; - std::unique_ptr p_; -}; - -TEST_F(PredictorMetaNetDefTest, SimpleMetaNetDefInitializer) { - auto inputData = randomTensor({1, 4}, ctx_.get()); - Predictor::TensorMap input; - auto iter = input.emplace("data", Tensor(CPU)); - auto tensor = inputData->GetMutableTensor(CPU); - iter.first->second.ResizeLike(*tensor); - iter.first->second.ShareData(*tensor); - Predictor::TensorList output; - (*p_)(input, &output); - EXPECT_EQ(output.size(), 1); - EXPECT_EQ(output.front().dims().size(), 2); - EXPECT_EQ(output.front().dim(0), 1); - EXPECT_EQ(output.front().dim(1), 10); - EXPECT_NEAR(output.front().data()[4], 0.1209, 1E-4); -} } // namespace caffe2 diff --git a/caffe2/predictor/predictor_utils.cc b/caffe2/predictor/predictor_utils.cc index 4af83d0bea8c25..f5acd4f936010b 100644 --- a/caffe2/predictor/predictor_utils.cc +++ b/caffe2/predictor/predictor_utils.cc @@ -1,4 +1,5 @@ #include "caffe2/predictor/predictor_utils.h" +#include "caffe2/predictor/predictor_config.h" #include "caffe2/core/blob.h" #include "caffe2/core/logging.h" @@ -6,6 +7,13 @@ #include "caffe2/proto/predictor_consts.pb.h" #include "caffe2/utils/proto_utils.h" +CAFFE2_DEFINE_bool( + caffe2_predictor_claim_tensor_memory, + true, + "If false, then predictor will not claim tensor memory" + "otherwise when tensor is shrinked to a size smaller than current size " + "by FLAGS_caffe2_max_keep_on_shrink_memory, the memory will be claimed."); + namespace caffe2 { namespace predictor_utils { @@ -79,4 +87,47 @@ std::unique_ptr runGlobalInitialization( } } // namespace predictor_utils + +void removeExternalBlobs( + const std::vector& input_blobs, + const std::vector& output_blobs, + Workspace* ws) { + for (const auto& blob : input_blobs) { + ws->RemoveBlob(blob); + } + for (const auto& blob : output_blobs) { + ws->RemoveBlob(blob); + } +} + +PredictorConfig makePredictorConfig( + const string& db_type, + const string& db_path) { + // TODO: Remove this flags once Predictor accept PredictorConfig as + // constructors. These comes are copied temporarly from the Predictor. + if (FLAGS_caffe2_predictor_claim_tensor_memory) { + if (FLAGS_caffe2_max_keep_on_shrink_memory == LLONG_MAX) { + FLAGS_caffe2_max_keep_on_shrink_memory = 8 * 1024 * 1024; + } + } + auto dbReader = + make_unique(db::CreateDB(db_type, db_path, db::READ)); + auto ws = std::make_shared(); + auto net_def = + predictor_utils::runGlobalInitialization(std::move(dbReader), ws.get()); + auto config = makePredictorConfig(*net_def, ws.get()); + config.ws = ws; + const auto& init_net = predictor_utils::getNet( + *net_def, PredictorConsts::default_instance().predict_init_net_type()); + CAFFE_ENFORCE(config.ws->RunNetOnce(init_net)); + config.ws->RemoveBlob( + PredictorConsts::default_instance().predictor_dbreader()); + // Input and output blobs should never be allocated in the master workspace + // since we'll end up with race-conditions due to these being shared among + // predictor threads / TL workspaces. 
Safely handle against globalInitNet + // creating them in the master. + removeExternalBlobs(config.input_names, config.output_names, config.ws.get()); + return config; +} + } // namespace caffe2 diff --git a/caffe2/predictor/predictor_utils.h b/caffe2/predictor/predictor_utils.h index 8c9cb4a5792d48..af7799b039c8b7 100644 --- a/caffe2/predictor/predictor_utils.h +++ b/caffe2/predictor/predictor_utils.h @@ -24,4 +24,14 @@ CAFFE2_API std::unique_ptr runGlobalInitialization( Workspace* master); } // namespace predictor_utils + +PredictorConfig makePredictorConfig( + const string& db_type, + const string& db_path); + +void removeExternalBlobs( + const std::vector& input_blobs, + const std::vector& output_blobs, + Workspace* ws); + } // namespace caffe2 diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 60e5c39bed1318..ae169eef2e6480 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -44,6 +44,7 @@ def Parallelize( param_update_builder_fun=None, optimizer_builder_fun=None, post_sync_builder_fun=None, + pre_grad_net_transformer_fun=None, net_transformer_fun=None, devices=None, rendezvous=None, @@ -91,6 +92,11 @@ def Parallelize( Signature: net_transformer_fun( model, num_devices, device_prefix, device_type) + pre_grad_net_transformer_fun: + Optional function to transform the network similar to + net_transformer_fun, but happens before gradient ops + been add. + Signature: pre_grad_net_transformer_fun(model) post_sync_builder_fun: Function applied after initial parameter sync has been completed, such as keeping multi-precision parameters @@ -234,6 +240,9 @@ def Parallelize( model_helper_obj._computed_param_names =\ list(viewkeys(computed_params_grouped)) + if pre_grad_net_transformer_fun: + pre_grad_net_transformer_fun(model_helper_obj) + if has_parameter_updates: log.info("Adding gradient operators") _AddGradientOperators(devices, model_helper_obj, losses_by_gpu) diff --git a/caffe2/python/ideep/operator_fallback_op_test.py b/caffe2/python/ideep/operator_fallback_op_test.py new file mode 100644 index 00000000000000..19bdbaac8a217e --- /dev/null +++ b/caffe2/python/ideep/operator_fallback_op_test.py @@ -0,0 +1,99 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import unittest +import hypothesis.strategies as st +from hypothesis import given +import numpy as np +from caffe2.python import core, workspace +from caffe2.proto import caffe2_pb2 +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.ideep_test_util as mu + + +@unittest.skipIf(not workspace.C.use_ideep, "No IDEEP support.") +class TestFallbackOps(hu.HypothesisTestCase): + @given(stride=st.integers(1, 3), + pad=st.integers(0, 3), + kernel=st.integers(3, 5), + size=st.integers(8, 10), + input_channels=st.integers(1, 3), + output_channels=st.integers(1, 5), + batch_size=st.integers(1, 3), + use_bias=st.booleans(), + **mu.gcs) + def test_in_place(self, stride, pad, kernel, size, + input_channels, output_channels, + batch_size, use_bias, gc, dc): + # To expose fallback in-place potential issue, the fallback op + # following ideep op must be run at least two iterations. 
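+        # The second run reuses the output tensor created inside the fallback
+        # op's local workspace; a wrongly forwarded in-place blob or a
+        # non-public-format buffer only becomes visible on that reuse.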
+ conv = core.CreateOperator( + "Conv", + ["X", "w", "b"] if use_bias else ["X", "w"], + ["Y"], + stride=stride, + pad=pad, + kernel=kernel, + device_option=dc[0] + ) + X = np.random.rand( + batch_size, input_channels, size, size).astype(np.float32) - 0.5 + w = np.random.rand(output_channels, input_channels, kernel, kernel) \ + .astype(np.float32) - 0.5 + b = np.random.rand(output_channels).astype(np.float32) - 0.5 + + old_ws_name = workspace.CurrentWorkspace() + workspace.SwitchWorkspace("_device_check_", True) + workspace.FeedBlob('X', X, dc[0]) + workspace.FeedBlob('w', w, dc[0]) + workspace.FeedBlob('b', b, dc[0]) + workspace.RunOperatorOnce(conv) + Y = workspace.FetchBlob('Y') + + scale = np.random.randn(Y.shape[1]).astype(np.float32) + bias = np.random.randn(Y.shape[1]).astype(np.float32) + ac = core.CreateOperator( + "AffineChannel", + ["Y", "scale", "bias"], + ["Y"], + is_learnable=False, + device_option=dc[0] + ) + workspace.FeedBlob('scale', scale, dc[0]) + workspace.FeedBlob('bias', bias, dc[0]) + workspace.RunOperatorOnce(ac) + workspace.RunOperatorOnce(conv) + workspace.RunOperatorOnce(ac) + Y0 = workspace.FetchBlob('Y') + + workspace.ResetWorkspace() + dev_net = caffe2_pb2.NetDef() + conv_dev = caffe2_pb2.OperatorDef() + conv_dev.CopyFrom(conv) + conv_dev.device_option.CopyFrom(dc[1]) + ac_dev = caffe2_pb2.OperatorDef() + ac_dev.CopyFrom(ac) + ac_dev.device_option.CopyFrom(dc[1]) + dev_net.op.extend([conv_dev, ac_dev]) + workspace.FeedBlob('X', X, dc[1]) + workspace.FeedBlob('w', w, dc[1]) + workspace.FeedBlob('b', b, dc[1]) + workspace.FeedBlob('scale', scale, dc[1]) + workspace.FeedBlob('bias', bias, dc[1]) + workspace.RunNetOnce(dev_net) + workspace.RunNetOnce(dev_net) + Y1 = workspace.FetchBlob('Y') + + if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): + print(Y1.flatten()) + print(Y0.flatten()) + print(np.max(np.abs(Y1 - Y0))) + self.assertTrue(False) + + workspace.SwitchWorkspace(old_ws_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 0e590307a88858..c20aad4218f17e 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -338,7 +338,7 @@ def sigmoid_ref(X): alpha=st.floats(min_value=-100.0, max_value=100.0), beta=st.floats(min_value=-100.0, max_value=100.0), engine=st.sampled_from([""]), - **hu.gcs_cpu_only) + **hu.gcs) def test_hard_sigmoid(self, X, inplace, alpha, beta, engine, gc, dc): # Prevent alpha and beta from mutually being 0 to avoid a division # error when adjusting our inputs diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index 668c812cd8e1a8..056558c9a73335 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -9,6 +9,7 @@ #include #include +#include "caffe2/ideep/operators/operator_fallback_ideep.h" #include namespace caffe2 { @@ -19,42 +20,42 @@ USE_IDEEP_DEF_ALIASES(); class IDeepFetcher; class IDeepFeeder; -REGISTER_BLOB_FETCHER((TypeMeta::Id()),IDeepFetcher); +REGISTER_IDEEP_OPERATOR(Python, IDEEPFallbackOp>); + +REGISTER_BLOB_FETCHER((TypeMeta::Id()), IDeepFetcher); REGISTER_BLOB_FEEDER(IDEEP, IDeepFeeder); class IDeepFetcher : public BlobFetcherBase { TypeMeta type_transform(const itensor &atensor) { - switch(atensor.get_data_type()) { - case itensor::data_type::f32: - return TypeMeta::Make(); - case itensor::data_type::s16: - return TypeMeta::Make(); - case 
itensor::data_type::s32: - return TypeMeta::Make(); - case itensor::data_type::s8: - return TypeMeta::Make(); - case itensor::data_type::u8: - return TypeMeta::Make(); - default: - // Should we throw exception? - return TypeMeta(); + switch (atensor.get_data_type()) { + case itensor::data_type::f32: + return TypeMeta::Make(); + case itensor::data_type::s32: + return TypeMeta::Make(); + case itensor::data_type::s8: + return TypeMeta::Make(); + case itensor::data_type::u8: + return TypeMeta::Make(); + default: + // Should we throw exception? + return TypeMeta(); } } - public: - pybind11::object Fetch(const Blob& blob) override { +public: + pybind11::object Fetch(const Blob &blob) override { try { return FetchTensor(blob.Get(), true).obj; - } catch (ideep::error& e) { - VLOG(1) << "IDEEP error: " << e.message; + } catch (ideep::error &e) { + LOG(ERROR) << "IDEEP error: " << e.message; throw; } } - FetchedBlob FetchTensor(const itensor& atensor, bool force_copy) { + FetchedBlob FetchTensor(const itensor &atensor, bool force_copy) { FetchedBlob result; CAFFE_ENFORCE(atensor.materialized(), - "Trying to fetch uninitialized tensor"); + "Trying to fetch uninitialized tensor"); const int numpy_type = CaffeToNumpyType(type_transform(atensor)); CAFFE_ENFORCE( numpy_type != -1, @@ -64,17 +65,16 @@ class IDeepFetcher : public BlobFetcherBase { std::vector npy_dims(dims.begin(), dims.end()); result.copied = force_copy || atensor.need_reorder(); - void* outPtr; + void *outPtr; if (result.copied) { result.obj = py::reinterpret_steal( PyArray_SimpleNew(atensor.ndims(), npy_dims.data(), numpy_type)); outPtr = static_cast( - PyArray_DATA(reinterpret_cast(result.obj.ptr()))); + PyArray_DATA(reinterpret_cast(result.obj.ptr()))); } else { outPtr = atensor.get_data_handle(); - result.obj = py::reinterpret_steal( - PyArray_SimpleNewFromData( - atensor.ndims(), npy_dims.data(), numpy_type, outPtr)); + result.obj = py::reinterpret_steal(PyArray_SimpleNewFromData( + atensor.ndims(), npy_dims.data(), numpy_type, outPtr)); } if (numpy_type == NPY_OBJECT) { @@ -95,8 +95,6 @@ class IDeepFeeder : public BlobFeederBase { return itensor::data_type::f32; else if (meta == TypeMeta::Make()) return itensor::data_type::s32; - else if (meta == TypeMeta::Make()) - return itensor::data_type::s16; else if (meta == TypeMeta::Make()) return itensor::data_type::s8; else if (meta == TypeMeta::Make()) @@ -105,53 +103,74 @@ class IDeepFeeder : public BlobFeederBase { return itensor::data_type::data_undef; } - public: - void FeedTensor( - const DeviceOption& option, - PyArrayObject *original_array, - itensor *tensor) { - PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); - auto g = MakeGuard([&]() {Py_XDECREF(array); }); - - const auto npy_type = PyArray_TYPE(array); - const TypeMeta& meta = NumpyTypeToCaffe(npy_type); - CAFFE_ENFORCE( - meta.id() != TypeIdentifier::uninitialized(), +public: + void FeedTensor( + const DeviceOption &option, + PyArrayObject *original_array, + itensor *tensor) { + PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); + auto g = MakeGuard([&]() { Py_XDECREF(array); }); + const auto npy_type = PyArray_TYPE(array); + const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + CAFFE_ENFORCE_NE( + meta.id(), + TypeIdentifier::uninitialized(), "This numpy data type is not supported: ", - PyArray_TYPE(array), - "."); + PyArray_TYPE(array), "."); - int ndim = PyArray_NDIM(array); - npy_intp* npy_dims = PyArray_DIMS(array); + int ndim = PyArray_NDIM(array); + npy_intp *npy_dims = PyArray_DIMS(array); - 
itensor::dims adims; - for (int i = 0; i < ndim; i++) { - adims.push_back(static_cast( - npy_dims[i])); - } + itensor::dims adims; + for (int i = 0; i < ndim; i++) { + adims.push_back(static_cast(npy_dims[i])); + } - switch (npy_type) { + switch (npy_type) { case NPY_OBJECT: case NPY_UNICODE: CAFFE_THROW("IDeep doesn't support string"); break; default: auto type = type_transform(meta); - tensor->resize(adims, type); + if (tensor->get_dims() != adims || type != tensor->get_data_type()) { + tensor->resize(adims, type); + } tensor->reorder_from(adims, type, - static_cast(PyArray_DATA(array))); - } - } + static_cast(PyArray_DATA(array))); + } + } - void Feed(const DeviceOption& option, PyArrayObject* original_array, - Blob* blob) { - try { + bool ZeroDim(PyArrayObject *array) { + int ndim = PyArray_NDIM(array); + npy_intp *npy_dims = PyArray_DIMS(array); + return ndim == 0 || + std::find(npy_dims, npy_dims + ndim, 0) != npy_dims + ndim; + } + + void Feed(const DeviceOption &option, PyArrayObject *original_array, + Blob *blob) { + try { + PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); + auto g = MakeGuard([&]() { Py_XDECREF(array); }); + + const auto npy_type = PyArray_TYPE(array); + const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + // TODO: if necessary, use dispatcher. + if (meta.Match() && !ZeroDim(original_array)) { FeedTensor(option, original_array, blob->GetMutable()); - } catch (ideep::error& e) { - VLOG(1) << "IDEEP error: " << e.message; - throw; + } else { + DeviceOption cpu_option(option); + cpu_option.set_device_type(DeviceType::CPU); + TensorFeeder cpu_tensor_feeder; + cpu_tensor_feeder.FeedTensor(cpu_option, original_array, + blob->GetMutableTensor(CPU)); } - } + } catch (ideep::error &e) { + LOG(ERROR) << "IDEEP error: " << e.message; + throw; + } + } }; } // namespace python diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 5aafdf63c3b28a..9cfe7089332a18 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -228,7 +228,7 @@ def num_registered_tasks(self): def used_nodes(self): # use list to keep order used = [] - for task in self.tasks(): + for task in self._tasks + self._tasks_to_add: if task.node not in used: used.append(task.node) return used diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 2437933ae624eb..1a579b519fe09c 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -391,3 +391,17 @@ def test_transformer_FuseConv3DBN( rtol=1e-02, atol=1e-04 ) + + def test_converterEnforceUnusedInputs(self): + net = core.Net("net") + net.Relu(["X"], ["Y"]) + net.Proto().external_input.extend(["fake"]) + with self.assertRaises(Exception): + transformer.AddNNPACK(net) # just testing the converter + + def test_converterEnforceUnusedOutputs(self): + net = core.Net("net") + net.Relu(["X"], ["Y"]) + net.Proto().external_output.extend(["fake"]) + with self.assertRaises(Exception): + transformer.AddNNPACK(net) # just testing the converter diff --git a/caffe2/utils/smart_tensor_printer_test.cc b/caffe2/utils/smart_tensor_printer_test.cc index 82a59ad60aa950..e207f7c7b05284 100644 --- a/caffe2/utils/smart_tensor_printer_test.cc +++ b/caffe2/utils/smart_tensor_printer_test.cc @@ -39,6 +39,9 @@ void printTensorAndCheck(const std::vector& values) { expect_stderr_contains(values); } +// We need real glog for this test to pass +#ifdef CAFFE2_USE_GOOGLE_GLOG + #if !(__APPLE__) // TODO(janusz): thread_local does not work under mac. 
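The rewritten IDeepFeeder::Feed above routes a numpy array to an ideep tensor only when its dtype matches and it has no zero-sized dimension; anything else falls back to a plain CPU tensor feed. A rough Python paraphrase of that dispatch decision, assuming the dtype gate is float32 (the C++ template arguments are not visible in this rendering), is:

    import numpy as np

    def feeds_as_ideep_tensor(array):
        # Mirrors the ZeroDim() helper: scalar arrays and arrays with a
        # zero-sized dimension take the CPU fallback path.
        zero_dim = array.ndim == 0 or 0 in array.shape
        return array.dtype == np.float32 and not zero_dim

    assert feeds_as_ideep_tensor(np.ones((2, 3), dtype=np.float32))
    assert not feeds_as_ideep_tensor(np.array(1.0, dtype=np.float32))  # zero-dim
    assert not feeds_as_ideep_tensor(np.ones((2, 3), dtype=np.int64))  # wrong dtype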
TEST(SmartTensorPrinterTest, SimpleTest) { @@ -48,4 +51,6 @@ TEST(SmartTensorPrinterTest, SimpleTest) { #endif // !(__APPLE__) +#endif // CAFFE2_USE_GOOGLE_GLOG + } // namespace caffe2 diff --git a/docs/source/torch.rst b/docs/source/torch.rst index fa2f92092758a4..d385ff07d323d5 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -169,6 +169,7 @@ Pointwise Ops .. autofunction:: cos .. autofunction:: cosh .. autofunction:: div +.. autofunction:: digamma .. autofunction:: erf .. autofunction:: erfc .. autofunction:: erfinv diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt index f18077b829427b..1791ca27a98590 100644 --- a/modules/detectron/CMakeLists.txt +++ b/modules/detectron/CMakeLists.txt @@ -11,4 +11,8 @@ if (USE_CUDA) target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu) install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) +elseif(NOT IOS_PLATFORM) + add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) + target_link_libraries(caffe2_detectron_ops caffe2) + install(TARGETS caffe2_detectron_ops DESTINATION lib) endif() diff --git a/modules/detectron/batch_permutation_op.cc b/modules/detectron/batch_permutation_op.cc index f92d7dd236d758..032288f811de08 100644 --- a/modules/detectron/batch_permutation_op.cc +++ b/modules/detectron/batch_permutation_op.cc @@ -15,9 +15,19 @@ */ #include "batch_permutation_op.h" +#ifdef CAFFE2_USE_IDEEP +#include +#include +#endif namespace caffe2 { +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR( + BatchPermutation, + IDEEPFallbackOp>); +#endif + REGISTER_CPU_OPERATOR(BatchPermutation, BatchPermutationOp); REGISTER_CPU_OPERATOR( BatchPermutationGradient, diff --git a/modules/detectron/upsample_nearest_op.cc b/modules/detectron/upsample_nearest_op.cc index b668701b4ce4f4..4fc4d6dcd93a31 100644 --- a/modules/detectron/upsample_nearest_op.cc +++ b/modules/detectron/upsample_nearest_op.cc @@ -15,8 +15,17 @@ */ #include "upsample_nearest_op.h" +#ifdef CAFFE2_USE_IDEEP +#include "caffe2/ideep/operators/operator_fallback_ideep.h" +#include "caffe2/ideep/utils/ideep_operator.h" +#endif namespace caffe2 { +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR( + UpsampleNearest, + IDEEPFallbackOp>); +#endif REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp); REGISTER_CPU_OPERATOR( diff --git a/modules/detectron/upsample_nearest_op.h b/modules/detectron/upsample_nearest_op.h index e24d705bc14afd..17f77855509e67 100644 --- a/modules/detectron/upsample_nearest_op.h +++ b/modules/detectron/upsample_nearest_op.h @@ -35,8 +35,50 @@ class UpsampleNearestOp final : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; + auto translate_idx = [](int ii, int d1, int d2, int d3, int scale_factor) { + int x, y, z, w; + w = ii % d3; + ii = ii/d3; + z = ii % d2; + ii = ii/d2; + y = ii % d1; + ii = ii/d1; + x = ii; + w = w/scale_factor; + z = z/scale_factor; + d2 /= scale_factor; + d3 /= scale_factor; + return (((x*d1+y)*d2)+z)*d3+w; + }; + + auto& X = Input(0); + auto* Y = Output(0); + auto out_shape = X.dims(); + out_shape[X.ndim() - 1] *= scale_; + out_shape[X.ndim() - 2] *= scale_; + Y->Resize(out_shape); + + int d1; + int d2; + int d3; + if (X.ndim() == 3) { + d1 = Y->dim32(0); + d2 = Y->dim32(1); + d3 = Y->dim32(2); + } else { + d1 = Y->dim32(1); + d2 = Y->dim32(2); + d3 = Y->dim32(3); + } + + const T *input_data = X.template data(); + T *output_data = Y->template mutable_data(); + + for (int ii = 0; ii < Y->size(); ii++) { 
+ int ipidx = translate_idx(ii, d1, d2, d3, scale_); + output_data[ii] = input_data[ipidx]; + } + return true; } protected: diff --git a/setup.py b/setup.py index ac82f2a7960a1b..e99283d54ffdd9 100644 --- a/setup.py +++ b/setup.py @@ -54,6 +54,8 @@ # TORCH_CUDA_ARCH_LIST # specify which CUDA architectures to build for. # ie `TORCH_CUDA_ARCH_LIST="6.0;7.0"` +# These are not CUDA versions, instead, they specify what +# classes of NVIDIA hardware we should generate PTX for. # # ONNX_NAMESPACE # specify a namespace for ONNX built here rather than the hard-coded diff --git a/test/common.py b/test/common.py index 1c86bcd7fe24b8..e7d6940ea56cc1 100644 --- a/test/common.py +++ b/test/common.py @@ -17,6 +17,7 @@ import warnings import random import contextlib +import socket from functools import wraps from itertools import product from copy import deepcopy @@ -111,12 +112,10 @@ def wrapper(*args, **kwargs): def skipIfNoLapack(fn): @wraps(fn) def wrapper(*args, **kwargs): - try: + if not torch._C.has_lapack: + raise unittest.SkipTest('PyTorch compiled without Lapack') + else: fn(*args, **kwargs) - except Exception as e: - if 'Lapack library not found' in repr(e): - raise unittest.SkipTest('Compiled without Lapack') - raise return wrapper @@ -550,3 +549,12 @@ def download_file(url, binary=True): msg = "could not download test file '{}'".format(url) warnings.warn(msg, RuntimeWarning) raise unittest.SkipTest(msg) + + +def find_free_port(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(('localhost', 0)) + sockname = sock.getsockname() + sock.close() + return sockname[1] diff --git a/test/expect/TestJit.test_constant_prop_loop_constant.expect b/test/expect/TestJit.test_constant_prop_loop_constant.expect new file mode 100644 index 00000000000000..5bdca2f2c47890 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_loop_constant.expect @@ -0,0 +1,20 @@ +graph() { + %b.1 : int = prim::Constant[value=0]() + %1 : int = prim::Constant[value=2147483647]() + %2 : int = prim::Constant[value=1]() + %b.3 : int = prim::Loop(%1, %2, %b.1) + block0(%4 : int, %5 : int) { + %b.2 : int = prim::Constant[value=1]() + %7 : int = prim::Constant[value=1]() + -> (%7, %b.2) + } + %8 : int = prim::Constant[value=2147483647]() + %9 : int = prim::Constant[value=0]() + %b : int = prim::Loop(%8, %9, %b.3) + block0(%11 : int, %12 : int) { + %b.4 : int = prim::Constant[value=2]() + %14 : int = prim::Constant[value=0]() + -> (%14, %b.4) + } + return (%b); +} diff --git a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect index 6a9a3a571967a2..078091d52268e2 100644 --- a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect @@ -1,8 +1,14 @@ graph(%x : Dynamic) { - %1 : Double(4, 3) = prim::Constant[value=]() - %2 : Double(3, 3) = aten::mm(%x, %1) - %3 : int = prim::Constant[value=1]() - %4 : int = prim::Constant[value=1]() - %5 : Dynamic = aten::add(%2, %3, %4) - return (%5); + %1 : int = prim::Constant[value=4]() + %2 : int = prim::Constant[value=3]() + %3 : int[] = prim::ListConstruct(%1, %2) + %4 : int = prim::Constant[value=7]() + %5 : int = prim::Constant[value=0]() + %6 : int[] = prim::Constant[value=[0, -1]]() + %7 : Double(4, 3) = aten::zeros(%3, %4, %5, %6) + %8 : Double(3, 3) = aten::mm(%x, %7) + %9 : int = prim::Constant[value=1]() + %10 : int = prim::Constant[value=1]() + 
%11 : Dynamic = aten::add(%8, %9, %10) + return (%11); } diff --git a/test/expect/TestScript.test_onnx_export_speculate-f1.expect b/test/expect/TestScript.test_onnx_export_speculate-f1.expect index 47f55eb41ccdaa..4e8e51552ea4ac 100644 --- a/test/expect/TestScript.test_onnx_export_speculate-f1.expect +++ b/test/expect/TestScript.test_onnx_export_speculate-f1.expect @@ -6,27 +6,28 @@ ModelProto { GraphProto { name: "torch-jit-export" inputs: [{name: "x.1", type:Tensor dims: 1 10}] - outputs: [{name: "6", type:Tensor dims: 10 1}] + outputs: [{name: "8", type:Tensor dims: 10 1}] initializers: [] nodes: [ Node {type: "Add", inputs: [x.1,x.1], outputs: [1], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "Transpose", inputs: [1], outputs: [3], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, - Node {type: "Transpose", inputs: [1], outputs: [4], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "ReduceSum", inputs: [1], outputs: [2], attributes: [{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Greater", inputs: [2,3], outputs: [4], attributes: []}, Node {type: "Transpose", inputs: [1], outputs: [5], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, - Node {type: "If", inputs: [2], outputs: [6], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "Transpose", inputs: [1], outputs: [6], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "Transpose", inputs: [1], outputs: [7], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "If", inputs: [4], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [] - outputs: [{name: "8", type:Tensor dims: }] + outputs: [{name: "9", type:Tensor dims: }] initializers: [] nodes: [ - Node {type: "Constant", inputs: [], outputs: [7], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [7], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "If", inputs: [4], outputs: [9], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export2" inputs: [] - outputs: [{name: "3", type:Tensor dims: }] + outputs: [{name: "5", type:Tensor dims: }] initializers: [] nodes: [ @@ -37,7 +38,7 @@ ModelProto { GraphProto { name: "torch-jit-export3" inputs: [] - outputs: [{name: "4", type:Tensor dims: }] + outputs: [{name: "6", type:Tensor dims: }] initializers: [] nodes: [ @@ -52,7 +53,7 @@ ModelProto { GraphProto { name: "torch-jit-export4" inputs: [] - outputs: [{name: "5", type:Tensor dims: }] + outputs: [{name: "7", type:Tensor dims: }] initializers: [] nodes: [ diff --git a/test/expect/TestScript.test_onnx_export_speculate-f2.expect b/test/expect/TestScript.test_onnx_export_speculate-f2.expect index e7d04f54309b05..2820ce5f639ecb 100644 --- a/test/expect/TestScript.test_onnx_export_speculate-f2.expect +++ b/test/expect/TestScript.test_onnx_export_speculate-f2.expect @@ -6,27 +6,28 @@ ModelProto { GraphProto { name: "torch-jit-export" inputs: [{name: "x.1", type:Tensor dims: 1 10},{name: "1", type:Tensor dims: 20 10},{name: "2", type:Tensor dims: 20}] - outputs: [{name: "5", type:Tensor dims: 1 20}] + outputs: [{name: "7", type:Tensor dims: 1 20}] 
initializers: [TensorProto shape: [20 10],TensorProto shape: [20]] nodes: [ Node {type: "Add", inputs: [x.1,x.1], outputs: [3], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [4], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [4], outputs: [5], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "ReduceSum", inputs: [3], outputs: [4], attributes: [{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [5], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Greater", inputs: [4,5], outputs: [6], attributes: []}, + Node {type: "If", inputs: [6], outputs: [7], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [] - outputs: [{name: "7", type:Tensor dims: 1 20}] + outputs: [{name: "8", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [6], outputs: [7], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "If", inputs: [6], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export2" inputs: [] - outputs: [{name: "8", type:Tensor dims: 1 20}] + outputs: [{name: "9", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [8], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [9], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } @@ -34,10 +35,10 @@ ModelProto { GraphProto { name: "torch-jit-export3" inputs: [] - outputs: [{name: "9", type:Tensor dims: 1 20}] + outputs: [{name: "10", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [9], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } @@ -49,10 +50,10 @@ ModelProto { GraphProto { name: "torch-jit-export4" inputs: [] - outputs: [{name: "10", type:Tensor dims: 1 20}] + outputs: [{name: "11", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [11], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } diff --git a/test/onnx/expect/TestOperators.test_unsqueeze.expect b/test/onnx/expect/TestOperators.test_unsqueeze.expect new file mode 100644 index 00000000000000..3a8e01092f8d0b --- /dev/null +++ b/test/onnx/expect/TestOperators.test_unsqueeze.expect @@ -0,0 +1,54 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.4" +graph { + node { + input: "0" + output: "1" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 2 + type: INTS + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type 
{ + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "1" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + dim { + dim_value: 1 + } + } + } + } + } +} +opset_import { + version: 7 +} diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index f476cde7afd935..d8e0b6be0d94a9 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -428,6 +428,10 @@ def test_upsample(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(lambda x: nn.functional.interpolate(x, scale_factor=2., mode='bilinear'), x) + def test_unsqueeze(self): + x = Variable(torch.randn(3, 4), requires_grad=True) + self.assertONNX(lambda x: x.unsqueeze(len(x.shape)), x) + def test_symbolic_override(self): """Lifted from fast-neural-style: custom implementation of instance norm to be mapped to ONNX operator""" diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 9b31d02d6e385d..349e7fc1eec375 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -798,6 +798,18 @@ def test_convtranspose(self): model = nn.ConvTranspose2d(3, 3, 3, stride=3, bias=False, padding=1, output_padding=2) self.run_model_test(model, train=False, batch_size=BATCH_SIZE, atol=1e-7) + def test_unsqueeze(self): + shape = (3, 4, 5) + for dim in range(len(shape) + 1): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.unsqueeze(dim) + x = Variable(torch.randn(*shape)) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7) + # NB: InstanceNorm model includes unused weights, so skip this in TestCaffe2BackendEmbed # TODO: We should have another pass to eliminate the unused initializers in ONNX models. 
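The new unsqueeze tests above export x.unsqueeze(len(x.shape)), i.e. appending a trailing dimension, which the expect file records as an Unsqueeze node with axes ints: 2 for a (3, 4) input. A quick standalone sanity check of that tensor-level behavior (plain PyTorch, no ONNX export involved):

    import torch

    x = torch.randn(3, 4)
    y = x.unsqueeze(len(x.shape))   # append a trailing dim -> ONNX Unsqueeze axes=[2]
    assert y.shape == (3, 4, 1)
    assert torch.equal(y.squeeze(2), x)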
@skipIfEmbed diff --git a/test/run_test.py b/test/run_test.py index 3979ba0f2d15e6..71b96e78bc91b5 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -14,6 +14,7 @@ import torch from torch.utils import cpp_extension from common import TEST_WITH_ROCM +import torch.distributed.c10d as c10d TESTS = [ 'autograd', @@ -31,12 +32,14 @@ 'nn', 'optim', 'sparse', + 'thd_distributed', 'torch', 'utils', ] WINDOWS_BLACKLIST = [ 'distributed', + 'thd_distributed', ] ROCM_BLACKLIST = [ @@ -46,10 +49,29 @@ 'distributions', 'multiprocessing', 'nccl', + 'thd_distributed', 'utils', ] DISTRIBUTED_TESTS_CONFIG = { + 'gloo': { + 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' + }, +} + + +if c10d.is_available(): + if c10d.is_mpi_available(): + DISTRIBUTED_TESTS_CONFIG['mpi'] = { + 'WORLD_SIZE': '3' + } + if c10d.is_nccl_available(): + DISTRIBUTED_TESTS_CONFIG['nccl'] = { + 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' + } + + +THD_DISTRIBUTED_TESTS_CONFIG = { 'tcp': { 'WORLD_SIZE': '3' }, @@ -122,7 +144,10 @@ def test_distributed(python, test_module, test_directory, options): if options.verbose and not mpi_available: print_to_stderr( 'MPI not available -- MPI backend tests will be skipped') - for backend, env_vars in DISTRIBUTED_TESTS_CONFIG.items(): + config = DISTRIBUTED_TESTS_CONFIG + if test_module == "test_thd_distributed": + config = THD_DISTRIBUTED_TESTS_CONFIG + for backend, env_vars in config.items(): if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: @@ -137,7 +162,10 @@ def test_distributed(python, test_module, test_directory, options): os.environ['INIT_METHOD'] = 'env://' os.environ.update(env_vars) if with_init_file: - init_method = 'file://{}/shared_init_file'.format(tmp_dir) + if test_module == "test_distributed": + init_method = 'file://{}/'.format(tmp_dir) + else: + init_method = 'file://{}/shared_init_file'.format(tmp_dir) os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) @@ -166,6 +194,7 @@ def test_distributed(python, test_module, test_directory, options): CUSTOM_HANDLERS = { 'cpp_extensions': test_cpp_extensions, 'distributed': test_distributed, + 'thd_distributed': test_distributed, } diff --git a/test/test_c10d.py b/test/test_c10d.py index c448eba1349972..13f7b779d04736 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -1,7 +1,6 @@ import copy import math import multiprocessing -import socket import sys import tempfile import unittest @@ -10,6 +9,7 @@ from collections import namedtuple import torch +import common from torch import nn import torch.nn.functional as F from torch.distributed import c10d @@ -60,15 +60,6 @@ def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) -def find_free_port(): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - sock.bind(('localhost', 0)) - sockname = sock.getsockname() - sock.close() - return sockname[1] - - def gpus_for_rank(world_size): """Multigpu tests are designed to simulate the multi nodes with multi GPUs on each node. Nccl backend requires equal #GPUs in each process. 
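The gpus_for_rank docstring above describes splitting the visible GPUs evenly across ranks so each process simulates one multi-GPU node. A self-contained sketch of that partitioning, assuming (as the NCCL backend requires) that the world size divides the GPU count evenly; this is an illustration, not the helper from test_c10d.py itself:

    def gpus_for_rank_sketch(n_gpus, world_size):
        # Each rank gets a contiguous, equally sized slice of the device ids.
        per_rank = n_gpus // world_size
        return [list(range(r * per_rank, (r + 1) * per_rank))
                for r in range(world_size)]

    assert gpus_for_rank_sketch(4, 2) == [[0, 1], [2, 3]]
    assert gpus_for_rank_sketch(8, 4) == [[0, 1], [2, 3], [4, 5], [6, 7]]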
@@ -126,14 +117,14 @@ def _create_store(self): class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): addr = 'localhost' - port = find_free_port() + port = common.find_free_port() return c10d.TCPStore(addr, port, True) class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): addr = 'localhost' - port = find_free_port() + port = common.find_free_port() self.tcpstore = c10d.TCPStore(addr, port, True) self.prefix = "test_prefix" @@ -150,10 +141,10 @@ def test_unknown_handler(self): class RendezvousFileTest(TestCase): def test_common_errors(self): with self.assertRaisesRegex(ValueError, 'path missing'): - gen = c10d.rendezvous('file://?rank=0&size=1') + gen = c10d.rendezvous('file://?rank=0&world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'rank parameter missing'): - gen = c10d.rendezvous('file:///tmp/foo?size=1') + gen = c10d.rendezvous('file:///tmp/foo?world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'size parameter missing'): gen = c10d.rendezvous('file:///tmp/foo?rank=0') @@ -161,7 +152,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile() as file: - url = 'file://%s?size=%d' % (file.name, 2) + url = 'file://%s?world_size=%d' % (file.name, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -183,10 +174,10 @@ def test_nominal(self): class RendezvousTCPTest(TestCase): def test_common_errors(self): with self.assertRaisesRegex(ValueError, 'port number missing'): - gen = c10d.rendezvous('tcp://127.0.0.1?rank=0&size=1') + gen = c10d.rendezvous('tcp://127.0.0.1?rank=0&world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'rank parameter missing'): - gen = c10d.rendezvous('tcp://127.0.0.1:23456?size=1') + gen = c10d.rendezvous('tcp://127.0.0.1:23456?world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'size parameter missing'): gen = c10d.rendezvous('tcp://127.0.0.1:23456?rank=0') @@ -194,8 +185,8 @@ def test_common_errors(self): def test_nominal(self): addr = 'localhost' - port = find_free_port() - url = 'tcp://%s:%d?size=%d' % (addr, port, 2) + port = common.find_free_port() + url = 'tcp://%s:%d?world_size=%d' % (addr, port, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -245,7 +236,7 @@ def setUpClass(cls): def setUp(self): self.rank = self.MAIN_PROCESS_RANK self.file = tempfile.NamedTemporaryFile() - self.port = find_free_port() + self.port = common.find_free_port() self.processes = [self._spawn_process(rank) for rank in range(int(self.world_size))] def tearDown(self): @@ -529,8 +520,9 @@ def _test_ddp_with_process_group(self, process_group): model = Net() ddp_model = distributed_c10d._DistributedDataParallelC10d( copy.deepcopy(model).cuda(gpus[0]), - process_group, - device_ids=gpus) + device_ids=gpus, + process_group=process_group) + model.cuda(gpus[0]) local_batch_size = len(gpus) diff --git a/test/test_cuda.py b/test/test_cuda.py index 7c70aa2591f3a3..03a2ff5af641fe 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1823,17 +1823,7 @@ def test(use_double=False): @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") def test_symeig(self): - # Small case - tensor = torch.randn(3, 3).cuda() - tensor = torch.mm(tensor, tensor.t()) - eigval, eigvec = torch.symeig(tensor, eigenvectors=True) - self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t())) - - # Large case - tensor = torch.randn(257, 257).cuda() - tensor = 
torch.mm(tensor, tensor.t()) - eigval, eigvec = torch.symeig(tensor, eigenvectors=True) - self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t())) + TestTorch._test_symeig(self, lambda t: t.cuda()) def test_arange(self): for t in ['IntTensor', 'LongTensor', 'FloatTensor', 'DoubleTensor']: diff --git a/test/test_distributed.py b/test/test_distributed.py index 47dbe9d056f154..38a32d69ef7c64 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -5,29 +5,32 @@ import os import sys import time +import tempfile import unittest from contextlib import contextmanager from functools import reduce, wraps import torch import torch.cuda -import torch.distributed as dist +import torch.distributed.c10d as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from common import TestCase from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR from torch.autograd import Variable - +import common BACKEND = os.environ["BACKEND"] TEMP_DIR = os.environ["TEMP_DIR"] INIT_METHOD = os.getenv("INIT_METHOD", "env://") -MASTER_PORT = "29500" DEFAULT_TIMEOUT = 300 CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500} +if INIT_METHOD.startswith("file://"): + FOLDER = INIT_METHOD[7:] + def get_timeout(test_id): test_name = test_id.split(".")[-1] @@ -361,8 +364,9 @@ def test_broadcast_cuda(self): rank_to_GPU = self._init_multigpu_helper() self._test_broadcast_helper(group, group_id, rank, True, rank_to_GPU) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_broadcast_group(self): group, group_id, rank = self._init_group_test() self._test_broadcast_helper(group, group_id, rank) @@ -454,7 +458,8 @@ def test_reduce_max(self): self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_sum(self): group, group_id, rank = self._init_group_test() @@ -469,7 +474,8 @@ def test_reduce_group_sum(self): ) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_product(self): group, group_id, rank = self._init_group_test() @@ -484,14 +490,16 @@ def test_reduce_group_product(self): ) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_min(self): group, group_id, rank = self._init_group_test() self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU 
tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_max(self): group, group_id, rank = self._init_group_test() @@ -540,8 +548,8 @@ def test_all_reduce_sum(self): ) @unittest.skipIf( - BACKEND != "gloo" and BACKEND != "nccl", - "Only Gloo & Nccl backend support CUDA allReduce", + BACKEND != "gloo", + "Only Gloo backend will have CUDA allReduce tested", ) @skip_if_no_cuda_distributed @skip_if_no_gpu @@ -587,8 +595,9 @@ def test_all_reduce_max(self): group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_sum(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -601,8 +610,9 @@ def test_all_reduce_group_sum(self): 2 + (10 * (len(group) - 1)), ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_product(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -615,16 +625,18 @@ def test_all_reduce_group_product(self): reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_min(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_max(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -652,6 +664,7 @@ def test_scatter(self): @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_scatter_group(self): group, group_id, rank = self._init_group_test() @@ -679,7 +692,8 @@ def test_gather(self): self._test_gather_helper(group, group_id, rank) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_gather_group(self): group, group_id, rank = self._init_group_test() @@ -703,12 +717,13 @@ def _test_all_gather_helper( self._barrier() - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports CPU all gather") def test_all_gather(self): group, group_id, rank = self._init_global_test() self._test_all_gather_helper(group, group_id, rank) @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA all gather") + @unittest.skipIf(BACKEND == "nccl", "CUDA 
all gather skipped for NCCL") @skip_if_no_cuda_distributed @skip_if_no_gpu def test_all_gather_cuda(self): @@ -716,8 +731,10 @@ def test_all_gather_cuda(self): rank_to_GPU = self._init_multigpu_helper() self._test_all_gather_helper(group, group_id, rank, True, rank_to_GPU) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_gather_group(self): group, group_id, rank = self._init_group_test() self._test_all_gather_helper(group, group_id, rank) @@ -740,13 +757,14 @@ def _test_barrier_helper(self, group, group_id, rank): self._barrier() - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") def test_barrier(self): group, group_id, rank = self._init_global_test() self._test_barrier_helper(group, group_id, rank) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_barrier_group(self): group, group_id, rank = self._init_group_test() self._test_barrier_helper(group, group_id, rank) @@ -765,7 +783,8 @@ def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): self.assertEqual(tensor, expected_tensor) self._barrier() - @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports broadcast multigpu") + @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") + @unittest.skipIf(BACKEND == "nccl", "NCCL broadcast multigpu skipped") @skip_if_no_gpu def test_broadcast_multigpu(self): group, group_id, rank = self._init_global_test() @@ -802,7 +821,8 @@ def _test_all_reduce_multigpu_helper( self._barrier() - @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allreduce multigpu") + @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") + @unittest.skipIf(BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL") @skip_if_no_gpu def test_all_reduce_multigpu(self): group, group_id, rank = self._init_global_test() @@ -985,7 +1005,7 @@ def test_DistributedDataParallel(self): # DDP training setup model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) - model_DDP = nn.parallel.DistributedDataParallel( + model_DDP = nn.parallel._DistributedDataParallelC10d( model_DDP, device_ids=gpu_subset ) @@ -1006,33 +1026,8 @@ def test_DistributedDataParallel(self): ) self._barrier() - @unittest.skipIf( - BACKEND == "nccl", "nccl does not support DistributedDataParallelCPU" - ) - def test_DistributedDataParallelCPU(self): - # Run a simple end to end DDP-CPU model, use result of single node - # model as baseline - group, group_id, rank = self._init_global_test() - - # cpu training setup - model_base = self._create_Net() - - # DDP-CPU training setup - model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) - - # dummy data initialization - local_bs = 2 - global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) - # check two model parameters over 2 iterations - self._test_DDP_2iter( - model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs - ) - self._barrier() - - -if BACKEND == "tcp" or BACKEND == "gloo" or BACKEND == "nccl": 
+if BACKEND == "gloo" or BACKEND == "nccl": WORLD_SIZE = os.environ["WORLD_SIZE"] class TestDistBackend(TestCase, _DistTestBase): @@ -1052,7 +1047,6 @@ def wrapper(self): @classmethod def setUpClass(cls): os.environ["MASTER_ADDR"] = MASTER_ADDR - os.environ["MASTER_PORT"] = MASTER_PORT os.environ["WORLD_SIZE"] = WORLD_SIZE for attr in dir(cls): if attr.startswith("test"): @@ -1060,6 +1054,17 @@ def setUpClass(cls): setattr(cls, attr, cls.manager_join(fn)) def setUp(self): + # Adding this hack until we fix the FileStore to delete its + # content at the end + global INIT_METHOD + if INIT_METHOD.startswith("file://"): + _, filename = tempfile.mkstemp(prefix=FOLDER) + INIT_METHOD = "file://{}".format(filename) + + if INIT_METHOD.startswith("env://"): + port = common.find_free_port() + os.environ["MASTER_PORT"] = str(port) + self.processes = [] self.rank = self.MANAGER_PROCESS_RANK Barrier.init() @@ -1081,7 +1086,10 @@ def _run(self, rank): self.rank = rank try: dist.init_process_group( - init_method=INIT_METHOD, backend=BACKEND, world_size=int(WORLD_SIZE) + init_method=INIT_METHOD, + backend=BACKEND, + world_size=int(WORLD_SIZE), + rank=self.rank ) except RuntimeError as e: if "recompile" in e.args[0]: diff --git a/test/test_jit.py b/test/test_jit.py index 5a9e9656e567ab..4fe4adc5b1d00d 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -396,6 +396,28 @@ def fn(x, y): self.assertEqual(fn(x, y), fn_traced(x, y)) + def test_disabled(self): + torch.jit._enabled = False + try: + def f(x, y): + return x + y + + self.assertIs(torch.jit.trace(torch.randn(2, 2), torch.randn(2, 2))(f), f) + self.assertIs(torch.jit.script(f), f) + + class MyModule(torch.jit.ScriptModule): + @torch.jit.script_method + def method(self, x): + return x + + # XXX: Unfortunately ScriptModule won't simply become Module now, + # because that requires disabling the JIT at startup time, which + # we can't do in here. 
+ # We need to or those two conditions to make it work with all versions of Python + self.assertTrue(inspect.ismethod(MyModule.method) or inspect.isfunction(MyModule.method)) + finally: + torch.jit._enabled = True + # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, # and we attempted to trace its derivative (which is not @@ -956,6 +978,24 @@ def fn(x, y): self.assertExpectedGraph(traced_fn.graph) self.assertExportImport(traced_fn.graph, (x, y)) + def test_trace_tensor_factory(self): + def run(**kwargs): + inputs_require_grads = kwargs.pop('inputs_require_grads', True) + + def fn(x): + return x + torch.ones(2, 3, **kwargs) + input = torch.ones(2, 3, **kwargs) + self.checkTrace(fn, (input,), inputs_require_grads=inputs_require_grads) + # check we recorded 'ones' and did not just record a constant + tfn = torch.jit.trace(input)(fn) + self.assertTrue("ones" in str(tfn.graph)) + run() + run(dtype=torch.int, inputs_require_grads=False) + if RUN_CUDA: + run(device="cuda:0") + if RUN_CUDA_MULTI_GPU: + run(device="cuda:1") + # TODO: implement @unittest.expectedFailure def test_output_unflatten(self): @@ -1384,8 +1424,6 @@ def constant_prop(a, b): self.run_pass('constant_propagation', constant_prop.graph) self.assertExpected(canonical(constant_prop.graph)) - # TODO: implement - @unittest.expectedFailure def test_constant_prop_loop_constant(self): @torch.jit.script def constant_prop(): @@ -4701,8 +4739,12 @@ def __init__(self, m): @torch.jit.script_method def forward(self, x): x += x - if True: - if True: + # because we are testing if we emit `if` statement correctly + # we cannot use `True` as the condition. Constant prop + # would remove the `if` statements. + c = sum(x) > 4 + if c: + if c: y = self.m(x) else: y = self.m(x) diff --git a/test/test_thd_distributed.py b/test/test_thd_distributed.py new file mode 100644 index 00000000000000..47dbe9d056f154 --- /dev/null +++ b/test/test_thd_distributed.py @@ -0,0 +1,1148 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import copy +import fcntl +import multiprocessing +import os +import sys +import time +import unittest +from contextlib import contextmanager +from functools import reduce, wraps + +import torch +import torch.cuda +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from common import TestCase +from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR +from torch.autograd import Variable + + +BACKEND = os.environ["BACKEND"] +TEMP_DIR = os.environ["TEMP_DIR"] +INIT_METHOD = os.getenv("INIT_METHOD", "env://") +MASTER_PORT = "29500" + +DEFAULT_TIMEOUT = 300 +CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500} + + +def get_timeout(test_id): + test_name = test_id.split(".")[-1] + if test_name in CUSTOMIZED_TIMEOUT: + return CUSTOMIZED_TIMEOUT[test_name] + else: + return DEFAULT_TIMEOUT + + +if not dist.is_available(): + print("Distributed not available, skipping tests") + sys.exit(0) + +SKIP_IF_NO_CUDA_EXIT_CODE = 75 +SKIP_IF_NO_GPU_EXIT_CODE = 76 +SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE = 77 +SKIP_IF_BACKEND_UNAVAILABLE = 78 + + +def skip_if_no_cuda_distributed(func): + func.skip_if_no_cuda_distributed = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not torch.cuda.is_available(): + sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def skip_if_no_gpu(func): + """ Nccl multigpu tests requires at least 2 GPUS. 
Skip if this is not met""" + func.skip_if_no_gpu = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not torch.cuda.is_available(): + sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE) + if torch.cuda.device_count() < int(os.environ["WORLD_SIZE"]): + sys.exit(SKIP_IF_NO_GPU_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def skip_if_small_worldsize(func): + func.skip_if_small_worldsize = True + + @wraps(func) + def wrapper(*args, **kwargs): + if (os.environ["BACKEND"] != "mpi") and int(os.environ["WORLD_SIZE"]) <= 2: + sys.exit(SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def apply_hack_for_nccl(): + # This is a hack for a known NCCL issue using multiprocess + # in conjunction with multiple threads to manage different GPUs which + # may cause ncclCommInitRank to fail. + # http://docs.nvidia.com/deeplearning/sdk/nccl-release-notes/rel_2.1.4.html#rel_2.1.4 + # It slows down the performance of collective operations. + # Without this setting NCCL might throw unhandled error. + os.environ["NCCL_MAX_NRINGS"] = "1" + + +@contextmanager +def _lock(): + lockfile = os.path.join(TEMP_DIR, "lockfile") + with open(lockfile, "w") as lf: + try: + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield + finally: + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + lf.close() + + +def _build_tensor(size, value=None): + if value is None: + value = size + return torch.FloatTensor(size, size, size).fill_(value) + + +class Barrier(object): + barrier_id = 0 + + @classmethod + def init(cls): + cls.barrier_id = 0 + barrier_dir = os.path.join(TEMP_DIR, "barrier") + for f_name in os.listdir(barrier_dir): + os.unlink(os.path.join(barrier_dir, f_name)) + + @classmethod + def sync(cls, timeout=5): + cls.barrier_id += 1 + barrier_dir = os.path.join(TEMP_DIR, "barrier") + pid = str(os.getpid()) + barrier_file = os.path.join(barrier_dir, pid) + with _lock(): + with open(barrier_file, "w") as f: + f.write(str(cls.barrier_id)) + + start_time = time.time() + while True: + arrived = 0 + with _lock(): + for f_name in os.listdir(barrier_dir): + with open(os.path.join(barrier_dir, f_name), "r") as f: + data = f.read() + if int(data) >= cls.barrier_id: + arrived += 1 + if arrived == dist.get_world_size(): + break + + if time.time() - start_time > timeout: + raise RuntimeError("barrier timeout") + time.sleep(0.1) + + +class _DistTestBase(object): + def _barrier(self, *args, **kwargs): + Barrier.sync(*args, **kwargs) + + def _init_group_test(self): + group = [1, 2] + group_id = dist.new_group(group) + rank = dist.get_rank() + if rank not in group: + return ([], None, rank) + + return (group, group_id, rank) + + def _init_global_test(self): + group = [i for i in range(0, dist.get_world_size())] + group_id = dist.group.WORLD + rank = dist.get_rank() + return (group, group_id, rank) + + # HELPER FOR MULTIGPU TESTS + def _init_multigpu_helper(self): + """Multigpu tests are designed to simulate the multi nodes with multi + GPUs on each node. Nccl backend requires equal #GPUs in each process. + On a single node, all visible GPUs are evenly + divided to subsets, each process only uses a subset. 
+ """ + nGPUs = torch.cuda.device_count() + world_size = dist.get_world_size() + visible_devices = range(nGPUs) + + if BACKEND == "nccl": + apply_hack_for_nccl() + + nGPUs_per_process = nGPUs // world_size + rank_to_GPU = { + i: list( + visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process] + ) + for i in range(world_size) + } + return rank_to_GPU + + # GET RANK + def test_get_rank(self): + test_dir = os.path.join(TEMP_DIR, "test_dir") + pid = str(os.getpid()) + num_processes = dist.get_world_size() + with open(os.path.join(test_dir, pid), "w") as f: + f.write(str(dist.get_rank())) + + self._barrier() + + all_ranks = set() + for f_name in os.listdir(test_dir): + with open(os.path.join(test_dir, f_name), "r") as f: + all_ranks.add(int(f.read())) + self.assertEqual(len(all_ranks), num_processes) + + self._barrier() + + if dist.get_rank() == 0: + for f_name in os.listdir(test_dir): + os.unlink(os.path.join(test_dir, f_name)) + + self._barrier() + + # SEND RECV + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support send/recv") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support send/recv") + def test_send_recv(self): + rank = dist.get_rank() + tensor = _build_tensor(rank + 1) + for dest in range(0, dist.get_world_size()): + if dest == rank: + continue + dist.send(tensor, dest) + + for src in range(0, dist.get_world_size()): + if src == rank: + continue + tensor = _build_tensor(src + 1, value=-1) + expected_tensor = _build_tensor(src + 1) + dist.recv(tensor, src) + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + # SEND RECV ANY SOURCE + @unittest.skipIf( + BACKEND == "gloo", "Gloo does not support send/recv from any source" + ) + @unittest.skipIf( + BACKEND == "nccl", "Nccl does not support send/recv from any source" + ) + def test_send_recv_any_source(self): + rank = dist.get_rank() + tensor = _build_tensor(10, rank) + for dest in range(0, dist.get_world_size()): + if dest == rank: + continue + dist.send(tensor, dest) + + recv_ranks = set() + for src in range(0, dist.get_world_size()): + if src == rank: + continue + tensor = _build_tensor(10, value=-1) + sender = dist.recv(tensor) + self.assertTrue(tensor.eq(sender).all()) + recv_ranks.add(sender) + + self.assertEqual(len(recv_ranks), dist.get_world_size() - 1) + self._barrier() + + # ISEND + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support isend") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support isend") + def test_isend(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + requests = [ + dist.isend(_build_tensor(dest, 10), dest) + for dest in range(1, world_size) + ] + for request in requests: + request.wait() + self.assertTrue(request.is_completed()) + else: + tensor = _build_tensor(rank, -1) + dist.recv(tensor, 0) + self.assertEqual(tensor, _build_tensor(rank, 10)) + + self._barrier() + + # IRECV + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support irecv") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support irecv") + def test_irecv(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)] + requests = [ + dist.irecv(expected_tensors[src - 1], src) + for src in range(1, world_size) + ] + + for src in range(1, world_size): + requests[src - 1].wait() + self.assertTrue(requests[src - 1].is_completed()) + self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10)) + else: + tensor = _build_tensor(rank, 10) + 
dist.send(tensor, 0) + + self._barrier() + + # BROADCAST + def _test_broadcast_helper( + self, group, group_id, rank, cuda=False, rank_to_GPU=None + ): + for ttype, value, requires_cuda in [ + ("torch.FloatTensor", -1e-10, False), + ("torch.DoubleTensor", -1e-100, False), + ("torch.HalfTensor", -0.1, True), + ("torch.CharTensor", -2, False), + ("torch.ByteTensor", 129, False), + ("torch.IntTensor", -1e5, False), + ("torch.LongTensor", -1e15, False), + ]: + if requires_cuda and not cuda: + continue + for src in group: + expected_tensor = _build_tensor(src + 1, value).type(ttype) + if cuda: + expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0]) + if rank == src: + dist.broadcast(expected_tensor, src, group_id) + else: + tensor = _build_tensor(src + 1, -1).type(ttype) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.broadcast(tensor, src, group_id) + self.assertEqual(tensor.size(), expected_tensor.size()) + self.assertEqual(tensor.ne(expected_tensor).max(), 0) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_broadcast(self): + group, group_id, rank = self._init_global_test() + self._test_broadcast_helper(group, group_id, rank) + + @unittest.skipIf( + BACKEND != "gloo" and BACKEND != "nccl", + "Only Gloo and Nccl backend supports CUDA allReduce", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_broadcast_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_broadcast_helper(group, group_id, rank, True, rank_to_GPU) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_broadcast_group(self): + group, group_id, rank = self._init_group_test() + self._test_broadcast_helper(group, group_id, rank) + + # REDUCE + def _test_reduce_helper( + self, + group, + group_id, + rank, + op, + master_value, + worker_value, + expected_value, + cuda=False, + rank_to_GPU=None, + ): + for src in group: + if rank == src: + tensor = _build_tensor(src + 1).fill_(master_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.reduce(tensor, src, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + else: + tensor = _build_tensor(src + 1).fill_(worker_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.reduce(tensor, src, op, group_id) + + self._barrier() + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_sum(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA reduce") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_reduce_sum_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + 10 * (len(group) - 1), + True, + rank_to_GPU, + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_product(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] 
* (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_min(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_max(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_sum(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_product(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_min(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_max(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) + + # ALL REDUCE + def _test_all_reduce_helper( + self, + group, + group_id, + rank, + op, + master_value, + worker_value, + expected_value, + cuda=False, + rank_to_GPU=None, + ): + for src in group: + if rank == src: + tensor = _build_tensor(src + 1).fill_(master_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.all_reduce(tensor, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + else: + tensor = _build_tensor(src + 1).fill_(worker_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.all_reduce(tensor, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_sum(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf( + BACKEND != "gloo" and BACKEND != "nccl", + "Only Gloo & Nccl backend support CUDA allReduce", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_all_reduce_sum_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + True, + rank_to_GPU, + ) + + 
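+    # NOTE (illustrative comment, not from the original patch): the reduce and
+    # all_reduce tests above check results against a closed-form expected
+    # value: one rank (the "master") contributes master_value and every other
+    # rank contributes worker_value, so for SUM the expectation is
+    #
+    #   expected = master_value + worker_value * (world_size - 1)
+    #
+    # e.g. 2 + 10 * (len(group) - 1) == 22 for a world size of 3, while for
+    # PRODUCT it is reduce(lambda x, y: x * y,
+    #                      [worker_value] * (world_size - 1), master_value),
+    # i.e. 2 * 10 ** (world_size - 1).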
@unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_product(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_min(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_max(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_sum(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_product(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_min(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_max(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 + ) + + # SCATTER + def _test_scatter_helper(self, group, group_id, rank): + for dest in group: + tensor = _build_tensor(dest + 1, -1) + expected_tensor = _build_tensor(dest + 1, rank) + tensors = ( + [_build_tensor(dest + 1, i) for i in group] if rank == dest else [] + ) + dist.scatter(tensor, src=dest, scatter_list=tensors, group=group_id) + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + def test_scatter(self): + group, group_id, rank = self._init_global_test() + self._test_scatter_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + @skip_if_small_worldsize + def test_scatter_group(self): + group, group_id, rank = self._init_group_test() + self._test_scatter_helper(group, group_id, rank) + + # GATHER + def _test_gather_helper(self, group, group_id, rank): + for dest in group: + tensor = _build_tensor(dest + 1, rank) + tensors = ( + [_build_tensor(dest + 1, -1) for i in group] if rank == dest else [] + ) + dist.gather(tensor, dst=dest, gather_list=tensors, group=group_id) + if rank == dest: + expected_tensors = [_build_tensor(dest + 1, i) for i in group] + for t1, t2 in zip(tensors, expected_tensors): + self.assertEqual(t1, t2) + + self._barrier() + + @unittest.skipIf(BACKEND == 
"gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_gather(self): + group, group_id, rank = self._init_global_test() + self._test_gather_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_gather_group(self): + group, group_id, rank = self._init_group_test() + self._test_gather_helper(group, group_id, rank) + + # ALL GATHER + def _test_all_gather_helper( + self, group, group_id, rank, cuda=False, rank_to_GPU=None + ): + for dest in group: + tensor = _build_tensor(dest + 1, rank) + tensors = [_build_tensor(dest + 1, -1) for i in group] + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors] + dist.all_gather(tensors, tensor, group_id) + + expected_tensors = [_build_tensor(dest + 1, i) for i in group] + for t1, t2 in zip(tensors, expected_tensors): + self.assertEqual(t1, t2) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_gather(self): + group, group_id, rank = self._init_global_test() + self._test_all_gather_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA all gather") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_all_gather_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_gather_helper(group, group_id, rank, True, rank_to_GPU) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_gather_group(self): + group, group_id, rank = self._init_group_test() + self._test_all_gather_helper(group, group_id, rank) + + # BARRIER + def _test_barrier_helper(self, group, group_id, rank): + WAIT_TIME = 0.3 # seconds + + for dest in group: + expected_time = torch.DoubleTensor(1).fill_(0.0) + if dest == rank: + expected_time.fill_(time.time() + WAIT_TIME) + dist.broadcast(expected_time, dest, group_id) + time.sleep(WAIT_TIME + 0.1) # sleep a little bit longer + dist.barrier(group_id) + else: + dist.broadcast(expected_time, dest, group_id) + dist.barrier(group_id) + self.assertGreaterEqual(time.time(), expected_time[0]) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_barrier(self): + group, group_id, rank = self._init_global_test() + self._test_barrier_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_barrier_group(self): + group, group_id, rank = self._init_group_test() + self._test_barrier_helper(group, group_id, rank) + + def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): + for src in group: + expected_tensor = _build_tensor(src + 1) + tensors = [ + _build_tensor(src + 1, -1).cuda(device=i) for i in rank_to_GPU[rank] + ] + if rank == src: + tensors[0] = expected_tensor.cuda(device=rank_to_GPU[rank][0]) + + dist.broadcast_multigpu(tensors, src, group_id) + for tensor in tensors: + self.assertEqual(tensor, expected_tensor) + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports broadcast multigpu") + @skip_if_no_gpu + def test_broadcast_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + 
self._test_broadcast_multigpu_helper(group, group_id, rank, rank_to_GPU) + + def _test_all_reduce_multigpu_helper( + self, + group, + group_id, + rank, + rank_to_GPU, + op, + master_value, + worker_value, + expected_value, + ): + for src in group: + if rank == src: + tensors = [ + _build_tensor(src + 1, master_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + else: + tensors = [ + _build_tensor(src + 1, worker_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + + dist.all_reduce_multigpu(tensors, op, group_id) + expected_tensor = _build_tensor(src + 1, expected_value) + for tensor in tensors: + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allreduce multigpu") + @skip_if_no_gpu + def test_all_reduce_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_reduce_multigpu_helper( + group, + group_id, + rank, + rank_to_GPU, + dist.reduce_op.SUM, + 2, + 10, + (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]), + ) + + def _test_reduce_multigpu_helper( + self, + group, + group_id, + rank, + rank_to_GPU, + op, + master_value, + worker_value, + expected_value, + ): + for src in group: + if rank == src: + tensors = [ + _build_tensor(src + 1, master_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + dist.reduce_multigpu(tensors, src, op, group_id) + expected_tensor = _build_tensor(src + 1, expected_value) + self.assertEqual(tensors[0], expected_tensor) + else: + tensors = [ + _build_tensor(src + 1, worker_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + dist.reduce_multigpu(tensors, src, op, group_id) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports reduce multigpu") + @skip_if_no_gpu + def test_reduce_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_reduce_multigpu_helper( + group, + group_id, + rank, + rank_to_GPU, + dist.reduce_op.SUM, + 2, + 10, + (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]), + ) + + def _test_all_gather_multigpu_helper(self, group, group_id, rank, rank_to_GPU): + for dest in group: + tensors = [ + _build_tensor(dest + 1).cuda(device=i) for i in rank_to_GPU[rank] + ] + + # construct expected output along with + # a place holder to receive all gather results + output_tensors = [] + expected_output = [] + output_per_gpu = ( + [_build_tensor(dest + 1, -1)] * len(rank_to_GPU[0]) * len(group) + ) + expected_per_gpu = ( + [_build_tensor(dest + 1)] * len(rank_to_GPU[0]) * len(group) + ) + for gpu in rank_to_GPU[rank]: + output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu]) + expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu]) + + dist.all_gather_multigpu(output_tensors, tensors, group_id) + self.assertEqual(output_tensors, expected_output) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allgather multigpu") + @skip_if_no_gpu + def test_all_gather_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_gather_multigpu_helper(group, group_id, rank, rank_to_GPU) + + def _create_Net(self): + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 50, bias=False) + self.fc3 = nn.Linear(50, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = 
self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return F.softmax(x, dim=1) + + return Net() + + def _model_step(self, model): + for param in model.parameters(): + param.data += param.grad + param.grad = None + + def _prepare_dummy_data(self, local_bs): + # global_bs for DDP should be divisible by WORLD_SIZE + global_bs = int(WORLD_SIZE) * local_bs + input_cpu = torch.randn(global_bs, 2) + target = torch.randn(global_bs, 4) + loss = nn.MSELoss() + return global_bs, input_cpu, target, loss + + # END TO END TEST FOR DISTRIBUTEDDATAPARALLEL + def _test_DDP_helper(self, model, input_var, target, loss): + model.train() + output = model(input_var) + l = loss(output, target) + l.backward() + + def _assert_equal_param(self, param_gpu, param_DDP): + self.assertEqual(len(param_gpu), len(param_DDP)) + for p_gpu, p_DDP in zip(param_gpu, param_DDP): + self.assertEqual(p_gpu, p_DDP) + + def _test_DDP_2iter( + self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size + ): + for _ in range(2): + # single cpu/gpu training + self._test_DDP_helper(model_base, input, target, loss) + + # DDP training, DDP scatters subsets of input_cpu to nodes/GPUs + self._test_DDP_helper( + model_DDP, + input[rank * local_bs: (rank + 1) * local_bs], + target[rank * local_bs: (rank + 1) * local_bs], + loss, + ) + + # Update weights and run a second iteration to shake out errors + self._model_step(model_base) + self._model_step(model_DDP) + self._assert_equal_param( + list(model_base.parameters()), list(model_DDP.module.parameters()) + ) + + # Shuffle the input so that DDP input is different + input = input[torch.randperm(batch_size)] + + @unittest.skipIf( + BACKEND != "nccl" and BACKEND != "gloo", + "Only Nccl & Gloo backend support DistributedDataParallel", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_DistributedDataParallel(self): + # Run a simple end to end DDP model, use result of single node model + # as baseline + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + + # cpu training setup + model = self._create_Net() + + # single gpu training setup + model_gpu = copy.deepcopy(model) + gpu_subset = list(rank_to_GPU[rank]) + model_gpu.cuda(gpu_subset[0]) + + # DDP training setup + model_DDP = copy.deepcopy(model) + model_DDP.cuda(gpu_subset[0]) + model_DDP = nn.parallel.DistributedDataParallel( + model_DDP, device_ids=gpu_subset + ) + + # dummy data initialization + local_bs = len(gpu_subset) + global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) + + # check two model parameters over 2 iterations + self._test_DDP_2iter( + model_gpu, + model_DDP, + input_cpu.cuda(gpu_subset[0]), + target.cuda(gpu_subset[0]), + loss, + local_bs, + rank, + global_bs, + ) + self._barrier() + + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DistributedDataParallelCPU" + ) + def test_DistributedDataParallelCPU(self): + # Run a simple end to end DDP-CPU model, use result of single node + # model as baseline + group, group_id, rank = self._init_global_test() + + # cpu training setup + model_base = self._create_Net() + + # DDP-CPU training setup + model_DDP = copy.deepcopy(model_base) + model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) + + # dummy data initialization + local_bs = 2 + global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) + + # check two model parameters over 2 iterations + self._test_DDP_2iter( + model_base, model_DDP, input_cpu, target, loss, local_bs, rank, 
global_bs + ) + self._barrier() + + +if BACKEND == "tcp" or BACKEND == "gloo" or BACKEND == "nccl": + WORLD_SIZE = os.environ["WORLD_SIZE"] + + class TestDistBackend(TestCase, _DistTestBase): + MANAGER_PROCESS_RANK = -1 + + @staticmethod + def manager_join(fn): + @wraps(fn) + def wrapper(self): + if self.rank == self.MANAGER_PROCESS_RANK: + self._join_and_reduce(fn) + else: + fn(self) + + return wrapper + + @classmethod + def setUpClass(cls): + os.environ["MASTER_ADDR"] = MASTER_ADDR + os.environ["MASTER_PORT"] = MASTER_PORT + os.environ["WORLD_SIZE"] = WORLD_SIZE + for attr in dir(cls): + if attr.startswith("test"): + fn = getattr(cls, attr) + setattr(cls, attr, cls.manager_join(fn)) + + def setUp(self): + self.processes = [] + self.rank = self.MANAGER_PROCESS_RANK + Barrier.init() + for rank in range(int(WORLD_SIZE)): + self.processes.append(self._spawn_process(rank)) + + def tearDown(self): + for p in self.processes: + p.terminate() + + def _spawn_process(self, rank): + os.environ["RANK"] = str(rank) + name = "process " + str(rank) + process = multiprocessing.Process(target=self._run, name=name, args=(rank,)) + process.start() + return process + + def _run(self, rank): + self.rank = rank + try: + dist.init_process_group( + init_method=INIT_METHOD, backend=BACKEND, world_size=int(WORLD_SIZE) + ) + except RuntimeError as e: + if "recompile" in e.args[0]: + sys.exit(SKIP_IF_BACKEND_UNAVAILABLE) + # sys.exit(0) + raise + # self.id() == e.g. '__main__.TestDistributed.test_get_rank' + # We're retreiving a corresponding test and executing it. + getattr(self, self.id().split(".")[2])() + sys.exit(0) + + def _join_and_reduce(self, fn): + skip_ok = ( + getattr(fn, "skip_if_no_cuda_distributed", False) or + getattr(fn, "skip_if_no_gpu", False) or + getattr(fn, "skip_if_small_worldsize", False) + ) + self.JOIN_TIMEOUT = get_timeout(self.id()) + for p in self.processes: + p.join(self.JOIN_TIMEOUT) + + first_process = self.processes[0] + for p in self.processes: + self.assertEqual(p.exitcode, first_process.exitcode) + + if first_process.exitcode == SKIP_IF_BACKEND_UNAVAILABLE: + raise unittest.SkipTest("Compiled without the " + BACKEND + " backend") + + if skip_ok: + # do this first so we don't give an error message about + # mismatched exit codes if the first isn't valid + assert ( + first_process.exitcode == 0 or + first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE or + first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE or + first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE + ) + + if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE: + raise unittest.SkipTest("cuda is not available") + if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE: + raise unittest.SkipTest( + "One unique gpu per process is not available" + ) + if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE: + raise unittest.SkipTest("worldsize is too small to run group tests") + + self.assertEqual(first_process.exitcode, 0) + + +elif BACKEND == "mpi": + WORLD_SIZE = os.environ["WORLD_SIZE"] + dist.init_process_group(init_method=INIT_METHOD, backend="mpi") + + class TestMPI(TestCase, _DistTestBase): + pass + + +if __name__ == "__main__": + assert ( + not torch.cuda._initialized + ), "test_distributed must not have initialized CUDA context on main process" + + unittest.main() diff --git a/test/test_torch.py b/test/test_torch.py index ff84dbff1cb054..91f2f702552c77 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -179,8 +179,6 @@ def test_namespace(ns, *skips): 'as_strided_', 
re.compile('^clamp_(min|max)_?$'), 'coalesce', - 'digamma', - 'digamma_', 'index_put', 'is_coalesced', 'is_distributed', @@ -4278,13 +4276,12 @@ def test_eig(self): Xhat = torch.mm(torch.mm(v, torch.diag(e.select(1, 0))), v.t()) self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') - @skipIfNoLapack - @skipIfRocm - def test_symeig(self): - xval = torch.rand(100, 3) + @staticmethod + def _test_symeig(self, conv_fn): + xval = conv_fn(torch.rand(100, 3)) cov = torch.mm(xval.t(), xval) - rese = torch.zeros(3) - resv = torch.zeros(3, 3) + rese = conv_fn(torch.zeros(3)) + resv = conv_fn(torch.zeros(3, 3)) # First call to symeig self.assertTrue(resv.is_contiguous(), 'resv is not contiguous') @@ -4298,17 +4295,30 @@ def test_symeig(self): ahat = torch.mm(torch.mm(resv, torch.diag(rese)), resv.t()) self.assertEqual(cov, ahat, 1e-8, 'VeV\' wrong') + # test eigenvectors=False + rese2 = conv_fn(torch.zeros(3)) + resv2 = conv_fn(torch.randn(3, 3)) + expected_resv2 = conv_fn(torch.zeros(3, 3)) + torch.symeig(cov.clone(), False, out=(rese2, resv2)) + self.assertEqual(rese, rese2) + self.assertEqual(resv2, expected_resv2) + # test non-contiguous - X = torch.rand(5, 5) + X = conv_fn(torch.rand(5, 5)) X = X.t() * X - e = torch.zeros(4, 2).select(1, 1) - v = torch.zeros(4, 2, 4)[:, 1] + e = conv_fn(torch.zeros(4, 2)).select(1, 1) + v = conv_fn(torch.zeros(4, 2, 4))[:, 1] self.assertFalse(v.is_contiguous(), 'V is contiguous') self.assertFalse(e.is_contiguous(), 'E is contiguous') torch.symeig(X, True, out=(e, v)) Xhat = torch.mm(torch.mm(v, torch.diag(e)), v.t()) self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') + @skipIfNoLapack + @skipIfRocm + def test_symeig(self): + self._test_symeig(self, lambda x: x) + @skipIfNoLapack def test_svd(self): a = torch.Tensor(((8.79, 6.11, -9.15, 9.57, -3.49, 9.84), @@ -6094,7 +6104,7 @@ def _test_abs(tensors_dict): _test_abs(self._make_tensors((3, 5, 7), val_range=(0, max_val))) _test_abs(self._make_tensors((2, 2, 5, 8, 2, 3), val_range=(0, max_val))) _test_abs(self._make_tensors((1000, ), val_range=(0, max_val))) - _test_abs(self._make_tensors((30, 30, 30), val_range=(0, max_val))) + _test_abs(self._make_tensors((10, 10, 10), val_range=(0, max_val))) # Checking that the right abs function is called for LongTensor bignumber = 2 ^ 31 + 1 @@ -8474,6 +8484,67 @@ def test_unique(self): self.assertEqual(torch.ByteTensor([7, 42, 128, 133]), byte_unique) self.assertEqual(torch.LongTensor([3, 0, 0, 0, 1, 2]), byte_inverse) + def test_unique_dim(self): + def run_test(dtype=torch.float): + x = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]], + [[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_unique_dim0 = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_inverse_dim0 = torch.tensor([0, 0]) + expected_unique_dim1 = torch.tensor([[[0., 1.], + [1., 1.], + [2., 1.]], + [[0., 1.], + [1., 1.], + [2., 1.]]], dtype=dtype) + expected_inverse_dim1 = torch.tensor([1, 0, 2, 0]) + expected_unique_dim2 = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]], + [[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_inverse_dim2 = torch.tensor([0, 1]) + + # dim0 + x_unique = torch.unique(x, dim=0) + self.assertEqual(expected_unique_dim0, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=0) + self.assertEqual(expected_unique_dim0, x_unique) + self.assertEqual(expected_inverse_dim0, x_inverse) + + # dim1 + x_unique = torch.unique(x, dim=1) + self.assertEqual(expected_unique_dim1, 
x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=1) + self.assertEqual(expected_unique_dim1, x_unique) + self.assertEqual(expected_inverse_dim1, x_inverse) + + # dim2 + x_unique = torch.unique(x, dim=2) + self.assertEqual(expected_unique_dim2, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=2) + self.assertEqual(expected_unique_dim2, x_unique) + self.assertEqual(expected_inverse_dim2, x_inverse) + + run_test(torch.float) + run_test(torch.double) + run_test(torch.long) + run_test(torch.uint8) + @staticmethod def _test_bincount(self, device): # negative input throws diff --git a/tools/autograd/gen_variable_factories.py b/tools/autograd/gen_variable_factories.py index c963650933cf25..ac3e8782eb355d 100644 --- a/tools/autograd/gen_variable_factories.py +++ b/tools/autograd/gen_variable_factories.py @@ -5,11 +5,16 @@ import re from .utils import CodeTemplate, write +from .gen_variable_type import format_trace + FUNCTION_TEMPLATE = CodeTemplate("""\ inline at::Tensor ${name}(${formals}) { + ${pre_record_trace} at::Tensor tensor = at::${name}(${actuals}); - return autograd::make_variable(tensor, /*requires_grad=*/${requires_grad}); + auto result = autograd::make_variable(tensor, /*requires_grad=*/${requires_grad}); + ${post_record_trace} + return result; } """) @@ -53,6 +58,10 @@ def process_function(decl, has_tensor_options): requires_grad = "options.requires_grad()" if has_tensor_options else "false" if decl['name'].endswith('_like') and not has_tensor_options: actuals.append('at::TensorOptions({}, /*discard_runtime_type=*/true)'.format(actuals[0])) + + pre_record_trace, post_record_trace = format_trace(decl) + return FUNCTION_TEMPLATE.substitute( - name=decl["name"], formals=formals, actuals=actuals, requires_grad=requires_grad + name=decl["name"], formals=formals, actuals=actuals, requires_grad=requires_grad, + pre_record_trace=pre_record_trace, post_record_trace=post_record_trace ) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 0fe32115da314e..caa6744bb38542 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -141,7 +141,7 @@ POST_RECORD_TRACE = CodeTemplate("""\ if (jit::tracer::isTracing()) { - jit::tracer::postRecordTrace(node, ArrayRef(${trace_outputs}) ); + jit::tracer::postRecordTrace(node, at::ArrayRef(${trace_outputs}) ); } """) @@ -183,6 +183,41 @@ def should_trace(declaration): return True +def get_trace_outputs(declaration): + if declaration['return_type'] == 'std::vector': + return 'flatten_tensor_args({})'.format(declaration['returns'][0]['name']) + elif declaration['name'].endswith('_out'): + output_args = [arg['name'] for arg in declaration['arguments'] + if arg.get('output', False)] + return '{' + ', '.join(output_args) + '}' + trace_outs = [r['name'] for r in declaration['returns']] + if any(ret['dynamic_type'] == 'TensorList' for ret in declaration['returns']): + return CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=trace_outs) + else: + return CodeTemplate("{ ${outs} }").substitute(outs=trace_outs) + + +def format_trace(declaration): + local = {} + + add_trace_inputs = [] + for argument in declaration['arguments']: + add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) + local['add_trace_inputs'] = '\n'.join(add_trace_inputs) + + # Record inplace operations as out-of-place operations (e.g., + # not add_ but add) + # TODO: Add a proper concept of side effects to the IR, and + # properly record 
inplace operations. + local['trace_name'] = uninplace_api_name(declaration['api_name']) + if local['trace_name'] in RENAME_TRACE: + local['trace_name'] = RENAME_TRACE[local['trace_name']] + + local['trace_outputs'] = get_trace_outputs(declaration) + + return (PRE_RECORD_TRACE.substitute(local), POST_RECORD_TRACE.substitute(local)) + + def gen_variable_type(out, aten_declarations, template_path): """VariableType.h and VariableType.cpp body @@ -361,42 +396,10 @@ def reference_args(args): res.append(arg['name']) return res - def get_trace_outputs(declaration): - if declaration['return_type'] == 'std::vector': - return 'flatten_tensor_args({})'.format(declaration['returns'][0]['name']) - elif name.endswith('_out'): - output_args = [arg['name'] for arg in arguments - if arg.get('output', False)] - return '{' + ', '.join(output_args) + '}' - trace_outs = [r['name'] for r in declaration['returns']] - if any(ret['dynamic_type'] == 'TensorList' for ret in declaration['returns']): - return CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=trace_outs) - else: - return CodeTemplate("{ ${outs} }").substitute(outs=trace_outs) - def emit_record_trace(env): if not should_trace(declaration): return ('', '') - - local = {} - - add_trace_inputs = [] - for argument in declaration['arguments']: - add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) - local['add_trace_inputs'] = '\n'.join(add_trace_inputs) - - # Record inplace operations as out-of-place operations (e.g., - # not add_ but add) - # TODO: Add a proper concept of side effects to the IR, and - # properly record inplace operations. - local['trace_name'] = uninplace_api_name(declaration['api_name']) - if local['trace_name'] in RENAME_TRACE: - local['trace_name'] = RENAME_TRACE[local['trace_name']] - - local['trace_outputs'] = get_trace_outputs(declaration) - - combined = nested_dict(local, nested_dict(env, declaration)) - return (PRE_RECORD_TRACE.substitute(combined), POST_RECORD_TRACE.substitute(combined)) + return format_trace(declaration) def declare_returned_variables(): if modifies_arguments: diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 89101a24714b72..244606ca7938d7 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -43,7 +43,7 @@ using namespace torch::autograd::generated; namespace torch { namespace autograd { VariableType::VariableType(Context* context, Type* baseType) - : Type(context, baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) + : Type(baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) , baseType(baseType) , id_(context->freshTypeID()) { str = std::string("Variable[") + baseType->toString() + "]"; diff --git a/tools/autograd/templates/variable_factories.h b/tools/autograd/templates/variable_factories.h index bc2fa21385777f..bf74abc9138c65 100644 --- a/tools/autograd/templates/variable_factories.h +++ b/tools/autograd/templates/variable_factories.h @@ -3,7 +3,7 @@ // ${generated_comment} #include - +#include #include #include diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 994a96ad822b41..d1cdb855c9099f 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -281,6 +281,12 @@ function build_caffe2() { # STOP!!! Are you trying to add a C or CXX flag? Add it # to CMakeLists.txt and aten/CMakeLists.txt, not here. # We need the vanilla cmake build to work. 
+ + # This is needed by the aten tests built with caffe2 + if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.1" ]; then + cp "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + fi + ${CMAKE_INSTALL} -j"$MAX_JOBS" # Install Python proto files diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index d337143dd8b09e..ff7fce56e91552 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -262,7 +262,8 @@ def declkey(decl): arguments.extend([ # XXX - until we actually have first-class interpreter types for these # concepts, the default values to be encoded in Tensors - + # If you change this, you also need to update [TensorOptions in script] + # in the tracer code. # dtype is specified as an int64_t of at::ScalarType {'name': 'dtype', 'simple_type': 'ScalarType', 'default': 'float', 'kwarg_only': True}, # layout is specified as an int64_t of at::Layout diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 39d14668958c94..0a76a89a20d55a 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -650,6 +650,20 @@ def add_docstr_all(method, docstr): See :func:`torch.diagonal` """) +add_docstr_all('digamma', + r""" +digamma() -> Tensor + +See :func:`torch.digamma` +""") + +add_docstr_all('digamma_', + r""" +digamma_() -> Tensor + +In-place version of :meth:`~Tensor.digamma` +""") + add_docstr_all('dim', r""" dim() -> int diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 6561c7a7c23889..a9db54d3117842 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1168,6 +1168,26 @@ def parse_kwargs(desc): [ 1.0500, 0.7336, -0.3836, -1.1015]]]) """) +add_docstr(torch.digamma, + r""" +digamma(input) -> Tensor + +Computes the logarithmic derivative of the gamma function on `input`. + +.. math:: + \psi(x) = \frac{d}{dx} \ln\left(\Gamma\left(x\right)\right) = \frac{\Gamma'(x)}{\Gamma(x)} + +Args: + input (Tensor): the tensor to compute the digamma function on + +Example:: + + >>> a = torch.tensor([1, 0.5]) + >>> torch.digamma(a) + tensor([-0.5772, -1.9635]) +""") + + add_docstr(torch.dist, r""" dist(input, other, p=2) -> Tensor @@ -4117,7 +4137,7 @@ def parse_kwargs(desc): Constructs a sparse tensors in COO(rdinate) format with non-zero elements at the given :attr:`indices` with the given :attr:`values`. A sparse tensor can be `uncoalesced`, in that case, there are duplicate coordinates in the indices, and the value at that index is the sum of all duplicate value entries: -`torch.spaerse`_. +`torch.sparse`_. Args: indices (array_like): Initial data for the tensor. Can be a list, tuple, @@ -4439,6 +4459,15 @@ def parse_kwargs(desc): upper(boolean, optional): controls whether to consider upper-triangular or lower-triangular region out (tuple, optional): the output tuple of (Tensor, Tensor) +Returns: + (Tensor, Tensor): A tuple containing + + - **e** (*Tensor*): Shape :math:`(m)`. Each element is an eigenvalue of ``input``, + The eigenvalues are in ascending order. + - **V** (*Tensor*): Shape :math:`(m \times m)`. + If ``eigenvectors=False``, it's a tensor filled with zeros. + Otherwise, this tensor contains the orthonormal eigenvectors of the ``input``. 
+ Examples:: diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 75e309ac0faf06..c1be47ad494397 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -554,11 +554,11 @@ def build_table(events, sort_by=None, header=None): header_sep = '-' * max_name_length + (' ' + '-' * col_width) * 5 # Have to use a list because nonlocal is Py3 only... - result = [''] + result = [] def append(s): - result[0] += s - result[0] += '\n' + result.append(s) + result.append('\n') # Yes, newline after the end as well # Actual printing if header is not None: @@ -572,4 +572,4 @@ def append(s): append(row_format.format(evt.key, evt.cpu_time_str, evt.cuda_time_str, evt.count, evt.cpu_time_total_str, evt.cuda_time_total_str)) - return result[0] + return ''.join(result) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index af367c3e544905..e17997e6e9baba 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -584,13 +584,20 @@ static PyObject* initModule() { ASSERT_TRUE(THCPStream_init(module)); #endif + auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { + // PyModule_AddObject steals reference + if (incref) { + Py_INCREF(v); + } + return PyModule_AddObject(module, name, v) == 0; + }; + #ifdef USE_CUDNN PyObject *has_cudnn = Py_True; #else PyObject *has_cudnn = Py_False; #endif - Py_INCREF(has_cudnn); - ASSERT_TRUE(PyModule_AddObject(module, "has_cudnn", has_cudnn) == 0); + ASSERT_TRUE(set_module_attr("has_cudnn", has_cudnn)); #ifdef USE_DISTRIBUTED_MW // See comment on CUDA objects @@ -611,19 +618,20 @@ static PyObject* initModule() { // Set ATen warnings to issue Python warnings at::Warning::set_warning_handler(&warning_handler); - ASSERT_TRUE(PyModule_AddObject(module, "has_mkl", at::hasMKL() ? Py_True : Py_False) == 0); + ASSERT_TRUE(set_module_attr("has_mkl", at::hasMKL() ? Py_True : Py_False)); + ASSERT_TRUE(set_module_attr("has_lapack", at::hasLAPACK() ? Py_True : Py_False)); #ifdef _GLIBCXX_USE_CXX11_ABI - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", - _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False) == 0); + ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False)); #else - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", Py_False) == 0); + ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", Py_False)); #endif auto& defaultGenerator = at::globalContext().defaultGenerator(at::kCPU); THPDefaultGenerator = (THPGenerator*)THPGenerator_NewWithGenerator( defaultGenerator); - ASSERT_TRUE(PyModule_AddObject(module, "default_generator", (PyObject*)THPDefaultGenerator) == 0); + // This reference is meant to be given away, so no need to incref here. + ASSERT_TRUE(set_module_attr("default_generator", (PyObject*)THPDefaultGenerator, /* incref= */ false)); #ifdef USE_NUMPY if (_import_array() < 0) return NULL; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index a4fcc6c45e874d..8fd95eda86f121 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -333,16 +333,15 @@ static PyObject * THCPModule_initExtension(PyObject *self) THCPCharStorage_postInit(m); THCPByteStorage_postInit(m); -#ifdef USE_MAGMA - THCMagma_init(state); - bool has_magma = true; -#else - bool has_magma = false; -#endif + bool has_magma = at::hasMAGMA(); + if (has_magma) { + THCMagma_init(state); + } bool has_half = true; auto set_module_attr = [&](const char* name, PyObject* v) { + // PyObject_SetAttrString doesn't steal reference. 
So no need to incref. if (PyObject_SetAttrString(m, name, v) < 0) { throw python_error(); } diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index fdf88bc0704a47..a67d009e024360 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -346,8 +346,8 @@ PyObject* c10d_init(PyObject* _unused) { #endif shared_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work") - .def("isCompleted", &::c10d::ProcessGroup::Work::isCompleted) - .def("isSuccess", &::c10d::ProcessGroup::Work::isSuccess) + .def("is_completed", &::c10d::ProcessGroup::Work::isCompleted) + .def("is_success", &::c10d::ProcessGroup::Work::isSuccess) .def("exception", &::c10d::ProcessGroup::Work::exception) .def("synchronize", &::c10d::ProcessGroup::Work::synchronize) .def( diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 42f3f583b848e9..d8f33c533b2039 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -151,9 +151,9 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) int64_t nindex = THPUtils_unpackLong(index); if (nindex < 0) nindex += THWStorage_(size)(LIBRARY_STATE self->cdata); - if (nindex < 0 || nindex >= self->cdata->size()) { + if (nindex < 0 || nindex >= self->cdata->numel()) { PyErr_Format(PyExc_IndexError, "index %" PRId64 " out of range for storage of " - "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->size()); + "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->numel()); return NULL; } real value = THWStorage_(get)(LIBRARY_STATE self->cdata, nindex); diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index 4a7c01b2ca2e82..6b462160c6d0b2 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -79,7 +79,7 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) } else { // TODO: retry on collision // TODO: free GIL - but remember to reacquire it when an exception is thrown - THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->size())); + THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->numel())); THWStorage_(copy)(new_storage, storage); THWStorage_(swap)(storage, new_storage); ctx = THManagedMapAllocator::fromDataPtr(storage->data_ptr()); @@ -90,7 +90,7 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) if (!manager_handle) return NULL; THPObjectPtr storage_handle(PyBytes_FromString(ctx->filename())); if (!storage_handle) return NULL; - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); if (!size) return NULL; THPObjectPtr tuple(PyTuple_New(3)); @@ -158,7 +158,7 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) if ((ctx = THMapAllocator::fromDataPtr(storage->data_ptr()))) { // done } else { - THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->size())); + THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->numel())); THWStorage_(copy)(new_storage, storage); THWStorage_(swap)(storage, new_storage); ctx = THMapAllocator::fromDataPtr(storage->data_ptr()); @@ -167,7 +167,7 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) THPObjectPtr storage_handle(PyLong_FromLong(ctx->fd())); if (!storage_handle) return NULL; - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); if (!size) return NULL; THPObjectPtr tuple(PyTuple_New(2)); @@ -220,7 +220,7 @@ static PyObject * 
THPStorage_(shareCuda)(THPStorage *self) THPObjectPtr device(PyLong_FromLong(storage->device().index())); THPObjectPtr _handle(Py_None); Py_INCREF(Py_None); - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); THPObjectPtr _offset(PyLong_FromLong(0)); if (THWStorage_(data)(LIBRARY_STATE storage)) { size_t base_size; diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index f51a735acea1b5..d7876411c687a6 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -13,7 +13,9 @@ Value* insertConstant( Node * n = g.create(prim::Constant); if(val.isTensor()) { at::Tensor ref = std::move(val).toTensor(); - JIT_ASSERT(ref.defined()); + if(!ref.defined()) { + throw constant_not_supported_error("undefined tensors cannot become constants"); + } n->output()->inferTypeFrom(ref); // note: before t_ because of std::move(ref) n->t_(attr::value, std::move(ref)); } else if(val.isInt()) { diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index 6855002d4fd9cb..bfd8ec9b9f1764 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -31,6 +31,10 @@ std::unordered_set skip_list = { aten::randn_like, aten::randperm, aten::randperm_out, + prim::Constant, + prim::Undefined, + // TODO (zach): we should consider skipping tensor factories in the cases + // where the constant tensor would be large but cheap to create. }; std::vector runNode(Node* n) { @@ -40,9 +44,14 @@ std::vector runNode(Node* n) { stack.push_back(*(toIValue(input))); } op(stack); - auto var_outputs = fmap(stack, [&](IValue v) { + auto var_outputs = fmap(stack, [&](IValue v) -> IValue { if (v.isTensor()) { - return IValue(autograd::as_variable_ref(v.toTensor()).data()); + auto t = std::move(v).toTensor(); + if(t.defined()) { + return IValue(autograd::as_variable_ref(t).data()); + } else { + return t; + } } else { return v; } @@ -119,11 +128,11 @@ bool removeExtraNodeOutputs(Node *n) { } // anonymous namespace void ConstantPropagation(Node* n, bool recurse) { - bool constant_inputs = (n->inputs().size() > 0) && - std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - bool supported_node = skip_list.count(n->kind()) == 0; + bool constant_inputs = + std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + bool supported_node = !n->kind().is_onnx() && skip_list.count(n->kind()) == 0; auto run_blocks = [&]() { if (recurse) { for (Block * block : n->blocks()) { @@ -150,7 +159,6 @@ void ConstantPropagation(Node* n, bool recurse) { } void ConstantPropagation(Block* block, bool recurse) { - ConstantPropagation(block->param_node(), recurse); for(auto it = block->nodes().begin(); it != block->nodes().end();) { Node *n = *it; it++; //advance iterator bc the current node may be destroyed diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index d16d4b00f07e91..d685584a4045be 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -443,14 +443,29 @@ void initPythonIRBindings(PyObject * module_) { switch(t->kind()) { case TypeKind::DynamicType: return "DynamicType"; + case TypeKind::TensorType: + return "TensorType"; + case TypeKind::NumberType: + return "NumberType"; + case TypeKind::NoneType: + return "NoneType"; case TypeKind::CompleteTensorType: return 
"CompleteTensorType"; case TypeKind::TupleType: return "TupleType"; - default: - AT_ERROR("unknown type kind"); - return ""; + case TypeKind::ListType: + return "ListType"; + case TypeKind::IntType: + return "IntType"; + case TypeKind::FloatType: + return "FloatType"; + case TypeKind::StringType: + return "StringType"; + case TypeKind::GeneratorType: + return "GeneratorType"; } + // not reachable, but some compilers complain + AT_ERROR("Unknown Type Kind"); }) .def("sizes",[](Type& t) { return t.expect()->sizes(); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 5bc7bd574cf766..fee8924277d11e 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -48,6 +48,16 @@ void addInputs(Node *n, const char * name, at::TensorList value) { n->addInput(list_node->output()); } +void addInputs(Node* n, const char * name, const at::TensorOptions& options) { + // [TensorOptions in script] - update this when you change how we schematize TensorOptions + detail::genericAddInput(n, static_cast(options.dtype())); + detail::genericAddInput(n, static_cast(options.layout())); + std::vector device = { + static_cast(options.device().type()), + static_cast(options.device().index())}; + detail::genericAddInput(n, std::move(device)); +} + void addInputs(Node *n, const char * name, at::IntList value) { using ArgumentStash = jit::tracer::ArgumentStash; std::vector info = ArgumentStash::hasIntList(name) ? diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 789b3fd2d4591c..b811534ce27401 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -229,16 +229,17 @@ inline void abandon() { // NB: those serve both as an intermediate steps in addInputs below, // as well as the overloads that terminate template recursion -void addInputs(Node *n, const char * name, int64_t value); -void addInputs(Node *n, const char * name, bool value); -void addInputs(Node *n, const char * name, double value); -void addInputs(Node *n, const char * name, const at::Scalar& value); -void addInputs(Node *n, const char * name, const at::Tensor& value); -void addInputs(Node *n, const char * name, at::IntList value); -void addInputs(Node *n, const char * name, at::TensorList value); -void addInputs(Node *n, const char * name, const ArrayRef& value); -void addInputs(Node *n, const char * name, const std::string& value); -void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); +TORCH_API void addInputs(Node *n, const char * name, int64_t value); +TORCH_API void addInputs(Node *n, const char * name, bool value); +TORCH_API void addInputs(Node *n, const char * name, double value); +TORCH_API void addInputs(Node *n, const char * name, const at::Scalar& value); +TORCH_API void addInputs(Node *n, const char * name, const at::Tensor& value); +TORCH_API void addInputs(Node *n, const char * name, at::IntList value); +TORCH_API void addInputs(Node *n, const char * name, at::TensorList value); +TORCH_API void addInputs(Node *n, const char * name, const ArrayRef& value); +TORCH_API void addInputs(Node *n, const char * name, const std::string& value); +TORCH_API void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); +TORCH_API void addInputs(Node *n, const char * name, const at::TensorOptions& value); template void addInputs(Node *n, const char * name, std::array value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index c7e33fae7e20ac..e5a3e64ac067d8 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -51,6 +51,8 
@@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "None"; } else if(t.kind() == TypeKind::StringType) { out << "string"; + } else if(t.kind() == TypeKind::GeneratorType) { + out << "Generator"; } else { AT_ERROR("unknown type kind"); } diff --git a/torch/distributed/c10d/__init__.py b/torch/distributed/c10d/__init__.py index 3b98424e891479..5356097743aa3c 100644 --- a/torch/distributed/c10d/__init__.py +++ b/torch/distributed/c10d/__init__.py @@ -6,20 +6,8 @@ def is_available(): if is_available() and not torch._C._c10d_init(): - raise RuntimeError("c10d initialization failed") + raise RuntimeError("Failed to initialize PyTorch distributed support") if is_available(): - from .rendezvous import rendezvous, register_rendezvous_handler - from . import BroadcastOptions, AllreduceOptions - - DEFAULT_REDUCE_OPTIONS = AllreduceOptions() - - def broadcast(tensor, src, process_group): - opts = BroadcastOptions() - opts.rootRank = src - opts.rootTensor = 0 - return process_group.broadcast([tensor], opts) - - def all_reduce(tensor, process_group, opts=DEFAULT_REDUCE_OPTIONS): - return process_group.allreduce([tensor], opts) + from .distributed_c10d import * diff --git a/torch/distributed/c10d/distributed_c10d.py b/torch/distributed/c10d/distributed_c10d.py new file mode 100644 index 00000000000000..dc341f99427552 --- /dev/null +++ b/torch/distributed/c10d/distributed_c10d.py @@ -0,0 +1,1054 @@ +import torch + +from .rendezvous import rendezvous, register_rendezvous_handler +from . import BroadcastOptions, AllreduceOptions, ReduceOptions, \ + ScatterOptions, GatherOptions +from . import ReduceOp as reduce_op +from . import PrefixStore +from . import ProcessGroupGloo + + +_MPI_AVAILBLE = True +_NCCL_AVAILBLE = True + + +try: + from. import ProcessGroupMPI +except ImportError: + _MPI_AVAILBLE = False + +try: + from. 
import ProcessGroupNCCL +except ImportError: + _NCCL_AVAILBLE = False + + +class DistBackend: + UNDEFINED = -1 + GLOO = 0 + NCCL = 2 + MPI = 3 + + +class group(object): + WORLD = object() + + +class GroupMember(object): + # Alias to group.WORLD for backward compatibility + WORLD = group.WORLD + NON_GROUP_MEMBER = object() + + +# Cached process groups, map from ProcessGroup to (DistBackend, Store) +_pg_map = {} +# Process group's names, map from ProcessGroup to str +_pg_names = {} +# Process group's global rank to local rank mapping +_pg_group_ranks = {} + +# Default process group state +_default_pg = None +_default_pg_init_method = None + +# Process group count for default naming +_group_count = 0 + + +def _rank_not_in_group(group): + """ + Helper that checks if the current process's rank is not in a given group + + """ + return group == GroupMember.NON_GROUP_MEMBER + + +def _get_group_rank(group, rank): + """ + Helper that gets a given group's local rank in the group from a given global + rank + + """ + if group is GroupMember.WORLD: + raise RuntimeError("group.WORLD does not have local rank to global " + "rank mapping") + group_rank = _pg_group_ranks[group][rank] + if group_rank is None: + raise RuntimeError("The global rank is not part of the group") + return group_rank + + +def _get_global_rank(group, group_rank): + """ + Helper that gets a given group's global rank from a given local rank in the + group + + """ + if group is GroupMember.WORLD: + raise RuntimeError("group.WORLD does not have local rank to global " + "rank mapping") + group_rank_map = _pg_group_ranks[group] + for rank, grp_rank in group_rank_map.items(): + if grp_rank == group_rank: + return rank + raise RuntimeError("The group rank is not part of the group") + + +def _check_default_pg(): + """ + Helper that checks if the default ProcessGroup has been initialized, with + assertion + + """ + assert _default_pg is not None, \ + "Default process group is not initialized" + + +def is_mpi_available(): + """ + Checks if MPI is available + + """ + return _MPI_AVAILBLE + + +def is_nccl_available(): + """ + Checks if NCCL is available + + """ + return _NCCL_AVAILBLE + + +def is_initialized(): + """ + Checks if the default process group has been initialized + + """ + return _default_pg is not None + + +def get_default_group(): + """ + Returns the default process group created by init_process_group + + """ + if not is_initialized(): + raise RuntimeError("Default process group has not been initialized, " + "please make sure to call init_process_group.") + return _default_pg + + +def init_process_group(backend, + init_method="env://", + **kwargs): + """ + Initializes the default distributed process group. This will also + initialize the distributed package + + Arguments: + backend (str): Name of the backend to use. Depending on build-time + configuration, valid values include + ``mpi``, ``gloo``, and ``nccl``. + init_method (str, optional): URL specifying how to initialize the + process group. + world_size (int, optional): Number of processes participating in + the job. + rank (int, optional): Rank of the current process. + group_name (str, optional, deprecated): Group name. + + To enable ``backend == mpi``, PyTorch needs to be built from source on + a system that supports MPI. The same applies to NCCL as well.
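+
+    Example (illustrative sketch; it assumes a ``tcp://`` rendezvous handler
+    is registered and that a second process makes the matching call with
+    ``rank=1``)::
+
+        >>> import torch.distributed.c10d as c10d
+        >>> c10d.init_process_group(backend="gloo",
+        ...                         init_method="tcp://127.0.0.1:23456",
+        ...                         world_size=2,
+        ...                         rank=0)
+        >>> c10d.get_rank(), c10d.get_world_size()
+        (0, 2)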
+ + """ + global _pg_map + global _pg_names + global _default_pg + global _default_pg_init_method + + if _default_pg is not None: + raise RuntimeError("trying to initialize the default process group " + "twice!") + + world_size = kwargs.pop('world_size', -1) + group_name = kwargs.pop('group_name', '') + rank = kwargs.pop('rank', -1) + assert len(kwargs) == 0, \ + "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) + + if backend == "mpi": + if not is_mpi_available(): + raise RuntimeError("Distributed package doesn't have MPI built in") + + _default_pg = ProcessGroupMPI() + _pg_map[_default_pg] = (DistBackend.MPI, None) + else: + # backward compatible API + if init_method != "env://" and world_size != -1 and rank != -1: + url = "{}?rank={}&world_size={}".format(init_method, + rank, + world_size) + store, _, _ = next(rendezvous(url)) + else: + store, rank, world_size = next(rendezvous(init_method)) + + if backend == "gloo": + _default_pg = ProcessGroupGloo(store, rank, world_size) + _pg_map[_default_pg] = (DistBackend.GLOO, store) + _pg_names[_default_pg] = group_name + elif backend == "nccl": + if not is_nccl_available(): + raise RuntimeError("Distributed package doesn't have NCCL " + "built in") + _default_pg = ProcessGroupNCCL(store, rank, world_size) + _pg_map[_default_pg] = (DistBackend.NCCL, store) + _pg_names[_default_pg] = group_name + else: + raise RuntimeError("Invalid distributed backend name: " + backend) + + _default_pg_init_method = init_method + + +def _new_process_group_helper(world_size, rank, group_name=""): + """ + Create a new distributed process group. And the new process group can be + used to perform collective operations. + + """ + global _pg_map + global _group_count + global _pg_names + + if not group_name: + group_name = str(_group_count) + _group_count += 1 + + if group_name in _pg_names.values(): + raise RuntimeError("The specified group name has already been " + "created, please use a different group name") + + default_backend, default_store = _pg_map[_default_pg] + + # Create the prefix store + store = PrefixStore(group_name, default_store) + + if default_backend == DistBackend.GLOO: + pg = ProcessGroupGloo(store, rank, world_size) + _pg_map[pg] = (DistBackend.GLOO, store, group_name) + _pg_names[_default_pg] = group_name + elif default_backend == DistBackend.NCCL: + if not is_nccl_available(): + raise RuntimeError("Distributed package doesn't have NCCL " + "built in") + pg = ProcessGroupNCCL(store, rank, world_size) + _pg_map[pg] = (DistBackend.NCCL, store, group_name) + _pg_names[_default_pg] = group_name + else: + raise RuntimeError("Unsupported distributed backend by group") + return pg + + +def destroy_process_group(group=group.WORLD): + """ + Destroy a given process group, and deinitialize the distributed package + + Arguments: + group (ProcessGroup, optional): The process group to be destroyed, if + group.WORLD is given, all process + groups including the default one will + be destroyed. 
+ """ + if _rank_not_in_group(group): + return + + global _pg_map + global _pg_names + global _pg_group_ranks + global _default_pg + global _default_pg_init_method + + if group == GroupMember.WORLD: + pg = _default_pg + + if _pg_map.get(pg, None) is None: + raise RuntimeError("Invalid process group specified") + + if group == GroupMember.WORLD: + _default_pg = None + _default_pg_init_method = None + _pg_map.clear() + _pg_names.clear() + _pg_group_ranks.clear() + else: + del _pg_map[pg] + del _pg_names[pg] + del _pg_group_ranks[pg] + + +def get_rank(group=group.WORLD): + """ + Returns the rank of currrent process group + + Rank is a unique identifier assigned to each process within a distributed + process group. They are always consecutive integers ranging from 0 to + ``world_size``. + + Arguments: + group (ProcessGroup, optional): The process group to work on + + Returns: + The rank of the process group + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.rank() + + return group.rank() + + +def get_world_size(group=group.WORLD): + """ + Returns the number of processes in the current process group + + Arguments: + group (ProcessGroup, optional): The process group to work on + + Returns: + The world size of the process group + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.size() + + return group.size() + + +def isend(tensor, + dst, + group=group.WORLD): + """ + Sends a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + group (ProcessGroup, optional): The process group to work on + + Returns: + A distributed request object. + None, if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.send([tensor], dst) + else: + group_dst_rank = _get_group_rank(group, dst) + return group.send([tensor], group_dst_rank) + + +def irecv(tensor, + src, + group=group.WORLD): + """ + Receives a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + + Returns: + A distributed request object. + None, if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.recv([tensor], src) + else: + group_src_rank = _get_group_rank(group, src) + return group.recv([tensor], group_src_rank) + + +def send(tensor, + dst, + group=group.WORLD): + """ + Sends a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + group (ProcessGroup, optional): The process group to work on + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + _default_pg.send([tensor], dst).wait() + else: + group_dst_rank = _get_group_rank(group, dst) + group.send([tensor], group_dst_rank).wait() + + +def recv(tensor, + src=None, + group=group.WORLD): + """ + Receives a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int, optional): Source rank. Will receive from any + process if unspecified. 
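The point-to-point wrappers above return c10d request objects; a sketch of how they might be used, assuming the two-process default group from the earlier example:

import torch
import torch.distributed.c10d as c10d

if c10d.get_rank() == 0:
    t = torch.arange(4.0)
    req = c10d.isend(t, dst=1)   # returns immediately with a request object
else:
    t = torch.zeros(4)
    req = c10d.irecv(t, src=0)
req.wait()                        # block until the transfer completes
# The blocking variants send()/recv() call wait() internally.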
+ group (ProcessGroup, optional): The process group to work on + + Returns: + Sender rank + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + pg = _default_pg + else: + pg = group + + if src is None: + rank_tensor = torch.IntTensor([-1]) + pg.recv_anysource([tensor], rank_tensor).wait() + src_rank = rank_tensor[0].item() + if group == GroupMember.WORLD: + return src_rank + else: + return _get_global_rank(pg, src_rank) + else: + if group == GroupMember.WORLD: + pg.recv([tensor], src).wait() + else: + group_src_rank = _get_group_rank(pg, src) + pg.recv([tensor], group_src_rank).wait() + return src + + +def broadcast_multigpu(tensor_list, + src, + group=group.WORLD, + async_op=False, + src_tensor=0): + """ + Broadcasts the tensor to the whole group with multiple GPU tensors + per node. + + ``tensor`` must have the same number of elements in all the GPUs from + all processes participating in the collective. each tensor in the list must + be on a different GPU + + Only nccl and gloo backend are currently supported + tensors should only be GPU tensors + + Arguments: + tensor_list (List[Tensor]): Tensors that participate in the collective + operation. if ``src`` is the rank, then ``src_tensor``th element of + ``tensor_list`` (``tensor_list[src_tensor]``) will be broadcasted + to all other tensors (on different GPUs) in the src process and + all tensors in ``tensor_list`` of other non-src processes. + You also need to make sure that ``len(tensor_list)`` is the same + for all the distributed processes calling this function. + + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + src_tensor (int, optional): Source tensor rank within ``tensor_list`` + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = src_tensor + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.broadcast(tensor_list, opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.broadcast(tensor_list, opts) + if async_op: + return work + else: + work.wait() + + +def broadcast(tensor, + src, + group=group.WORLD, + async_op=False): + """ + Broadcasts the tensor to the whole group. + + ``tensor`` must have the same number of elements in all processes + participating in the collective. + + Arguments: + tensor (Tensor): Data to be sent if ``src`` is the rank of current + process, and tensor to be used to save received data otherwise. + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. 
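Worth noting: when ``src`` is omitted, ``recv`` reports which rank sent the data. A sketch under the same two-process assumption:

import torch
import torch.distributed.c10d as c10d

if c10d.get_rank() == 1:
    c10d.send(torch.ones(3), dst=0)
else:
    buf = torch.zeros(3)
    sender = c10d.recv(buf)               # src=None: accept from any rank
    print("received from rank", sender)   # expected: received from rank 1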
+ None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = 0 + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.broadcast([tensor], opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.broadcast([tensor], opts) + if async_op: + return work + else: + work.wait() + + +def all_reduce_multigpu(tensor_list, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines in such a way that all get + the final result. This function reduces a number of tensors on every node, + while each tensor resides on different GPUs. + Therefore, the input tensor in the tensor list needs to be GPU tensors. + Also, each tensor in the tensor list needs to reside on a different GPU. + + After the call, all ``tensor`` in ``tensor_list`` is going to be bitwise + identical in all processes. + + Only nccl and gloo backend is currently supported + tensors should only be GPU tensors + + Arguments: + tensor list (List[Tensor]): List of input and output tensors of + the collective. The function operates in-place and requires that + each tensor to be a GPU tensor on different GPUs. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = AllreduceOptions() + opts.reduceOp = op + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allreduce(tensor_list, opts) + else: + work = group.allreduce(tensor_list, opts) + + if async_op: + return work + else: + work.wait() + + +def all_reduce(tensor, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines in such a way that all get + the final result. + + After the call ``tensor`` is going to be bitwise identical in all processes. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = AllreduceOptions() + opts.reduceOp = op + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allreduce([tensor], opts) + else: + work = group.allreduce([tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def reduce_multigpu(tensor_list, + dst, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False, + dst_tensor=0): + """ + Reduces the tensor data on multiple GPUs across all machines. 
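A sketch of the single-tensor ``all_reduce`` wrapper above, in both blocking and async form (two-process group assumed):

import torch
import torch.distributed.c10d as c10d

t = torch.ones(2) * (c10d.get_rank() + 1)   # rank 0: [1., 1.], rank 1: [2., 2.]
c10d.all_reduce(t)                           # default op is reduce_op.SUM
# t is now [3., 3.] on every rank

work = c10d.all_reduce(t, op=c10d.reduce_op.SUM, async_op=True)
work.wait()                                  # the async form returns a work handle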
Each tensor + in ``tensor_list`` should reside on a separate GPU + + Only the GPU of ``tensor_list[dst_tensor]`` on the process with rank ``dst`` + is going to receive the final result. + + Only nccl backend is currently supported + tensors should only be GPU tensors + + Arguments: + tensor_list (List[Tensor]): Input and output GPU tensors of the + collective. The function operates in-place. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + dst (int): Destination rank + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + dst_tensor (int, optional): Destination tensor rank within + ``tensor_list`` + + Returns: + Async work handle, if async_op is set to True. + None, otherwise + + """ + if _rank_not_in_group(group): + return + + opts = ReduceOptions() + opts.reduceOp = op + opts.rootRank = dst + opts.rootTensor = dst_tensor + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.reduce(tensor_list, opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.reduce(tensor_list, opts) + + if async_op: + return work + else: + work.wait() + + +def reduce(tensor, + dst, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines. + + Only the process with rank ``dst`` is going to receive the final result. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + dst (int): Destination rank + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = ReduceOptions() + opts.reduceOp = op + opts.rootRank = dst + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.reduce([tensor], opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.reduce([tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def all_gather_multigpu(output_tensor_lists, + input_tensor_list, + group=group.WORLD, + async_op=False): + """ + Gathers tensors from the whole group in a list. + Each tensor in ``tensor_list`` should reside on a separate GPU + + Only nccl backend is currently supported + tensors should only be GPU tensors + + Arguments: + output_tensor_lists (List[List[Tensor]]): Output lists. It should + contain correctly-sized tensors on each GPU to be used for output of + the collective. + e.g. ``output_tensor_lists[i]`` contains the all_gather + result that resides on the GPU of ``input_tensor_list[i]``. + Note that each element of ``output_tensor_lists[i]`` has the size of + ``world_size * len(input_tensor_list)``, since the function all + gathers the result from every single GPU in the group. 
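And a sketch of the rooted single-tensor ``reduce`` defined above; only the destination rank is guaranteed to hold the reduced value (two-process group assumed):

import torch
import torch.distributed.c10d as c10d

t = torch.ones(2) * (c10d.get_rank() + 1)
c10d.reduce(t, dst=0)     # default op is reduce_op.SUM
# rank 0 now holds [3., 3.]; the contents of t on other ranks are backend-defined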
To interpret
+        each element of ``output_tensor_list[i]``, note that
+        ``input_tensor_list[j]`` of rank k will appear in
+        ``output_tensor_list[i][rank * world_size + j]``
+        Also note that ``len(output_tensor_lists)``, and the size of each
+        element in ``output_tensor_lists`` (each element is a list,
+        therefore ``len(output_tensor_lists[i])``) need to be the same
+        for all the distributed processes calling this function.
+
+        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
+            be broadcast from current process.
+            Note that ``len(input_tensor_list)`` needs to be the same for
+            all the distributed processes calling this function.
+
+        group (ProcessGroup, optional): The process group to work on
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    """
+    if _rank_not_in_group(group):
+        return
+
+    if group == GroupMember.WORLD:
+        _check_default_pg()
+        work = _default_pg.allgather(output_tensor_lists, input_tensor_list)
+    else:
+        work = group.allgather(output_tensor_lists, input_tensor_list)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+def all_gather(tensor_list,
+               tensor,
+               group=group.WORLD,
+               async_op=False):
+    """
+    Gathers tensors from the whole group in a list.
+
+    Arguments:
+        tensor_list (list[Tensor]): Output list. It should contain
+            correctly-sized tensors to be used for output of the collective.
+        tensor (Tensor): Tensor to be broadcast from current process.
+        group (ProcessGroup, optional): The process group to work on
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    """
+    if _rank_not_in_group(group):
+        return
+
+    if group == GroupMember.WORLD:
+        _check_default_pg()
+        work = _default_pg.allgather([tensor_list], [tensor])
+    else:
+        work = group.allgather([tensor_list], [tensor])
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+def gather(tensor,
+           gather_list,
+           dst,
+           group=group.WORLD,
+           async_op=False):
+    """
+    Gathers a list of tensors in a single process.
+
+    Arguments:
+        tensor (Tensor): Input tensor.
+        gather_list (list[Tensor]): List of appropriately-sized tensors to
+            use for received data. Required only in the receiving process.
+        dst (int): Destination rank. Required in all processes except the one
+            that is receiving the data.
+        group (ProcessGroup, optional): The process group to work on
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
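A sketch of the single-tensor ``all_gather`` wrapper above; every rank passes a correctly sized output list (two-process group assumed):

import torch
import torch.distributed.c10d as c10d

world_size = c10d.get_world_size()
gathered = [torch.zeros(2) for _ in range(world_size)]
mine = torch.ones(2) * c10d.get_rank()
c10d.all_gather(gathered, mine)
# on every rank: gathered == [tensor([0., 0.]), tensor([1., 1.])]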
+ None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + if dst == my_rank: + if gather_list is None: + raise RuntimeError("gather_list is a required argument in gather " + "destination") + else: + if gather_list: + raise RuntimeError("non-empty gather_list can be given only " + "to gather destination") + + opts = GatherOptions() + opts.rootRank = dst + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.gather([gather_list], [tensor], opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.gather([gather_list], [tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def scatter(tensor, + scatter_list, + src, + group=group.WORLD, + async_op=False): + """ + Scatters a list of tensors to all processes in a group. + + Each process will receive exactly one tensor and store its data in the + ``tensor`` argument. + + Arguments: + tensor (Tensor): Output tensor. + scatter_list (list[Tensor]): List of tensors to scatter. Required only + in the process that is sending the data. + src (int): Source rank. Required in all processes except the one that + is sending the data. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + if src == my_rank: + if scatter_list is None: + raise RuntimeError("scatter_list is a required argument in " + "scatter source") + else: + if scatter_list: + raise RuntimeError("non-empty can be given only to scatter " + "source") + + opts = ScatterOptions() + opts.rootRank = src + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.scatter([tensor], [scatter_list], opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.scatter([tensor], [scatter_list], opts) + + if async_op: + return work + else: + work.wait() + + +def barrier(group=group.WORLD, + async_op=False): + """ + Synchronizes all processes. + + This collective blocks processes until the whole group enters this function, + if async_op is False, or if async work handle is called on wait(). + + Arguments: + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.barrier() + else: + work = group.barrier() + + if async_op: + return work + else: + work.wait() + + +def new_group(ranks=None): + """ + Creates a new distributed group. + + This function requires that all processes in the main group (i.e. all + processes that are part of the distributed job) enter this function, even + if they are not going to be members of the group. Additionally, groups + should be created in the same order in all processes. + + Arguments: + ranks (list[int]): List of ranks of group members. + + Returns: + A handle of distributed group that can be given to collective calls. 
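A sketch of how ``new_group`` composes with the collectives above (hypothetical four-process job; every rank must execute the call, even ranks outside ``ranks``):

import torch
import torch.distributed.c10d as c10d

subgroup = c10d.new_group(ranks=[0, 1])   # all ranks call this
t = torch.ones(1)
c10d.all_reduce(t, group=subgroup)
# ranks outside [0, 1] receive GroupMember.NON_GROUP_MEMBER and the collective
# returns immediately for them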
+ """ + + _check_default_pg() + + global _pg_group_ranks + + default_backend, _ = _pg_map[_default_pg] + if default_backend == DistBackend.MPI: + raise RuntimeError("Only NCCL and Gloo backend currently support " + "new_group function") + + global_rank = _default_pg.rank() + global_world_size = _default_pg.size() + + # checks the input ranks + if ranks is not None: + group_world_size = len(ranks) + if group_world_size > global_world_size: + raise RuntimeError("the new group's world size should be less or " + "equal to the world size set by " + "init_process_group") + # check ranks' sanity + for rank in ranks: + if rank < 0 or rank >= global_world_size: + raise RuntimeError("The new group's rank should be within the " + "the world_size set by init_process_group") + + if global_rank in ranks: + group_rank = ranks.index(global_rank) + else: + group_rank = None + else: + group_world_size = global_world_size + group_rank = global_rank + + # Release ranks not in the group + if global_rank not in ranks: + return GroupMember.NON_GROUP_MEMBER + + pg = _new_process_group_helper(group_world_size, group_rank) + + # Create the global rank to group rank mapping + _pg_group_ranks[pg] = {} + for rank in range(global_world_size): + if rank in ranks: + _pg_group_ranks[pg][rank] = ranks.index(rank) + else: + _pg_group_ranks[pg][rank] = None + + return pg + + +# TODO: delete these functions and replace DDP with public functions +DEFAULT_REDUCE_OPTIONS = AllreduceOptions() + + +def _broadcast(tensor, src, process_group): + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = 0 + return process_group.broadcast([tensor], opts) + + +def _all_reduce(tensor, process_group, opts=DEFAULT_REDUCE_OPTIONS): + return process_group.allreduce([tensor], opts) diff --git a/torch/distributed/c10d/rendezvous.py b/torch/distributed/c10d/rendezvous.py index 062443f87abfec..30c9f2dfe7dd3b 100644 --- a/torch/distributed/c10d/rendezvous.py +++ b/torch/distributed/c10d/rendezvous.py @@ -3,6 +3,7 @@ except ImportError: from urlparse import urlparse +import os from . 
import FileStore, TCPStore @@ -59,13 +60,13 @@ def _error(msg): query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) if "rank" not in query: raise _error("rank parameter missing") - if "size" not in query: - raise _error("size parameter missing") + if "world_size" not in query: + raise _error("world size parameter missing") rank = int(query["rank"]) - size = int(query["size"]) + world_size = int(query["world_size"]) store = FileStore(path) - yield (store, rank, size) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using file:// method") @@ -81,18 +82,52 @@ def _error(msg): query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) if "rank" not in query: raise _error("rank parameter missing") - if "size" not in query: - raise _error("size parameter missing") + if "world_size" not in query: + raise _error("world size parameter missing") rank = int(query["rank"]) - size = int(query["size"]) + world_size = int(query["world_size"]) start_daemon = rank == 0 store = TCPStore(result.hostname, result.port, start_daemon) - yield (store, rank, size) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using tcp:// method") +def _env_rendezvous_handler(url): + def _error(msg): + return ValueError("env:// rendezvous: " + msg) + + if url != "env://": + raise _error("Only `env://` is expected for the env init method") + world_size = os.environ["WORLD_SIZE"] + if world_size is None: + raise _error("world size is missing") + rank = os.environ["RANK"] + if rank is None: + raise _error("rank is missing") + master_addr = os.environ["MASTER_ADDR"] + if master_addr is None: + raise _error("master addr is missing") + master_port = os.environ["MASTER_PORT"] + if master_port is None: + raise _error("master port is missing") + + # Converting before creating the store + rank = int(rank) + world_size = int(world_size) + master_port = int(master_port) + + # Now start the TCP store daemon on the rank 0 + start_daemon = rank == 0 + store = TCPStore(master_addr, master_port, start_daemon) + yield (store, rank, world_size) + + # If this configuration is invalidated, there is nothing we can do about it + raise RuntimeError("Unable to perform rerendezvous using env:// method") + + register_rendezvous_handler("file", _file_rendezvous_handler) register_rendezvous_handler("tcp", _tcp_rendezvous_handler) +register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/functional.py b/torch/functional.py index 055141b7469a20..8c78b6efe9f80f 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -389,7 +389,7 @@ def isnan(tensor): return tensor != tensor -def unique(input, sorted=False, return_inverse=False): +def unique(input, sorted=False, return_inverse=False, dim=None): r"""Returns the unique scalar elements of the input tensor as a 1-D tensor. 
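The new ``env://`` handler reads four environment variables; a launcher-style sketch with placeholder address and port:

import os
import torch.distributed.c10d as c10d

# Normally exported by the process launcher; set inline here for illustration.
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "29500"
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "2"

c10d.init_process_group(backend="gloo")   # init_method defaults to "env://"

One observation on the handler itself: ``os.environ[...]`` raises ``KeyError`` when a variable is missing, so the ``is None`` checks never fire; ``os.environ.get(...)`` would be needed for the intended error messages to surface.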
Arguments: @@ -431,11 +431,19 @@ def unique(input, sorted=False, return_inverse=False): [ 1, 2]]) """ - output, inverse_indices = torch._unique( - input, - sorted=sorted, - return_inverse=return_inverse, - ) + if dim is not None: + output, inverse_indices = torch._unique_dim( + input, + dim, + sorted=sorted, + return_inverse=return_inverse + ) + else: + output, inverse_indices = torch._unique( + input, + sorted=sorted, + return_inverse=return_inverse, + ) if return_inverse: return output, inverse_indices else: diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 30904ac7adff7d..551a17565e1763 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -21,6 +21,25 @@ import collections import re + +def _parse_env(name, default, true_message, false_message): + value = os.environ.get(name) + if value is None: + return default + if value.lower() in {'1', 'true', 'yes'}: + return True + elif value.lower() in {'0', 'false', 'no'}: + return False + if value == '1v': + print(true_message) + return True + elif value == '0v': + print(false_message) + return False + raise ValueError('Unknown setting of {}. Try using 0 or 1.'.format(name)) + + +_enabled = _parse_env('PYTORCH_JIT', True, "> Using PyTorch JIT", "> PyTorch JIT DISABLED") _flatten = torch._C._jit_flatten _unflatten = torch._C._jit_unflatten _jit_script_compile = torch._C._jit_script_compile @@ -431,6 +450,8 @@ def trace(*args, **kwargs): ... return x * 2 """ def wrapper(func): + if not _enabled: + return func executor_options = {'optimize': True} for name in executor_options: executor_options[name] = kwargs.pop(name, executor_options[name]) @@ -509,6 +530,8 @@ def __getattr__(self, attr): def script(fn, optimize=True, _frames_up=0): + if not _enabled: + return fn rcb = createResolutionCallback(_frames_up + 1) ast = get_jit_ast(fn, is_method=False) graph = _jit_script_compile(ast, rcb) @@ -528,6 +551,8 @@ def script(fn, optimize=True, _frames_up=0): def script_method(fn): + if not _enabled: + return fn # NOTE: we need to traverse two frames here because the meta-class frame # for ScriptModule will be present, as opposed to invoking @script on a # a function or invoking define() on a CompilationUnit. 
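The new ``dim`` argument routes to ``torch._unique_dim``, which is implemented elsewhere in this change; a usage sketch, assuming that kernel deduplicates slices along ``dim``:

import torch

x = torch.tensor([[1, 3],
                  [2, 3],
                  [1, 3]])
print(torch.unique(x, sorted=True, dim=0))
# expected: tensor([[1, 3],
#                   [2, 3]])   (duplicate rows removed)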
@@ -547,6 +572,8 @@ def script_method(fn): def batch(batch_size=1, optimize=True, _frames_up=0): def decorator(fn): + if not _enabled: + return fn import torch.jit.batchop mod = script(fn, optimize, _frames_up) res_graph = torch.to_batch_graph(mod.graph) @@ -757,57 +784,60 @@ def init_then_register(self, *args, **kwargs): return super(ScriptMeta, cls).__init__(name, bases, attrs) -class ScriptModule(with_metaclass(ScriptMeta, torch._C.ScriptModule, Module)): - def __init__(self, optimize=True): - # must be before Module.init since the field is used in __getattr__ - Module.__init__(self) - self._set_optimized(optimize) - self._parameters = OrderedParameterDict(self) - self._buffers = OrderedBufferDict(self) - self._modules = OrderedModuleDict(self) - - def __getattr__(self, attr): - if self._has_method(attr): - if attr in self.__class__._original_methods: - original_method = self.__class__._original_methods[attr] - script_method = self._get_method(attr) - return functools.wraps(original_method)(script_method) +if _enabled: + class ScriptModule(with_metaclass(ScriptMeta, torch._C.ScriptModule, Module)): + def __init__(self, optimize=True): + # must be before Module.init since the field is used in __getattr__ + Module.__init__(self) + self._set_optimized(optimize) + self._parameters = OrderedParameterDict(self) + self._buffers = OrderedBufferDict(self) + self._modules = OrderedModuleDict(self) + + def __getattr__(self, attr): + if self._has_method(attr): + if attr in self.__class__._original_methods: + original_method = self.__class__._original_methods[attr] + script_method = self._get_method(attr) + return functools.wraps(original_method)(script_method) + else: + return self._get_method(attr) + if attr == 'graph' and self._has_method('forward'): + return self.__getattr__('forward').graph + return Module.__getattr__(self, attr) + + def __setattr__(self, attr, value): + if attr not in self._constants_set: + return super(ScriptModule, self).__setattr__(attr, value) + if hasattr(self, attr): + raise RuntimeError("attempting to re-assign constant '{}'".format(attr)) + if isinstance(value, ModuleList): + # special case for list of modules. Modules need to be registered with their + # parent module. To do this, we create a ConstModuleList, which is itself a module, that + # contains each of these modules as submodules. The ConstModuleList then + # is set as an attribute of the parent module. + super(ScriptModule, self).__setattr__(attr, _ConstModuleList(value)) + elif isinstance(value, Sequential): + super(ScriptModule, self).__setattr__(attr, _ConstSequential(value)) else: - return self._get_method(attr) - if attr == 'graph' and self._has_method('forward'): - return self.__getattr__('forward').graph - return Module.__getattr__(self, attr) - - def __setattr__(self, attr, value): - if attr not in self._constants_set: - return super(ScriptModule, self).__setattr__(attr, value) - if hasattr(self, attr): - raise RuntimeError("attempting to re-assign constant '{}'".format(attr)) - if isinstance(value, ModuleList): - # special case for list of modules. Modules need to be registered with their - # parent module. To do this, we create a ConstModuleList, which is itself a module, that - # contains each of these modules as submodules. The ConstModuleList then - # is set as an attribute of the parent module. 
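The effect of the new ``PYTORCH_JIT`` switch as wired up above: with the flag off, ``torch.jit.script``/``trace`` become identity decorators and ``ScriptModule`` falls back to ``torch.nn.Module``. A sketch with a hypothetical script:

# Run as: PYTORCH_JIT=0 python train.py   (disables compilation, e.g. for debugging)
import torch

@torch.jit.script
def scale(x):
    return x * 2

# With PYTORCH_JIT=0 this is the plain Python function; with the default setting it
# is the compiled version. The call behaves the same either way.
print(scale(torch.ones(3)))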
- super(ScriptModule, self).__setattr__(attr, _ConstModuleList(value)) - elif isinstance(value, Sequential): - super(ScriptModule, self).__setattr__(attr, _ConstSequential(value)) - else: - super(ScriptModule, self).__setattr__(attr, _get_valid_constant(value)) - - def __dir__(self): - return sorted(Module.__dir__(self) + self._method_names()) - - def define(self, lang): - # We use frames_up=1 to get to the proper surrounding scope. The stack - # will look like: - # 0. createResolutionCallback - # 1. define() - # 2. surrounding scope. - # - # createResolutionCallback internally adds 1 to get us to our frame, then - # we add 1 to get to the proper surrounding scope. - rcb = createResolutionCallback(frames_up=1) - self._define(lang, rcb, True) + super(ScriptModule, self).__setattr__(attr, _get_valid_constant(value)) + + def __dir__(self): + return sorted(Module.__dir__(self) + self._method_names()) + + def define(self, lang): + # We use frames_up=1 to get to the proper surrounding scope. The stack + # will look like: + # 0. createResolutionCallback + # 1. define() + # 2. surrounding scope. + # + # createResolutionCallback internally adds 1 to get us to our frame, then + # we add 1 to get to the proper surrounding scope. + rcb = createResolutionCallback(frames_up=1) + self._define(lang, rcb, True) +else: + ScriptModule = torch.nn.Module def _get_methods(cls): @@ -966,12 +996,12 @@ def register_all(mod): return _builtin_table -def _register_builtin(callable, op): - _get_builtin_table()[id(callable)] = op +def _register_builtin(fn, op): + _get_builtin_table()[id(fn)] = op -def _find_builtin(callable): - return _get_builtin_table().get(id(callable)) +def _find_builtin(fn): + return _get_builtin_table().get(id(fn)) if not torch._C._jit_init(): diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 3d2bad9191a1fb..3afa33c7536bac 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -386,16 +386,17 @@ std::shared_ptr ProcessGroupMPI::gather( const GatherOptions& opts) { checkSingleTensor(inputTensors); + if (outputTensors.size() != 1) { + throw std::runtime_error("Gather: multi-GPU collective is not supported"); + } + if (rank_ != opts.rootRank) { - if (outputTensors.size() > 0) { + if (outputTensors[0].size() > 0) { throw std::runtime_error( "Gather: number of output tensors should be 0 " "for non-root"); } } else { - if (outputTensors.size() != 1) { - throw std::runtime_error("Gather: multi-GPU collective is not supported"); - } if (static_cast(size_) != outputTensors[0].size()) { throw std::runtime_error( "Gather: number of output tensors should equal " @@ -449,17 +450,17 @@ std::shared_ptr ProcessGroupMPI::scatter( std::vector>& inputTensors, const ScatterOptions& opts) { checkSingleTensor(outputTensors); + if (inputTensors.size() != 1) { + throw std::runtime_error("Scatter: multi-GPU collective is not supported"); + } if (rank_ != opts.rootRank) { - if (inputTensors.size() > 0) { + if (inputTensors[0].size() > 0) { throw std::runtime_error( "Scatter: number of input tensors should be 0 " "for non-root"); } } else { - if (inputTensors.size() != 1) { - throw std::runtime_error("Gather: multi-GPU collective is not supported"); - } if (static_cast(size_) != inputTensors[0].size()) { throw std::runtime_error( "Scatter: number of input tensors should equal " diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 01ce9bf2ac7cab..f2683756ce59f8 100644 --- a/torch/nn/modules/__init__.py +++ 
b/torch/nn/modules/__init__.py @@ -43,10 +43,10 @@ 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout', 'ReflectionPad1d', 'ReflectionPad2d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d', - 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell', - 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', 'PairwiseDistance', - 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', - 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', + 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCellBase', 'RNNCell', + 'LSTMCell', 'GRUCell', 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', + 'PairwiseDistance', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', + 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold', 'AdaptiveLogSoftmaxWithLoss', ] diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index 1310d2d748c89c..daa03f9f585114 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -91,13 +91,14 @@ class _DistributedDataParallelC10d(Module): Args: module: module to be parallelized - process_group: the c10d process group to be used for distributed data - all-reduction device_ids: CUDA devices (default: all devices) output_device: device location of output (default: device_ids[0]) broadcast_buffers: flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function. (default: True) + process_group: the c10d process group to be used for distributed data + all-reduction. If None, the default process group will + be used bucket_cap_mb: DistributedDataParallelC10d will bucket parameters into multiple buckets so that gradient reduction of each bucket can potentially overlap with backward computation. 
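With ``process_group`` now optional, construction can rely on the default group; a sketch (assumes ``init_process_group`` has already been called in this process, and the module and device ids are placeholders):

import torch
from torch.nn.parallel.distributed_c10d import _DistributedDataParallelC10d

model = torch.nn.Linear(10, 10).cuda(0)
ddp = _DistributedDataParallelC10d(model, device_ids=[0])
# process_group=None, so the wrapper falls back to c10d.get_default_group()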
@@ -112,9 +113,9 @@ class _DistributedDataParallelC10d(Module): >>> pg = torch.distributed.c10d.ProcessGroupGloo(store, rank, world_size) >>> net = torch.nn._DistributedDataParallelC10d(model, pg) """ - def __init__(self, module, process_group, device_ids=None, + def __init__(self, module, device_ids=None, output_device=None, dim=0, broadcast_buffers=True, - bucket_cap_mb=25): + process_group=None, bucket_cap_mb=25): super(_DistributedDataParallelC10d, self).__init__() @@ -125,13 +126,19 @@ def __init__(self, module, process_group, device_ids=None, if output_device is None: output_device = device_ids[0] + if process_group is None: + self.process_group = c10d.get_default_group() + else: + self.process_group = process_group + self.dim = dim self.module = module - self.process_group = process_group self.device_ids = device_ids self.output_device = output_device self.broadcast_buffers = broadcast_buffers + self.allreduce_opts = c10d.AllreduceOptions() + MB = 1024 * 1024 # used for intra-node param sync and inter-node sync as well @@ -341,7 +348,8 @@ def _queue_reduction(self, bucket_idx): nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) # now work on the first gpu - reduction_work = c10d.all_reduce(grads_batch_coalesced[0], self.process_group) + reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], + self.allreduce_opts) self.reduction_works[bucket_idx] = reduction_work self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0] diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 34c30aea654ed7..b65ea160b5c213 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -19,6 +19,7 @@ from torch.autograd import Function, function from torch.jit import _unique_state_dict from torch.onnx import ONNX_ARCHIVE_MODEL_PROTO_NAME, ExportTypes, OperatorExportTypes +from torch._C import ListType @contextlib.contextmanager @@ -103,24 +104,32 @@ def export(model, args, f, export_params=True, verbose=False, training=False, operator_export_type=operator_export_type) -def _list_constant_prop(g, block): +# ONNX can't handle constants that are lists of tensors, which can +# get generated in constant prop. 
So we split them back into prim::ListConstructs +def _split_tensor_list_constants(g, block): for node in block.nodes(): for subblock in node.blocks(): - _list_constant_prop(g, subblock) - if node.kind() == "prim::ListConstruct": - input_nodes = [i.node() for i in node.inputs()] - if all(inode.kind() == "prim::Constant" and inode.kindOf("value") == "i" for inode in input_nodes): - input_values = [inode['value'] for inode in input_nodes] - const_node = g.create("prim::Constant") - const_node.insertBefore(node) - const_node.is_("value", input_values) - const_node.output().setType(torch._C.ListType.ofInts()) - node.output().replaceAllUsesWith(const_node.output()) + _split_tensor_list_constants(g, subblock) + if node.kind() == "prim::Constant": + output_type = node.output().type() + if output_type.isSubtypeOf(ListType.ofTensors()): + inputs = [g.create("prim::Constant").t_('value', t) + .insertBefore(node).output() + for t in node['value']] + lc = (g.create("prim::ListConstruct", inputs) + .insertBefore(node) + .output() + .setType(ListType.ofTensors())) + node.output().replaceAllUsesWith(lc) def _optimize_graph(graph, operator_export_type): - _list_constant_prop(graph, graph) - + # we record now record some ops like ones/zeros + # into a trace where we previously recorded constants + # use constant prop to maintain our current level of onnx support + # without implementing symbolics for all of them + torch._C._jit_pass_constant_propagation(graph) + _split_tensor_list_constants(graph, graph) # run dce to eliminate dead parts of the graph that might have been # left behind by things like symbolic_override torch._C._jit_pass_dce(graph) diff --git a/torch/tensor.py b/torch/tensor.py index ed2f7f0c10a565..904d3a5eeb3760 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -319,13 +319,22 @@ def masked_fill(self, mask, value): """ return self.clone().masked_fill_(mask, value) - def unique(self, sorted=False, return_inverse=False): + def unique(self, sorted=False, return_inverse=False, dim=None): r"""Returns the unique scalar elements of the tensor as a 1-D tensor. See :func:`torch.unique` """ - output, inverse_indices = self._unique( - sorted=sorted, return_inverse=return_inverse) + if dim is not None: + output, inverse_indices = self._unique_dim( + sorted=sorted, + return_inverse=return_inverse, + dim=dim + ) + else: + output, inverse_indices = self._unique( + sorted=sorted, + return_inverse=return_inverse + ) if return_inverse: return output, inverse_indices else:
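Finally, the method form mirrors the functional one; a sketch of ``Tensor.unique`` with the new ``dim`` argument, assuming the underlying ``_unique_dim`` binding accepts the keyword arguments as passed here:

import torch

x = torch.tensor([[0, 1], [0, 1], [2, 3]])
rows, inverse = x.unique(sorted=True, return_inverse=True, dim=0)
# expected: rows    -> tensor([[0, 1], [2, 3]])
#           inverse -> tensor([0, 0, 1])   (index of each original row in rows)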