diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
new file mode 100644
index 00000000000000..712143336a1af7
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -0,0 +1,49 @@
+---
+name: "\U0001F41B Bug Report"
+about: Submit a bug report to help us improve PyTorch
+
+---
+
+## 🐛 Bug
+
+
+
+## To Reproduce
+
+Steps to reproduce the behavior:
+
+1.
+1.
+1.
+
+
+
+## Expected behavior
+
+
+
+## Environment
+
+Please copy and paste the output from our
+[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py)
+(or fill out the checklist below manually).
+
+You can get the script and run it with:
+```
+wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
+# For security purposes, please check the contents of collect_env.py before running it.
+python collect_env.py
+```
+
+ - PyTorch Version (e.g., 1.0):
+ - OS (e.g., Linux):
+ - How you installed PyTorch (`conda`, `pip`, source):
+ - Build command you used (if compiling from source):
+ - Python version:
+ - CUDA/cuDNN version:
+ - GPU models and configuration:
+ - Any other relevant information:
+
+## Additional context
+
+
diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
new file mode 100644
index 00000000000000..a699c2e4548f8a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/documentation.md
@@ -0,0 +1,9 @@
+---
+name: "\U0001F4DA Documentation"
+about: Report an issue related to https://pytorch.org/docs
+
+---
+
+## 📚 Documentation
+
+
diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md
new file mode 100644
index 00000000000000..e1d2bc306eae8c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -0,0 +1,24 @@
+---
+name: "\U0001F680Feature Request"
+about: Submit a proposal/request for a new PyTorch feature
+
+---
+
+## 🚀 Feature
+
+
+## Motivation
+
+
+
+## Pitch
+
+
+
+## Alternatives
+
+
+
+## Additional context
+
+
diff --git a/.github/ISSUE_TEMPLATE/questions-help-support.md b/.github/ISSUE_TEMPLATE/questions-help-support.md
new file mode 100644
index 00000000000000..77bfb55b9a468a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/questions-help-support.md
@@ -0,0 +1,13 @@
+---
+name: "❓Questions/Help/Support"
+about: Do you need support? We have resources.
+
+---
+
+## ❓ Questions and Help
+
+### Please note that this issue tracker is not a help form and this issue will be closed.
+
+We have a set of [listed resources available on the website](https://pytorch.org/resources). Our primary means of support is our discussion forum:
+
+- [Discussion Forum](https://discuss.pytorch.org/)
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index 2dc64157c5d00d..e076b329b28f5b 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -102,17 +102,6 @@ fi
 # Add the test binaries so that they won't be git clean'ed away
 git add -f build/bin
 
-# Test C FFI plugins
-# cffi install doesn't work for Python 3.7
-if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then
-  # TODO: Don't run this here
-  pip install cffi
-  git clone https://github.com/pytorch/extension-ffi.git
-  pushd extension-ffi/script
-  python build.py
-  popd
-fi
-
 # Test documentation build
 if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
   pushd docs
diff --git a/.jenkins/pytorch/enabled-configs.txt b/.jenkins/pytorch/enabled-configs.txt
index da9f62db38ded2..cffb72aa7acc4f 100644
--- a/.jenkins/pytorch/enabled-configs.txt
+++ b/.jenkins/pytorch/enabled-configs.txt
@@ -40,8 +40,8 @@ pytorch-macos-10.13-cuda9.2-cudnn7-py3-build
 pytorch-docker-build-test
 short-perf-test-cpu
 short-perf-test-gpu
-py2-clang3.8-rocm1.7.1-ubuntu16.04-build
-py2-clang3.8-rocm1.7.1-ubuntu16.04-test
+py2-clang7-rocmdeb-ubuntu16.04-build
+py2-clang7-rocmdeb-ubuntu16.04-test
 pytorch-ppc64le-cuda9.2-cudnn7-py3-build
 pytorch-ppc64le-cuda9.2-cudnn7-py3-test
 pytorch-ppc64le-cuda9.1-cudnn7-py3-build
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 471fd8fac1fc6e..c43e821d98daf5 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -102,6 +102,7 @@ test_aten() {
     SUDO=sudo
   fi
 
+  ${SUDO} ln -s "$TORCH_LIB_PATH"/libc10* build/bin
   ${SUDO} ln -s "$TORCH_LIB_PATH"/libcaffe2* build/bin
   ${SUDO} ln -s "$TORCH_LIB_PATH"/libnccl* build/bin
 
diff --git a/aten/src/ATen/Registry.h b/aten/src/ATen/Registry.h
deleted file mode 100644
index 9d8d8ff2ee8404..00000000000000
--- a/aten/src/ATen/Registry.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#pragma once
-#include <ATen/core/Registry.h>
diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h
index 75ff2a2fe6937f..f964649e19f172 100644
--- a/aten/src/ATen/core/Half-inl.h
+++ b/aten/src/ATen/core/Half-inl.h
@@ -190,6 +190,33 @@ inline AT_HOST_DEVICE Half operator/(int a, Half b) {
   return static_cast<Half>(a) / b;
 }
 
+//// Arithmetic with longs
+inline AT_HOST_DEVICE Half operator+(Half a, long b) {
+  return a + static_cast<Half>(b);
+}
+inline AT_HOST_DEVICE Half operator-(Half a, long b) {
+  return a - static_cast<Half>(b);
+}
+inline AT_HOST_DEVICE Half operator*(Half a, long b) {
+  return a * static_cast<Half>(b);
+}
+inline AT_HOST_DEVICE Half operator/(Half a, long b) {
+  return a / static_cast<Half>(b);
+}
+
+inline AT_HOST_DEVICE Half operator+(long a, Half b) {
+  return static_cast<Half>(a) + b;
+}
+inline AT_HOST_DEVICE Half operator-(long a, Half b) {
+  return static_cast<Half>(a) - b;
+}
+inline AT_HOST_DEVICE Half operator*(long a, Half b) {
+  return static_cast<Half>(a) * b;
+}
+inline AT_HOST_DEVICE Half operator/(long a, Half b) {
+  return static_cast<Half>(a) / b;
+}
+
 /// NOTE: we do not define comparisons directly and instead rely on the implicit
 /// conversion from at::Half to float.
 
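The Half-inl.h hunk above adds `long` overloads for `+`, `-`, `*`, and `/` on `at::Half`, alongside the existing float, double, and int overloads; each one converts the `long` operand to `Half` before computing. The snippet below is a minimal sketch of what these overloads enable and is not part of the patch; it assumes a host-side program that can include `<ATen/ATen.h>` and link against the ATen library.

```cpp
// Sketch only (not part of the patch): exercising the new Half/long
// operator overloads from aten/src/ATen/core/Half-inl.h.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Half h = 1.5f;  // at::Half converts implicitly from/to float
  long n = 2;

  // Each mixed expression converts the long operand to Half first,
  // mirroring the static_cast<Half>(...) in the new overloads.
  at::Half sum  = h + n;  // 3.5
  at::Half diff = h - n;  // -0.5
  at::Half prod = h * n;  // 3.0
  at::Half quot = h / n;  // 0.75

  std::cout << static_cast<float>(sum) << " " << static_cast<float>(diff) << " "
            << static_cast<float>(prod) << " " << static_cast<float>(quot)
            << std::endl;
  return 0;
}
```

Both argument orders (`Half op long` and `long op Half`) are overloaded in the patch, so expressions such as `n * h` resolve directly rather than relying on implicit conversions.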
diff --git a/aten/src/ATen/core/LegacyTypeDispatch.cpp b/aten/src/ATen/core/LegacyTypeDispatch.cpp index 6835399bfe2ca8..56c19cda3f4271 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.cpp +++ b/aten/src/ATen/core/LegacyTypeDispatch.cpp @@ -9,7 +9,10 @@ LegacyTypeDispatch & globalLegacyTypeDispatch() { return singleton; } -AT_DEFINE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs) +C10_DEFINE_REGISTRY( + LegacyTypeInitRegistry, + LegacyTypeInitInterface, + LegacyTypeInitArgs) const LegacyTypeInitInterface& getLegacyTypeInit() { static std::unique_ptr legacy_type_init; diff --git a/aten/src/ATen/core/LegacyTypeDispatch.h b/aten/src/ATen/core/LegacyTypeDispatch.h index 53cedf04e4601a..5383acbb97ebf7 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.h +++ b/aten/src/ATen/core/LegacyTypeDispatch.h @@ -43,8 +43,12 @@ struct CAFFE2_API LegacyTypeInitInterface { } }; struct CAFFE2_API LegacyTypeInitArgs {}; -AT_DECLARE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs); -#define REGISTER_LEGACY_TYPE_INIT(clsname) AT_REGISTER_CLASS(LegacyTypeInitRegistry, clsname, clsname) +C10_DECLARE_REGISTRY( + LegacyTypeInitRegistry, + LegacyTypeInitInterface, + LegacyTypeInitArgs); +#define REGISTER_LEGACY_TYPE_INIT(clsname) \ + C10_REGISTER_CLASS(LegacyTypeInitRegistry, clsname, clsname) CAFFE2_API const LegacyTypeInitInterface& getLegacyTypeInit(); diff --git a/aten/src/ATen/core/Registry.h b/aten/src/ATen/core/Registry.h deleted file mode 100644 index 98a3e4a18c7258..00000000000000 --- a/aten/src/ATen/core/Registry.h +++ /dev/null @@ -1,217 +0,0 @@ -#pragma once - -/** - * Simple registry implementation that uses static variables to - * register object creators during program initialization time. - */ - -// NB: This Registry works poorly when you have other namespaces. -// Make all macro invocations from inside the at namespace. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace at { - -template -inline void PrintOffendingKey(const KeyType& /*key*/) { - printf("[key type printing not supported]\n"); -} - -template <> -inline void PrintOffendingKey(const std::string& key) { - printf("Offending key: %s.\n", key.c_str()); -} - -/** - * @brief A template class that allows one to register classes by keys. - * - * The keys are usually a std::string specifying the name, but can be anything that - * can be used in a std::map. - * - * You should most likely not use the Registry class explicitly, but use the - * helper macros below to declare specific registries as well as registering - * objects. - */ -template -class CAFFE2_API Registry { - public: - typedef std::function Creator; - - Registry() : registry_() {} - - void Register(const SrcType& key, Creator creator) { - // The if statement below is essentially the same as the following line: - // CHECK_EQ(registry_.count(key), 0) << "Key " << key - // << " registered twice."; - // However, CHECK_EQ depends on google logging, and since registration is - // carried out at static initialization time, we do not want to have an - // explicit dependency on glog's initialization function. 
- std::lock_guard lock(register_mutex_); - if (registry_.count(key) != 0) { - printf("Key already registered.\n"); - PrintOffendingKey(key); - std::exit(1); - } - registry_[key] = creator; - } - - void Register(const SrcType& key, Creator creator, const std::string& help_msg) { - Register(key, creator); - help_message_[key] = help_msg; - } - - inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } - - ObjectPtrType Create(const SrcType& key, Args... args) { - if (registry_.count(key) == 0) { - // Returns nullptr if the key is not registered. - return nullptr; - } - return registry_[key](args...); - } - - /** - * Returns the keys currently registered as a std::vector. - */ - std::vector Keys() { - std::vector keys; - for (const auto& it : registry_) { - keys.push_back(it.first); - } - return keys; - } - - const std::unordered_map& HelpMessage() const { - return help_message_; - } - - const char* HelpMessage(const SrcType& key) const { - auto it = help_message_.find(key); - if (it == help_message_.end()) { - return nullptr; - } - return it->second.c_str(); - } - - private: - std::unordered_map registry_; - std::unordered_map help_message_; - std::mutex register_mutex_; - - Registry(const Registry&) = delete; - Registry& operator=(const Registry&) = delete; -}; - -template -class CAFFE2_API Registerer { - public: - Registerer( - const SrcType& key, - Registry* registry, - typename Registry::Creator creator, - const std::string& help_msg = "") { - registry->Register(key, creator, help_msg); - } - - template - static ObjectPtrType DefaultCreator(Args... args) { - // TODO(jiayq): old versions of NVCC does not handle make_unique well - // so we are forced to use a unique_ptr constructor here. Check if it is - // fine to use make_unique in the future. - // return make_unique(args...); - return ObjectPtrType(new DerivedType(args...)); - } -}; - -/** - * AT_ANONYMOUS_VARIABLE(str) introduces an identifier starting with - * str and ending with a number that varies with the line. - * Pretty much a copy from 'folly/Preprocessor.h' - */ -#define AT_CONCATENATE_IMPL(s1, s2) s1##s2 -#define AT_CONCATENATE(s1, s2) AT_CONCATENATE_IMPL(s1, s2) -#ifdef __COUNTER__ -#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __COUNTER__) -#else -#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __LINE__) -#endif - -/** - * AT_DECLARE_TYPED_REGISTRY is a macro that expands to a function - * declaration, as well as creating a convenient typename for its corresponding - * registerer. - */ -#define AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - CAFFE2_API Registry, __VA_ARGS__>* \ - RegistryName(); \ - typedef Registerer, __VA_ARGS__> \ - Registerer##RegistryName; \ - extern template class Registerer, __VA_ARGS__>; - -#define AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - Registry, __VA_ARGS__>* RegistryName() { \ - static Registry, __VA_ARGS__>* registry = \ - new Registry, __VA_ARGS__>(); \ - return registry; \ - } \ - template class Registerer, __VA_ARGS__>; - -// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated -// creator with comma in its templated arguments. -#define AT_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ - namespace { \ - Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, RegistryName(), __VA_ARGS__); \ - } - -#define AT_REGISTER_TYPED_CLASS(RegistryName, key, ...) 
\ - namespace { \ - Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, \ - RegistryName(), \ - Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - ::at::demangle_type<__VA_ARGS__>()); \ - } - -// AT_DECLARE_REGISTRY and AT_DEFINE_REGISTRY are hard-wired to use std::string -// as the key -// type, because that is the most commonly used cases. -#define AT_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) - -#define AT_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) - -#define AT_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) - -#define AT_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) - -// AT_REGISTER_CREATOR and AT_REGISTER_CLASS are hard-wired to use std::string -// as the key -// type, because that is the most commonly used cases. -#define AT_REGISTER_CREATOR(RegistryName, key, ...) \ - AT_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) - -#define AT_REGISTER_CLASS(RegistryName, key, ...) \ - AT_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) - -} // namespace at diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index bba2df4e0d1bec..a92b14d147c5ae 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -74,6 +74,8 @@ struct CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { template inline T* data() const { + // TODO: This is bad: it means storage.data() calls only work on + // T that are valid ScalarType. FIXME! auto data_type_T = at::scalarTypeToDataType(at::CTypeToScalarType::to()); if (dtype().id() != data_type_T) { AT_ERROR( diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index fa31741313db39..31de431bf367b0 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -533,6 +533,7 @@ class CAFFE2_API Tensor { Tensor mv(const Tensor & vec) const; Tensor mvlgamma(int64_t p) const; Tensor & mvlgamma_(int64_t p); + Tensor narrow_copy(int64_t dim, int64_t start, int64_t length) const; Tensor narrow(int64_t dim, int64_t start, int64_t length) const; Tensor permute(IntList dims) const; Tensor pin_memory() const; diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 5b568482d8dfe2..d8f38e98ef4434 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -45,6 +45,9 @@ IntList TensorImpl::sizes() const { } IntList TensorImpl::strides() const { + AT_ASSERTM(strides_.size() == sizes_.size(), + "Caffe2 tensors don't (yet) have meaningful strides and cannot " + "be used in PyTorch."); return strides_; } @@ -52,6 +55,10 @@ bool TensorImpl::compute_contiguous() const { bool is_contiguous = true; if (is_empty()) return is_contiguous; + if (strides_.empty()) { + // Special case for Caffe2 tensors which don't have strides set. 
+ return true; + } int64_t z = 1; for (int64_t d = dim() - 1; d >= 0; d--) { if (size(d) != 1) { @@ -82,6 +89,9 @@ int64_t TensorImpl::size(int64_t d) const { } int64_t TensorImpl::stride(int64_t d) const { + AT_ASSERTM(strides_.size() == sizes_.size(), + "Caffe2 tensors don't (yet) have meaningful strides and cannot " + "be used in PyTorch."); d = at::maybe_wrap_dim(d, dim(), false); return strides_[d]; } diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 27232e2a3a8e97..7d7ce6a980249c 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -9,24 +9,142 @@ #include "ATen/core/TensorTypeIdRegistration.h" #include "ATen/core/LegacyTypeDispatch.h" #include "ATen/core/Backend.h" +#include "ATen/core/context_base.h" +#include "ATen/core/WrapDimMinimal.h" -struct THTensor; +#include "caffe2/core/allocator.h" +#include "caffe2/core/common.h" +#include "caffe2/core/flags.h" +#include "caffe2/core/logging.h" + +// A global boolean variable to control whether we free memory when a Tensor +// is shrinked to a smaller size. As a result, a Tensor is always going to +// keep the memory allocated for its maximum capacity reshaped to so far. +// +// This parameter is respected "upper-case" methods which call Resize() +// (e.g., CopyFrom, ResizeLike); it is NOT respected by Tensor::resize_ +// or ShrinkTo, both of which guarantee to never to free memory. +CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); + +// Since we can have high variance in blob memory allocated across different +// inputs in the same run, we will shrink the blob only if the memory gain +// is larger than this flag in bytes. This only applies to functions which +// respect caffe2_keep_on_shrink. +CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); + + +namespace caffe2 { + +// Defined by protobuf +class DeviceOption; + +} namespace at { class Scalar; struct Type; struct Storage; class Tensor; -} // namespace at -namespace at { +/** + * A utility function to convert vector to vector. + */ +inline std::vector ToVectorint64_t(ArrayRef src) { + return std::vector(src.begin(), src.end()); +} + +/** + * Return product of all dimensions starting from k + */ +inline int64_t size_from_dim_(int k, IntList dims) { + int64_t r = 1; + for (size_t i = k; i < dims.size(); ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims up to k (not including dims[k]) +inline int64_t size_to_dim_(int k, IntList dims) { + CAFFE_ENFORCE((unsigned)k <= dims.size()); + int64_t r = 1; + for (int i = 0; i < k; ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims between k and l (not including dims[k] and dims[l]) +inline int64_t size_between_dim_(int k, int l, IntList dims) { + CAFFE_ENFORCE((unsigned)l < dims.size()); + int64_t r = 1; + if (k < l) { + for (int i = k + 1; i < l; ++i) { + r *= dims[i]; + } + } else { + for (int i = l + 1; i < k; ++i) { + r *= dims[i]; + } + } + return r; +} + +/** + * The low-level representation of a tensor, which contains a storage + * (which contains the actual data) and metadata (e.g., sizes and strides) + * describing this data as a tensor. + * + * Some basic characteristics about our in-memory representation of + * tensors: + * + * - It contains a pointer to a storage struct (Storage/StorageImpl) + * which contains the pointer to the actual data and records the + * data type and device of the view. This allows multiple tensors + * to alias the same underlying data, which allows to efficiently + * implement differing *views* on a tensor. 
+ * + * - The tensor struct itself records view-specific metadata about + * the tensor, e.g., sizes, strides and offset into storage. + * Each view of a storage can have a different size or offset. + * + * - This class is intrusively refcounted. It is refcounted so that + * we can support prompt deallocation of large tensors; it is + * intrusively refcounted so that we can still perform reference + * counted operations on raw pointers, which is often more convenient + * when passing tensors across language boundaries. + */ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable); TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) { + data_type_ = storage_ ? storage_.dtype() : caffe2::TypeMeta{}; + } + + TensorImpl(const TensorImpl&) = default; + TensorImpl& operator=(const TensorImpl&) = default; + TensorImpl(TensorImpl&&) = default; + TensorImpl& operator=(TensorImpl&&) = default; + virtual void release_resources() override; + // TODO: Ideally, type_id() would be the *only* key we need to consult + // to do a dispatch, instead of having to grovel through three different + // variables. Here's what's standing in the way: + // + // - To eliminate ScalarType, we have to allocate a TensorTypeId for + // each ScalarType+Backend combination, and then set it appropriately + // when we initially allocate a TensorImpl. + // + // - To eliminate is_variable, we have to allocate two classes of + // TensorTypeId: ones that are variables, and ones that are not. + // We may not want to eliminate this in the short term, because + // hard-coding variable status into type_id() makes it more difficult + // to do the "thread-local no_grad" trick (where we process Variables + // "as if" they were non-Variables by setting a thread local variable.) + // Type & type() const { // NB: It's valid to use getTypeRaw here, because the TensorImpl // could not have been created without initializing the Type first. @@ -42,9 +160,17 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { virtual const Storage& storage() const; friend struct Type; + /** + * The number of elements in a tensor. + * + * WARNING: If you are using the Caffe2 API, this method can sometimes + * return -1, specifically when a tensor has not yet had its storage + * allocated by calling mutable_data(). You can use this case to + * test if a tensor is initialized or not. + */ virtual int64_t numel() const { #ifdef DEBUG - AT_ASSERT(compute_numel() == numel_); + AT_ASSERT(numel_ == -1 || compute_numel() == numel_); #endif return numel_; } @@ -100,11 +226,25 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { template inline T * data() const { AT_ASSERT(!is_variable()); - return storage_.data() + storage_offset_; + CAFFE_ENFORCE_WITH_CALLER( + storage_.data() || numel_ == 0, + "The tensor has a non-zero number of elements, but its data is not allocated yet. " + "Caffe2 uses a lazy allocation, so you will need to call " + "mutable_data() or raw_mutable_data() to actually allocate memory."); + CAFFE_ENFORCE_WITH_CALLER( + storage_.IsType(), + "Tensor type mismatch, caller expects elements to be ", + caffe2::TypeMeta::TypeName(), + ", while tensor contains ", + data_type_.name(), + ". 
"); + // We managed the type check ourselves + return storage_.unsafe_data() + storage_offset_; } inline void* data() const { AT_ASSERT(!is_variable()); + CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); return static_cast( static_cast(storage_.data()) + data_type_.itemsize() * storage_offset_); @@ -119,6 +259,9 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { const caffe2::TypeMeta& dtype() const { return data_type_; } + size_t itemsize() const { + return data_type_.itemsize(); + } virtual int64_t storage_offset() const { return storage_offset_; @@ -139,13 +282,13 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } virtual void set_size(int64_t dim, int64_t new_size) { - sizes_[dim] = new_size; + sizes_.at(dim) = new_size; refresh_numel(); refresh_contiguous(); } virtual void set_stride(int64_t dim, int64_t new_stride) { - strides_[dim] = new_stride; + strides_.at(dim) = new_stride; refresh_numel(); refresh_contiguous(); } @@ -214,5 +357,516 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { private: TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable); + + public: + + at::DeviceType device_type() const { + AT_ASSERT(!is_variable()); + return storage_.device_type(); + } + + at::Device GetDevice() const { + return storage_.device(); + } + + /** + * The static context of a tensor intuitively represents the device + * type of a tensor; e.g., a CPU tensor is associated with the + * GetCPUStaticContext(). This method replaces the former Context template + * parameter which was previously used to identify the device type + * of a tensor. + */ + at::BaseStaticContext* GetStaticContext() const { + return ::caffe2::get_static_context(device_type()); + } + + /** + * @brief Copies the data from a source tensor, with a contex provided to + * carry out the underlying memcpy operation. This method respects + * caffe2_keep_on_shrink. 
+ */ + void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { + if ((void*)&src == (void*)this) { + return; + } + if (data_type_ != src.dtype()) { + CAFFE_ENFORCE_WITH_CALLER( + src.is_contiguous(), + "Right now only copy of contiguous source Tensor is supported."); + storage_ = at::Storage(device_type(), src.dtype()); + data_type_ = src.dtype(); + } + if (src.numel() == -1) { + sizes_.clear(); + numel_ = -1; + strides_.clear(); + is_contiguous_ = true; + storage_.reset(); + data_type_ = caffe2::TypeMeta(); + return; + } + Resize(src.dims()); + if (numel() > 0) { + if (data_type_.copy()) { + CAFFE_ENFORCE( + device_type() == ::at::DeviceType::CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + CAFFE_ENFORCE( + src.device_type() == ::at::DeviceType::CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + data_type_.copy()(src.data(), raw_mutable_data(data_type_), numel()); + } else { + // We'll need to use a non-CPU context to perform the copy if + // one of the context is not CPU since only non-CPU context + // knows how to copy between CPU and that context + if (src.device_type() != ::at::DeviceType::CPU || device_type() == ::at::DeviceType::CPU) { + if (!context) { + CreateContext(src.GetDevice()) + ->CopyBytesToDevice( + numel() * itemsize(), + src.data(), + raw_mutable_data(data_type_), + device_type()); + } else { + CAFFE_ENFORCE( + context->device_type() == src.device_type(), + "Type for provided context does not match the type of source"); + context->CopyBytesToDevice( + numel() * itemsize(), src.data(), raw_mutable_data(data_type_), device_type()); + } + } else { + // In case source context is CPU, and target context is non-CPU + // We'll have to create a Context from target and perform the + // copy using that context + CreateContext(GetDevice()) + ->CopyBytesFromCPU( + numel() * itemsize(), + src.data(), + raw_mutable_data(data_type_)); + } + } + } + } + + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. 
+ */ + void Extend(int64_t num, float growthPct, at::BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(sizes_.size(), 1u); + CAFFE_ENFORCE_GE_WITH_CALLER( + num, 0, "`num` must be non-negative for Extend"); + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Extend is only supported for contiguous Tensor."); + auto newDims = sizes_; + newDims[0] += num; + if (!storage_.data()) { + Resize(newDims); + return; + } + auto newNumel = std::accumulate( + newDims.begin(), + newDims.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + sizes_ = newDims; + numel_ = newNumel; + return; + } + auto newCapacity = sizes_; + newCapacity[0] = std::max( + newDims[0], std::ceil(sizes_[0] * (growthPct + 100) / 100)); + auto oldData = std::move(storage_.data_ptr()); + auto oldSize = numel_; + auto oldDims = sizes_; + Resize(newCapacity); + auto* newData = raw_mutable_data(data_type_); + CAFFE_ENFORCE( + context != nullptr, "Context must be provided to Extend the tensor"); + context->CopyItemsSameDevice( + data_type_, oldSize, oldData.get(), newData); + reserved_ = true; + sizes_ = newDims; + numel_ = newNumel; + } + + /** + * @brief Reserve space for the underlying tensor. + * + * This must be called after Resize(), since we only specify the first + * dimension This does not copy over the old data to the newly allocated space + */ + template + void ReserveSpace(const T& outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ReserveSpace is only supported for contiguous Tensor."); + CAFFE_ENFORCE( + numel_ != -1, "size should be initialized before calling ReserveSpace"); + CAFFE_ENFORCE( + storage_.unique(), "Can't call ReserveSpace on shared storage."); + auto newCapacity = sizes_; + newCapacity[0] = outer_dim; + auto newNumel = std::accumulate( + newCapacity.begin(), + newCapacity.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + return; + } + // Old data is discarded + storage_.data_ptr().clear(); + auto oldSize = numel_; + auto oldDims = sizes_; + Resize(newCapacity); + // Allocate new memory but don't copy over the data + raw_mutable_data(data_type_); + sizes_ = oldDims; + numel_ = oldSize; + reserved_ = true; + } + + /** + * @brief Resizes a tensor. + * + * Resize takes in a vector of ints specifying the dimensions of the tensor. + * You can pass in an empty vector to specify that it is a scalar (i.e. + * containing one single item). + * + * The underlying storage may be deleted after calling Resize: if the new + * shape leads to a different number of items in the tensor, the old memory + * is deleted and new memory will be allocated next time you call + * mutable_data(). However, if the shape is different but the total number of + * items is the same, the underlying storage is kept. + * + * This method respects caffe2_keep_on_shrink. Consult the internal logic + * of this method to see exactly under what circumstances this flag matters. + */ + template + void Resize(Ts... dim_source) { + bool is_init = numel_ == -1; + bool size_changed = SetDims(dim_source...); + if (size_changed) { + // If needed, we will free the data. the next mutable_data() call + // will create the data storage. 
+ bool reset_tensor = false; + if (reserved_) { + // If tensor is reserved then don't claim its memeory unless capacity() + // is smaller than new size + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); + } else { + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || + !caffe2::FLAGS_caffe2_keep_on_shrink || + storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > + static_cast(caffe2::FLAGS_caffe2_max_keep_on_shrink_memory); + } + + if (reset_tensor && !is_init) { + FreeMemory(); + } + } + } + + /** + * Resizes the tensor without touching underlying storage. + * This requires the total size of the tensor to remains constant. + */ + inline void Reshape(const std::vector& dims) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Reshape is only supported for contiguous Tensor."); + int64_t new_size = 1; + for (auto d : dims) { + CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); + new_size *= d; + } + CAFFE_ENFORCE_WITH_CALLER( + new_size == numel_, + "New size and old size are not equal. You cannot use Reshape, " + "but should use Resize." + // TODO(jiayq): remove the following warning after pending diffs + // stabilize. + " The old caffe2 mixes Reshape and Resize but this behavior has " + "been changed. If you find this error, most likely you will need " + "to change corresponding code from Reshape to Resize."); + sizes_ = dims; + update_to_contiguous_strides(); + } + + /** + * Release whatever memory the tensor was holding but keep size and type + * information. Subsequent call to mutable_data will trigger new memory + * allocation. + */ + inline void FreeMemory() { + // We'll detach from the old Storage and create a new one + storage_ = at::Storage(storage_.device_type(), data_type_); + storage_offset_ = 0; + } + + /** + * @brief Shares the data with another tensor. + * + * To share data between two tensors, the sizes of the two tensors must be + * equal already. The reason we do not implicitly do a Resize to make the two + * tensors have the same shape is that we want to allow tensors of different + * shapes but the same number of items to still be able to share data. This + * allows one to e.g. have a n-dimensional Tensor and a flattened version + * sharing the same underlying storage. + * + * The source tensor should already have its data allocated. + */ + void ShareData(const TensorImpl& src) { + // Right now, we are assuming the device_type are the same, since it is + // inherently the same in the non-templatized code. We should probably add + // an ENFORCE here which might affect perf a little bit. + CAFFE_ENFORCE_EQ_WITH_CALLER( + src.numel_, + numel_, + "Size mismatch - did you call reshape before sharing the data?"); + // It is possible that the source tensor hasn't called mutable_data() yet, + // in which case ShareData() doesn't make much sense since we don't really + // know what to share yet. + CAFFE_ENFORCE_WITH_CALLER( + src.storage_.data() || src.numel_ == 0, + "Source tensor has no content and has size > 0"); + // Finally, do sharing. 
+ /* Since we create new Storage whenever we need to change data_type/capacity + * this still keeps the original semantics + */ + storage_ = src.storage(); + data_type_ = src.dtype(); + storage_offset_ = src.storage_offset(); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != caffe2::TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + if (!capacity) { + capacity = numel_ * data_type.itemsize(); + } + if (storage_.unique()) { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "To share data with a raw pointer, you need to set shape first."); + storage_.UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + data_type_ = data_type; + storage_offset_ = 0; + } else { + int64_t numel = capacity / data_type.itemsize(); + // Create a new Storage + storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); + data_type_ = data_type; + storage_offset_ = 0; + } + } + + /** + * Returns a mutable raw pointer of the underlying storage. Since we will need + * to know the type of the data for allocation, a TypeMeta object is passed in + * to specify the necessary information. This is conceptually equivalent of + * calling mutable_data() where the TypeMeta parameter meta is derived from + * the type T. This function differs from mutable_data() in the sense that + * the type T can be specified during runtime via the TypeMeta object. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data(const caffe2::TypeMeta& meta) { + // For 0-size tensors it's fine to return any pointer (including nullptr) + if (data_type_ == meta && (storage_.data() || numel_ == 0)) { + return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); + } else { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "Tensor is not initialized. You probably need to call Resize() " + "before calling mutable_data()"); + bool had_special_dtor = data_type_.dtor() != nullptr; + storage_offset_ = 0; + if (storage_.unique()) { + storage_.set_dtype(meta); + } else { + if (data_type_ != meta) { + storage_ = at::Storage(storage_.device_type(), meta); + } + } + data_type_ = meta; + + // We can reuse the existing buffer if the current data does not have + // a special destructor and the new data doesn't have a special + // constructor. + if (numel_ == 0 || + (meta.ctor() == nullptr && !had_special_dtor && + storage_.numel() >= numel_)) { + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + const at::Allocator* allocator = storage_.allocator(); + // TODO: Get rid of StaticContext + CAFFE_ENFORCE( + allocator == nullptr, + "Allocator is not used within Caffe2 functions, please use StaticContext instead."); + if (meta.ctor()) { + // For types that need placement new, we will call it, as well as + // making sure that when the data is freed, it calls the right + // destruction procedure. 
+ auto size = numel_; + auto dtor = data_type_.dtor(); + void* ptr; + at::DeleterFnPtr deleter; + auto ptr_and_deleter = GetStaticContext()->New( + numel_ * storage_.itemsize()); // Removing this can get rid of + // InefficientStdFunctionContext + ptr = ptr_and_deleter.first; + deleter = ptr_and_deleter.second; + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr, + [size, dtor, deleter](void* local_ptr) -> void { + dtor(local_ptr, size); + deleter(local_ptr); + }, + at::Device(storage_.device_type()))); + data_type_.ctor()(storage_.data(), numel_); + } else { + // For fundamental type, new and delete is easier. + auto ptr_and_deleter = + GetStaticContext()->New(numel_ * storage_.itemsize()); + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr_and_deleter.first, + ptr_and_deleter.second, + at::Device(storage_.device_type()))); + } + storage_.set_numel(numel_); + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + } + + /** + * Returns a typed pointer of the underlying storage. + * + * For fundamental types, we reuse possible existing storage if there + * is sufficient capacity. + */ + template + inline T* mutable_data() { + if ((numel_ == 0 || storage_.data()) && storage_.IsType()) { + return static_cast(storage_.data()) + storage_offset_; + } + // Check it here statically - otherwise TypeMeta would throw the runtime + // error in attempt to invoke TypeMeta::ctor() + static_assert( + std::is_default_constructible::value, + "Tensor can't hold non-default-constructible types"); + return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); + } + + /** + * Returns the dimensions of the tensor as a vector. + */ + inline const std::vector& dims() const { + // TODO: This method will no longer work if we change the + // internal representation of dims(). That's BAD. Let's get + // people to stop using this. + return sizes_; + } + + protected: + // we decide to keep reserved_ and it will + // live in Tensor after the split + // The logic is that if Extend() or ReserveSpace() were ever called, + // then subsequent Resize()s will not free up Storage. + bool reserved_ = false; + + private: + template < + typename T, + typename = typename std::enable_if::value>::type> + bool SetDims(const std::vector& src) { + auto old_numel = numel_; + sizes_.resize(src.size()); + int64_t new_numel = 1; + for (size_t i = 0; i < src.size(); ++i) { + new_numel *= src[i]; + sizes_[i] = src[i]; + } + update_to_contiguous_strides(); + numel_ = new_numel; + return numel_ != old_numel; + } + + bool SetDims() { + auto old_numel = numel_; + sizes_.resize(0); + update_to_contiguous_strides(); + numel_ = 1; + return numel_ != old_numel; + } + + // TODO(jiayq): maybe rewrite the following functions with initializer list. + // NVCC does not play well with initializer lists last time, but worth + // another shot. 
+ bool SetDims(const int64_t d0) { + auto old_numel = numel_; + sizes_.resize(1); + sizes_[0] = d0; + update_to_contiguous_strides(); + numel_ = d0; + return numel_ != old_numel; + } + + bool SetDims(const int64_t d0, const int64_t d1) { + auto old_numel = numel_; + sizes_.resize(2); + sizes_[0] = d0; + sizes_[1] = d1; + update_to_contiguous_strides(); + numel_ = d0 * d1; + return numel_ != old_numel; + } + + bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) { + auto old_numel = numel_; + sizes_.resize(3); + sizes_[0] = d0; + sizes_[1] = d1; + sizes_[2] = d2; + update_to_contiguous_strides(); + numel_ = d0 * d1 * d2; + return numel_ != old_numel; + } + + bool + SetDims(const int64_t d0, const int64_t d1, const int64_t d2, const int64_t d3) { + auto old_numel = numel_; + sizes_.resize(4); + sizes_[0] = d0; + sizes_[1] = d1; + sizes_[2] = d2; + sizes_[3] = d3; + update_to_contiguous_strides(); + numel_ = d0 * d1 * d2 * d3; + return numel_ != old_numel; + } + + inline void update_to_contiguous_strides() { + strides_.resize(0); + is_contiguous_ = true; + } + }; } // namespace at diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index c6197b4fc2d08b..857131298376b1 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -902,6 +902,9 @@ inline Tensor Tensor::mvlgamma(int64_t p) const { inline Tensor & Tensor::mvlgamma_(int64_t p) { return type().mvlgamma_(*this, p); } +inline Tensor Tensor::narrow_copy(int64_t dim, int64_t start, int64_t length) const { + return type().narrow_copy(*this, dim, start, length); +} inline Tensor Tensor::narrow(int64_t dim, int64_t start, int64_t length) const { return type().narrow(*this, dim, start, length); } diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 3a2ccbe1e45edb..009ee309d7808a 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -492,6 +492,7 @@ struct CAFFE2_API Type { virtual Tensor mv(const Tensor & self, const Tensor & vec) const = 0; virtual Tensor mvlgamma(const Tensor & self, int64_t p) const = 0; virtual Tensor & mvlgamma_(Tensor & self, int64_t p) const = 0; + virtual Tensor narrow_copy(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; virtual Tensor permute(const Tensor & self, IntList dims) const = 0; virtual Tensor pin_memory(const Tensor & self) const = 0; diff --git a/aten/src/ATen/core/VariableHooksInterface.cpp b/aten/src/ATen/core/VariableHooksInterface.cpp index 3728114492e53b..b9d90f56b8683b 100644 --- a/aten/src/ATen/core/VariableHooksInterface.cpp +++ b/aten/src/ATen/core/VariableHooksInterface.cpp @@ -24,6 +24,9 @@ namespace detail { } -AT_DEFINE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) +C10_DEFINE_REGISTRY( + VariableHooksRegistry, + VariableHooksInterface, + VariableHooksArgs) } // namespace at::detail diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index e8fd4da9e27536..0b8eb1532c1bc6 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -1,8 +1,8 @@ #pragma once -#include -#include #include +#include +#include "c10/util/Registry.h" namespace at { class LegacyTypeDispatch; @@ -39,8 +39,12 @@ struct CAFFE2_API VariableHooksInterface { // for the "..." 
in a variadic macro" struct CAFFE2_API VariableHooksArgs {}; -AT_DECLARE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) -#define REGISTER_VARIABLE_HOOKS(clsname) AT_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) +C10_DECLARE_REGISTRY( + VariableHooksRegistry, + VariableHooksInterface, + VariableHooksArgs); +#define REGISTER_VARIABLE_HOOKS(clsname) \ + C10_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) namespace detail { CAFFE2_API const VariableHooksInterface& getVariableHooks(); diff --git a/aten/src/ATen/core/WrapDimMinimal.h b/aten/src/ATen/core/WrapDimMinimal.h index 6971bac0b3f67c..859c1da0590a9d 100644 --- a/aten/src/ATen/core/WrapDimMinimal.h +++ b/aten/src/ATen/core/WrapDimMinimal.h @@ -20,4 +20,10 @@ static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wr return dim; } +// Wrap around axis_index if it is negative, s.t., -1 is the last dim +// This is the "Caffe2" name +static inline int canonical_axis_index_(int axis_index, int ndims) { + return maybe_wrap_dim(axis_index, ndims, false); +} + } diff --git a/aten/src/ATen/core/context_base.cpp b/aten/src/ATen/core/context_base.cpp index e34c6880c0210a..f81bd81361305f 100644 --- a/aten/src/ATen/core/context_base.cpp +++ b/aten/src/ATen/core/context_base.cpp @@ -1,5 +1,16 @@ #include +namespace at { + +C10_DEFINE_TYPED_REGISTRY( + ContextRegistry, + at::DeviceType, + at::BaseContext, + std::unique_ptr, + at::Device); + +} // namespace at + namespace caffe2 { // TODO: rename context.h -> context_cpu.h & context_base.h -> context.h diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 326cae5eb9691e..13bc885da344ee 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -6,11 +6,12 @@ #include #include -#include +#include +#include #include #include #include -#include +#include namespace caffe2 { class Event; @@ -31,11 +32,6 @@ class CAFFE2_API BaseStaticContext { virtual std::pair New(size_t nbytes) const = 0; - virtual std::unique_ptr CreateContext() = 0; - - virtual std::unique_ptr CreateContext( - const caffe2::DeviceOption&) = 0; - virtual DeviceType GetDeviceType() = 0; /* @@ -184,6 +180,22 @@ class CAFFE2_API BaseContext { } }; +// Context constructor registry +C10_DECLARE_TYPED_REGISTRY( + ContextRegistry, + at::DeviceType, + at::BaseContext, + std::unique_ptr, + at::Device); + +#define REGISTER_CONTEXT(type, ...) 
\ + C10_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__) + +inline std::unique_ptr CreateContext( + const at::Device& device) { + return at::ContextRegistry()->Create(device.type(), device); +} + } // namespace at namespace caffe2 { diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 8dfb1e8ebb75b6..5df0a5b49ca93b 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1,10 +1,19 @@ #include #include -#define TORCH_FORALL_TAGS(_) \ - _(None) \ - _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ - _(TensorList) _(Blob) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) \ + _(Double) \ + _(Int) \ + _(Tuple) \ + _(IntList) \ + _(DoubleList) \ + _(String) \ + _(TensorList) \ + _(Blob) \ + _(GenericList) \ + _(World) \ namespace torch { namespace jit { @@ -16,7 +25,7 @@ CAFFE2_API c10::intrusive_ptr ConstantString::create( namespace { template -std::ostream& printList(std::ostream & out, const ConstantList &v, +std::ostream& printList(std::ostream & out, const List &v, const std::string start, const std::string delim, const std::string finish) { out << start; for(size_t i = 0; i < v.elements().size(); ++i) { @@ -40,13 +49,13 @@ std::ostream& operator<<(std::ostream & out, const ConstantString & v) { } template -std::ostream& operator<<(std::ostream & out, const ConstantList & v) { +std::ostream& operator<<(std::ostream & out, const List & v) { return printList(out, v, "[", ", ", "]"); } // tuple case template<> -std::ostream& operator<<(std::ostream & out, const ConstantList & v) { +std::ostream& operator<<(std::ostream & out, const List & v) { return printList(out, v, "(", ", ", ")"); } diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 513845d4c12af0..5e210d638d9226 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -33,16 +33,17 @@ struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { const ConstantString& v); }; -// non-mutable list template -struct C10_EXPORT ConstantList final : c10::intrusive_ptr_target { +struct C10_EXPORT List : c10::intrusive_ptr_target { private: - const std::vector elements_; + std::vector elements_; + public: - ConstantList(std::vector elements_) - : elements_(std::move(elements_)) {} - static c10::intrusive_ptr> create(std::vector elements_) { - return c10::make_intrusive>(std::move(elements_)); + typedef Elem ElemType; + + List(std::vector elements_) : elements_(std::move(elements_)) {} + static c10::intrusive_ptr> create(std::vector elements_) { + return c10::make_intrusive>(std::move(elements_)); } const std::vector& elements() const { return elements_; @@ -50,13 +51,30 @@ struct C10_EXPORT ConstantList final : c10::intrusive_ptr_target { operator const std::vector&() const { return elements(); } + + std::vector& elements() { + return elements_; + } + operator std::vector&() { + return elements(); + } +}; + +struct World { + int64_t world_id; }; struct IValue; -using Tuple = ConstantList; -using IntList = ConstantList; -using TensorList = ConstantList; -using DoubleList = ConstantList; +struct C10_EXPORT Tuple : public List { + using List::List; + static c10::intrusive_ptr create(std::vector elements_) { + return c10::make_intrusive(std::move(elements_)); + } +}; +using IntList = List; +using TensorList = List; +using DoubleList = List; +using GenericList = List; // IValue is the generic tagged union used by the interpreter to hold // all value types. 
@@ -65,10 +83,19 @@ using DoubleList = ConstantList; // to mark whether that type is a subtype of c10::intrusive_ptr_target and needs // retain/release calls. -#define TORCH_FORALL_TAGS(_) \ - _(None) \ - _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ - _(TensorList) _(Blob) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) \ + _(Double) \ + _(Int) \ + _(Tuple) \ + _(IntList) \ + _(DoubleList) \ + _(String) \ + _(TensorList) \ + _(Blob) \ + _(GenericList) \ + _(World) \ struct CAFFE2_API IValue final { IValue() @@ -128,6 +155,13 @@ struct CAFFE2_API IValue final { return at::Tensor(toIntrusivePtr()); } + const IValue& toIValue() const { + return *this; + } + IValue& toIValue() { + return *this; + } + IValue(caffe2::Blob blob) : tag(Tag::Blob), is_intrusive_ptr(true) { // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract // and @@ -170,6 +204,17 @@ struct CAFFE2_API IValue final { return payload.as_double; } + // World + IValue(World w) + : tag(Tag::World), is_intrusive_ptr(false) { + payload.as_world = w; + } + bool isWorld() const { return Tag::World == tag; } + World toWorld() const { + AT_ASSERT(isWorld()); + return payload.as_world; + } + // Int IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { @@ -207,6 +252,7 @@ struct CAFFE2_API IValue final { const std::vector& toIntListRef() const; const std::vector& toDoubleListRef() const; const std::vector& toTensorListRef() const; + const std::vector& toGenericListRef() const; // ConstantString IValue(c10::intrusive_ptr v); @@ -247,6 +293,19 @@ struct CAFFE2_API IValue final { return toIntrusivePtr(); } + //GenericList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isGenericList() const { return Tag::GenericList == tag; } + c10::intrusive_ptr toGenericList() && { + AT_ASSERT(isGenericList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toGenericList() const & { + AT_ASSERT(isGenericList()); + return toIntrusivePtr(); + } + // None bool isNone() { return Tag::None == tag; @@ -338,6 +397,7 @@ struct CAFFE2_API IValue final { int64_t as_int; double as_double; c10::intrusive_ptr_target* as_intrusive_ptr; + World as_world; } payload; Tag tag; bool is_intrusive_ptr; @@ -362,12 +422,16 @@ DEFINE_TO(int64_t, toInt) DEFINE_TO(c10::intrusive_ptr, toDoubleList) DEFINE_TO(c10::intrusive_ptr, toIntList) DEFINE_TO(c10::intrusive_ptr, toTensorList) +DEFINE_TO(c10::intrusive_ptr, toGenericList) DEFINE_TO(c10::intrusive_ptr, toString) DEFINE_TO(at::Scalar, toScalar) DEFINE_TO(bool, toInt) DEFINE_TO(std::vector, toIntListRef) DEFINE_TO(std::vector, toDoubleListRef) DEFINE_TO(std::vector, toTensorListRef) +DEFINE_TO(std::vector, toGenericListRef) +DEFINE_TO(World, toWorld) +DEFINE_TO(IValue, toIValue) #undef DEFINE_TO @@ -433,6 +497,14 @@ inline IValue::IValue(c10::intrusive_ptr v) inline IValue::IValue(std::vector v) : IValue(TensorList::create(std::move(v))) {} +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::GenericList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(GenericList::create(std::move(v))) {} + + inline const std::vector& IValue::toIntListRef() const { return toIntList()->elements(); } @@ -445,5 +517,9 @@ inline const std::vector& IValue::toTensorListRef() const { return toTensorList()->elements(); } +inline const std::vector& IValue::toGenericListRef() const { + return toGenericList()->elements(); +} + }} diff --git a/aten/src/ATen/cuda/CUDAContext.cpp 
b/aten/src/ATen/cuda/CUDAContext.cpp index 58248acfe17951..0a4649d9c41ad4 100644 --- a/aten/src/ATen/cuda/CUDAContext.cpp +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -54,15 +54,13 @@ Allocator* getCUDADeviceAllocator() { } /* Handles */ -#ifndef __HIP_PLATFORM_HCC__ - cusparseHandle_t getCurrentCUDASparseHandle() { - return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); - } +cusparseHandle_t getCurrentCUDASparseHandle() { + return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); +} - cublasHandle_t getCurrentCUDABlasHandle() { - return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); - } -#endif +cublasHandle_t getCurrentCUDABlasHandle() { + return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); +} } // namespace cuda diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h index 83a890da4d535e..3a480d2ca4e4e3 100644 --- a/aten/src/ATen/cuda/CUDAContext.h +++ b/aten/src/ATen/cuda/CUDAContext.h @@ -59,10 +59,8 @@ CAFFE2_API void uncheckedSetCurrentCUDAStream(CUDAStream stream); CAFFE2_API Allocator* getCUDADeviceAllocator(); /* Handles */ -#ifndef __HIP_PLATFORM_HCC__ CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle(); CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle(); -#endif } // namespace cuda diff --git a/aten/src/ATen/detail/CUDAHooksInterface.cpp b/aten/src/ATen/detail/CUDAHooksInterface.cpp index ec2ac11f305dcf..f3299b34cb7f9b 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.cpp +++ b/aten/src/ATen/detail/CUDAHooksInterface.cpp @@ -54,6 +54,6 @@ const CUDAHooksInterface& getCUDAHooks() { } } // namespace detail -AT_DEFINE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) +C10_DEFINE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) } // namespace at diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 69149932ac7b98..b8cff1a7aa125f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -2,9 +2,10 @@ #include #include -#include #include +#include "c10/util/Registry.h" + #include #include #include @@ -131,9 +132,9 @@ struct CAFFE2_API CUDAHooksInterface { // for the "..." 
in a variadic macro" struct CAFFE2_API CUDAHooksArgs {}; -AT_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) +C10_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs); #define REGISTER_CUDA_HOOKS(clsname) \ - AT_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) + C10_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) namespace detail { CAFFE2_API const CUDAHooksInterface& getCUDAHooks(); diff --git a/aten/src/ATen/detail/ComplexHooksInterface.cpp b/aten/src/ATen/detail/ComplexHooksInterface.cpp index 9755e288ff5fe7..a7ffcf1d625f2b 100644 --- a/aten/src/ATen/detail/ComplexHooksInterface.cpp +++ b/aten/src/ATen/detail/ComplexHooksInterface.cpp @@ -20,6 +20,8 @@ const ComplexHooksInterface& getComplexHooks() { } } // namespace detail -AT_DEFINE_REGISTRY(ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) - +C10_DEFINE_REGISTRY( + ComplexHooksRegistry, + ComplexHooksInterface, + ComplexHooksArgs) } diff --git a/aten/src/ATen/detail/ComplexHooksInterface.h b/aten/src/ATen/detail/ComplexHooksInterface.h index e5d5c3ec2a83fa..52f835a30cc17b 100644 --- a/aten/src/ATen/detail/ComplexHooksInterface.h +++ b/aten/src/ATen/detail/ComplexHooksInterface.h @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include +#include "c10/util/Registry.h" namespace at { @@ -16,9 +16,12 @@ struct CAFFE2_API ComplexHooksInterface { }; struct CAFFE2_API ComplexHooksArgs {}; -AT_DECLARE_REGISTRY(ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) +C10_DECLARE_REGISTRY( + ComplexHooksRegistry, + ComplexHooksInterface, + ComplexHooksArgs); #define REGISTER_COMPLEX_HOOKS(clsname) \ - AT_REGISTER_CLASS(ComplexHooksRegistry, clsname, clsname) + C10_REGISTER_CLASS(ComplexHooksRegistry, clsname, clsname) namespace detail { CAFFE2_API const ComplexHooksInterface& getComplexHooks(); diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 189cadf0b6d1c6..1955d07b630d74 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -107,16 +107,10 @@ def TypedDict(name, attrs, total=True): # type: ignore # NB: As far as ezyang can tell, we don't *have* to codegen this, # because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in # the superclass. But it doesn't seem to be harmful. -# -# TODO: self_ty is a hack to make things work for native methods which need to -# take a dtype, but also need to dispatch differently for different types. -# Eliminate it at some point. 
TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { ${device_guard_declaration} - const auto& self_ty = *this; - (void)self_ty; - ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${actuals}); + ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${type_derived_call_actuals}); } """) TYPE_DERIVED_DEFINITION_NATIVE_MISSING = CodeTemplate("""\ @@ -1574,8 +1568,15 @@ def process_native(option): TYPE_DERIVED_DEFINITION_NATIVE_MISSING.substitute(env)) else: option['native_type_method_dispatch'] = native_dispatch + type_derived_call_actuals = [] + for actual, arg in zip(option['actuals'], option['arguments']): + if arg.get('is_type_dispatched', False): + type_derived_call_actuals.append('*this') + else: + type_derived_call_actuals.append(actual) type_object_definitions.append( - TYPE_DERIVED_DEFINITION_NATIVE.substitute(env)) + TYPE_DERIVED_DEFINITION_NATIVE.substitute( + env, type_derived_call_actuals=type_derived_call_actuals)) for declaration in declarations: for option in declaration['options']: diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp index 1f93ecbc8235ab..d16458e5ad80a6 100644 --- a/aten/src/ATen/native/PixelShuffle.cpp +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -1,7 +1,7 @@ #include "ATen/native/TensorTransformations.h" #include -#include +#include #include #include diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c470f554c14234..31b8f59a779a65 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -148,6 +148,45 @@ Tensor &as_strided_(Tensor& self, IntList size, IntList stride) { return at::as_strided_(self, size, stride, self.storage_offset()); } +Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length){ + int64_t allDim = self.dim(); + int64_t end = start+length; + AT_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor."); + AT_CHECK(dim >= 0 && dim < allDim, + "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, "."); + AT_CHECK(start >= 0 && length >= 0 && end <= self.size(dim), + "Invalid range to narrow. 
range(start, start+length) must be a subset of range(0, ", self.size(dim), ").") + LongTensor indices = self._indices(); + int64_t sparseDims = self._sparseDims(); + + std::vector newSizes = self.sizes().vec(); + newSizes[dim]=length; + + Tensor newValues; + LongTensor newIndices; + if(dim < sparseDims){ + Tensor mask = (indices[dim] >= start).__and__((indices[dim] < end)); + newIndices = indices.masked_select(mask).view({sparseDims, -1}); + newIndices[dim].add_(-start); + Tensor nzIndices = mask.nonzero().view(-1); + newValues = self._values().index_select(0, nzIndices); + }else{ + /* This means we are narrowing on a dense dim, which is in effect just a + regular narrow on _values() */ + newIndices = indices; + int64_t ddim = dim - sparseDims + 1; + newValues = self._values().narrow_copy(ddim, start, length); + } + + SparseTensor newTensor = at::sparse_coo_tensor(newIndices, newValues, newSizes, self.type().options()); + _get_sparse_impl(newTensor)->set_coalesced(self.is_coalesced()); + return newTensor; +} + +Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length){ + return self.narrow(dim, start, length).clone(); +} + Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { AT_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); auto cur_size = self.size(dim); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2cc0995dabadad..b4ebdfb634e422 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1174,6 +1174,14 @@ - func: mvlgamma_(Tensor self, int64_t p) -> Tensor variants: method +- func: narrow_copy(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor + variants: method + dispatch: + CPU: narrow_copy_dense + CUDA: narrow_copy_dense + SparseCPU: narrow_copy_sparse + SparseCUDA: narrow_copy_sparse + - func: narrow(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor variants: function, method @@ -2060,8 +2068,8 @@ SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda -# This "raw copy" doesn't handle conversions NOR does it handle non-blocking. 
-- func: raw_copy_sparse_(Tensor self, Tensor src) -> Tensor +- func: copy_sparse_to_sparse_(Tensor self, Tensor src, bool non_blocking=false) -> Tensor + variants: function dispatch: SparseCPU: copy_sparse_ SparseCUDA: copy_sparse_ diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 83aee52cf81021..7e2340be24a10f 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -204,7 +204,7 @@ SparseTensor new_with_tensor_and_size_sparse(const LongTensor& indices, const Te SparseTensor clone_sparse(const SparseTensor& self) { SparseTensor other = new_with_dims_and_size_sparse(self.type(), self._sparseDims(), self._denseDims(), self.sizes()); - _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values()); + _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values(), true); _get_sparse_impl(other)->set_coalesced(self.is_coalesced()); return other; } @@ -243,11 +243,11 @@ Tensor sparse_to_dense(const SparseTensor& self) { return dst.add_(self); } -SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src) { +SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src, bool non_blocking) { if (isSameTensor(self, src)) return self; _get_sparse_impl(self)->resize_(src._sparseDims(), src._denseDims(), src.sizes()); // NB: This seems to copy the underlying full indices/values buffer - _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values()); + _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values(), non_blocking); _get_sparse_impl(self)->set_coalesced(src.is_coalesced()); return self; } diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 8a8668fc48b8a1..c71e38450974a6 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -98,7 +98,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); } else { - r = raw_copy_sparse_(r, t.coalesce()); + copy_sparse_to_sparse_(r, t.coalesce()); } r._values().log1p_(); return r; @@ -192,7 +192,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S AT_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { - return raw_copy_sparse_(r, t); + return copy_sparse_to_sparse_(r, t); } if (t._nnz() == 0) { return mul_out_sparse_scalar(r, src, value); diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h index 2626eedebaf5e2..a0fbf4ea904cc4 100644 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -50,8 +50,8 @@ inline void _alias_into_sparse(const SparseTensor& self, const LongTensor& indic // Take indices and values and makes a (data) copy of them to put into the sparse // indices/values. 
This used to be called THSTensor_(_set) -inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) { - _alias_into_sparse(self, indices.clone(), values.clone()); +inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values, bool non_blocking) { + _alias_into_sparse(self, self._indices().type().copy(indices, non_blocking), self._values().type().copy(values, non_blocking)); } // Does NOT make copies of indices/values diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 036666bec82ac2..2abc10e62c3d46 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -363,7 +363,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { - return raw_copy_sparse_(r_, t); + return copy_sparse_to_sparse_(r_, t); } if (t._nnz() == 0) { return mul_out_sparse_scalar(r_, src, value); diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 0891f6d9f4f492..03309f8fe9eee3 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -18,7 +18,8 @@ namespace at { Tensor & TypeDefault::copy_(Tensor & self, const Tensor & src, bool non_blocking) const { Tensor b_src; - std::tie(b_src) = expand_inplace(self, src, "copy"); + if (is_sparse()) b_src = src; + else std::tie(b_src) = expand_inplace(self, src, "copy"); return s_copy_(self, b_src, non_blocking); } @@ -28,19 +29,11 @@ Tensor TypeDefault::copy(const Tensor & src, bool non_blocking, optional device_guard.set_index(to_device.value().index()); } AT_CHECK(src.defined(), "attempt to copy an undefined tensor"); - if (is_sparse()) { - auto indices = src._indices(); - auto values = src._values(); - auto & this_dense = toBackend(is_cuda() ? Backend::CUDA : Backend::CPU); - auto & this_dense_idx = this_dense.toScalarType(ScalarType::Long); - auto indices_copy = this_dense_idx.copy(indices, non_blocking); - auto values_copy = this_dense.copy(values, non_blocking); - return _sparse_coo_tensor_unsafe(indices_copy, values_copy, src.sizes()); - } else { - Tensor r = this->tensor(src.sizes()); - r.copy_(src, non_blocking); - return r; - } + Tensor r; + if (is_sparse()) r = this->native_tensor(); + else r = this->tensor(src.sizes()); + r.copy_(src, non_blocking); + return r; } void TypeDefault::backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const { diff --git a/aten/src/ATen/test/apply_test.cpp b/aten/src/ATen/test/apply_test.cpp index fc39eccee3926b..93a2d705bd8a08 100644 --- a/aten/src/ATen/test/apply_test.cpp +++ b/aten/src/ATen/test/apply_test.cpp @@ -1,121 +1,135 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "cuda.h" #include "cuda_runtime.h" #include "ATen/cuda/detail/TensorInfo.cuh" - +#define ASSERT_EQ_CUDA(X, Y) \ + { \ + bool _isEQ = X == Y; \ + ASSERT_TRUE(_isEQ); \ + } /* -Tests related to tensor indexing and applying operations. + Tests related to tensor indexing and applying operations. 
*/ #ifndef _WIN32 -CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { - int sizes[] = {4, 4}; - int strides[] = {4, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == (4 * 4)); +// CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D +// contiguous") { +TEST(ApplyTest, Contiguous2D) { + int sizes[] = {4, 4}; + int strides[] = {4, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], (4 * 4)); } -CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { - int sizes[] = {6, 3, 7}; - int strides[] = {3 * 7, 7, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == (6 * 3 * 7)); +// CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D +// contiguous") { +TEST(ApplyTest, Contiguous3D) { + int sizes[] = {6, 3, 7}; + int strides[] = {3 * 7, 7, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], (6 * 3 * 7)); } - -CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { - int sizes[] = {4, 3, 2}; - int strides[] = {3 * 3, 3, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 2); - CATCH_REQUIRE(ti.sizes[0] == (4 * 3)); - CATCH_REQUIRE(ti.sizes[1] == 2); +// CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor +// to a 2D tensor") { +TEST(ApplyTest, PartialCollapse3D) { + int sizes[] = {4, 3, 2}; + int strides[] = {3 * 3, 3, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 2); + ASSERT_EQ_CUDA(ti.sizes[0], (4 * 3)); + ASSERT_EQ_CUDA(ti.sizes[1], 2); } -CATCH_TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { - int sizes[] = {3, 2}; - int strides[] = {2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == (3 * 2)); - CATCH_REQUIRE(ti.strides[0] == 2); +// Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor +TEST(ApplyTest, StridedCollapse2D) { + int sizes[] = {3, 2}; + int strides[] = {2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], (3 * 2)); + ASSERT_EQ_CUDA(ti.strides[0], 2); } -CATCH_TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ - int sizes[] = {3, 6, 5, 2}; - int strides[] = {6 * 22, 22, 2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 2); - CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); - CATCH_REQUIRE(ti.strides[0] == 22); - CATCH_REQUIRE(ti.sizes[1] == (5 * 2)); - CATCH_REQUIRE(ti.strides[1] == 2); +// Collapses a 4D tensor to a 2D tensor +TEST(ApplyTest, PartialStridedCollapse4D) { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 2); + ASSERT_EQ_CUDA(ti.sizes[0], (3 * 6)); + 
ASSERT_EQ_CUDA(ti.strides[0], 22); + ASSERT_EQ_CUDA(ti.sizes[1], (5 * 2)); + ASSERT_EQ_CUDA(ti.strides[1], 2); } -CATCH_TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { - int sizes[] = {1, 10, 1, 5, 4}; - int strides[] = {4, 0, 16, 0, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 2); - CATCH_REQUIRE(ti.sizes[0] == (10 * 5)); - CATCH_REQUIRE(ti.strides[0] == 0); - CATCH_REQUIRE(ti.sizes[1] == 4); - CATCH_REQUIRE(ti.strides[1] == 1); +// Collapses a 5D tensor to a 1D tensor +TEST(ApplyTest, CollapsesZerosAndOnes) { + int sizes[] = {1, 10, 1, 5, 4}; + int strides[] = {4, 0, 16, 0, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 2); + ASSERT_EQ_CUDA(ti.sizes[0], (10 * 5)); + ASSERT_EQ_CUDA(ti.strides[0], 0); + ASSERT_EQ_CUDA(ti.sizes[1], 4); + ASSERT_EQ_CUDA(ti.strides[1], 1); } -CATCH_TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { - int sizes[] = {1, 1, 1}; - int strides[] = {17, 12, 3}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - CATCH_REQUIRE(ti.collapseDims() == 0); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == 1); - CATCH_REQUIRE(ti.strides[0] == 1); +// Collapses a 3D tensor to a point tensor +TEST(ApplyTest, CollapseToPointTensor) { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ASSERT_EQ_CUDA(ti.collapseDims(), 0); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], 1); + ASSERT_EQ_CUDA(ti.strides[0], 1); } -CATCH_TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { - int sizes[] = {3, 6, 5, 2}; - int strides[] = {6 * 22, 22, 2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - CATCH_REQUIRE(ti.collapseDims(1) == 1); - CATCH_REQUIRE(ti.dims == 3); - CATCH_REQUIRE(ti.sizes[0] == 3); - CATCH_REQUIRE(ti.strides[0] == (6 * 22)); - CATCH_REQUIRE(ti.sizes[1] == 6); - CATCH_REQUIRE(ti.strides[1] == 22); - CATCH_REQUIRE(ti.sizes[2] == (5 * 2)); - CATCH_REQUIRE(ti.strides[2] == 2); +// Collapses a 4D tensor to a 3D tensor +TEST(ApplyTest, ExcludingInContiguous4D) { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ASSERT_EQ_CUDA(ti.collapseDims(1), 1); + ASSERT_EQ_CUDA(ti.dims, 3); + ASSERT_EQ_CUDA(ti.sizes[0], 3); + ASSERT_EQ_CUDA(ti.strides[0], (6 * 22)); + ASSERT_EQ_CUDA(ti.sizes[1], 6); + ASSERT_EQ_CUDA(ti.strides[1], 22); + ASSERT_EQ_CUDA(ti.sizes[2], (5 * 2)); + ASSERT_EQ_CUDA(ti.strides[2], 2); } -CATCH_TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { - int sizes[] = {3, 6, 5, 2}; - int strides[] = {6 * 22, 22, 2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - CATCH_REQUIRE(ti.collapseDims(2) == 1); - CATCH_REQUIRE(ti.dims == 3); - CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); - CATCH_REQUIRE(ti.strides[0] == 22); - CATCH_REQUIRE(ti.sizes[1] == 5); - CATCH_REQUIRE(ti.strides[1] == 4); - CATCH_REQUIRE(ti.sizes[2] == 2); - CATCH_REQUIRE(ti.strides[2] == 2); +// Collapses a 4D tensor to a 3D tensor +TEST(ApplyTest, RovingExclusion) { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ASSERT_EQ_CUDA(ti.collapseDims(2), 1); + ASSERT_EQ_CUDA(ti.dims, 3); + ASSERT_EQ_CUDA(ti.sizes[0], (3 * 
6)); + ASSERT_EQ_CUDA(ti.strides[0], 22); + ASSERT_EQ_CUDA(ti.sizes[1], 5); + ASSERT_EQ_CUDA(ti.strides[1], 4); + ASSERT_EQ_CUDA(ti.sizes[2], 2); + ASSERT_EQ_CUDA(ti.strides[2], 2); } -CATCH_TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { - int sizes[] = {1, 1, 1}; - int strides[] = {17, 12, 3}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - _CATCH_REQUIRE_THROWS(ti.collapseDims(5)); -} - +// Attempts to exclude a nonexisting dimension +TEST(ApplyTest, InvalidExclusion) { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ASSERT_ANY_THROW(ti.collapseDims(5)); +} #endif diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index ab7e3522bbedae..71715a2d4b0d6e 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/CPUApplyUtils.h" @@ -108,32 +107,38 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { }); } -CATCH_TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { +// apply utils test 2-dim small contiguous +TEST(ApplyUtilsTest, Contiguous2D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}, -1, -1); } -CATCH_TEST_CASE("apply utils test 2-dim small", "[cpu]") { +// apply utils test 2-dim small +TEST(ApplyUtilsTest, Small2D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}); } -CATCH_TEST_CASE("apply utils test 2-dim", "[cpu]") { +// apply utils test 2-dim +TEST(ApplyUtilsTest, _2D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {20, 10}); } -CATCH_TEST_CASE("apply utils test 3-dim", "[cpu]") { +// apply utils test 3-dim +TEST(ApplyUtilsTest, _3D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2}); } -CATCH_TEST_CASE("apply utils test 3-dim medium", "[cpu]") { +// apply utils test 3-dim medium +TEST(ApplyUtilsTest, Medium3D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 40, 2}); } -CATCH_TEST_CASE("apply utils test 10-dim", "[cpu]") { +// apply utils test 10-dim +TEST(ApplyUtilsTest, _10D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); } diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index edb3f79fd2d55d..96c5ed11897481 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -8,17 +8,17 @@ using namespace std; using namespace at; void trace() { - Tensor foo = rand({12,12}); + Tensor foo = rand({12, 12}); // ASSERT foo is 2-dimensional and holds floats. 
- auto foo_a = foo.accessor(); + auto foo_a = foo.accessor(); float trace = 0; - for(int i = 0; i < foo_a.size(0); i++) { + for (int i = 0; i < foo_a.size(0); i++) { trace += foo_a[i][i]; } - EXPECT_FLOAT_EQ(foo.trace().item(), trace); + ASSERT_FLOAT_EQ(foo.trace().item(), trace); } // TEST_CASE( "atest", "[]" ) { @@ -26,82 +26,78 @@ TEST(atest, atest) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); - auto foo = rand({12,6}); + auto foo = rand({12, 6}); - EXPECT_EQ(foo.size(0), 12); - EXPECT_EQ(foo.size(1), 6); + ASSERT_EQ(foo.size(0), 12); + ASSERT_EQ(foo.size(1), 6); - foo = foo+foo*3; + foo = foo + foo * 3; foo -= 4; Scalar a = 4; float b = a.to(); - EXPECT_EQ(b, 4); + ASSERT_EQ(b, 4); - foo = (foo*foo) == (foo.pow(3)); - foo = 2 + (foo+1); - //foo = foo[3]; - auto foo_v = foo.accessor(); + foo = (foo * foo) == (foo.pow(3)); + foo = 2 + (foo + 1); + // foo = foo[3]; + auto foo_v = foo.accessor(); - for(int i = 0; i < foo_v.size(0); i++) { - for(int j = 0; j < foo_v.size(1); j++) { + for (int i = 0; i < foo_v.size(0); i++) { + for (int j = 0; j < foo_v.size(1); j++) { foo_v[i][j]++; } } - EXPECT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); + ASSERT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); trace(); - float data[] = { 1, 2, 3, - 4, 5, 6}; + float data[] = {1, 2, 3, 4, 5, 6}; - auto f = CPU(kFloat).tensorFromBlob(data, {1,2,3}); - auto f_a = f.accessor(); + auto f = CPU(kFloat).tensorFromBlob(data, {1, 2, 3}); + auto f_a = f.accessor(); - EXPECT_EQ(f_a[0][0][0], 1.0); - EXPECT_EQ(f_a[0][1][1], 5.0); + ASSERT_EQ(f_a[0][0][0], 1.0); + ASSERT_EQ(f_a[0][1][1], 5.0); - EXPECT_EQ(f.strides()[0], 6); - EXPECT_EQ(f.strides()[1], 3); - EXPECT_EQ(f.strides()[2], 1); - EXPECT_EQ(f.sizes()[0], 1); - EXPECT_EQ(f.sizes()[1], 2); - EXPECT_EQ(f.sizes()[2], 3); + ASSERT_EQ(f.strides()[0], 6); + ASSERT_EQ(f.strides()[1], 3); + ASSERT_EQ(f.strides()[2], 1); + ASSERT_EQ(f.sizes()[0], 1); + ASSERT_EQ(f.sizes()[1], 2); + ASSERT_EQ(f.sizes()[2], 3); // TODO(ezyang): maybe do a more precise exception type. 
- ASSERT_THROW(f.resize_({3,4,5}), std::exception); + ASSERT_THROW(f.resize_({3, 4, 5}), std::exception); { int isgone = 0; { - auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { - isgone++; - }); + auto f2 = + CPU(kFloat).tensorFromBlob(data, {1, 2, 3}, [&](void*) { isgone++; }); } - EXPECT_EQ(isgone, 1); + ASSERT_EQ(isgone, 1); } { int isgone = 0; Tensor a_view; { - auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { - isgone++; - }); - a_view = f2.view({3,2,1}); + auto f2 = + CPU(kFloat).tensorFromBlob(data, {1, 2, 3}, [&](void*) { isgone++; }); + a_view = f2.view({3, 2, 1}); } - EXPECT_EQ(isgone, 0); + ASSERT_EQ(isgone, 0); a_view.reset(); - EXPECT_EQ(isgone, 1); + ASSERT_EQ(isgone, 1); } - if(at::hasCUDA()) { + if (at::hasCUDA()) { int isgone = 0; { - auto base = CUDA(kFloat).tensor({1,2,3}); - auto f2 = CUDA(kFloat).tensorFromBlob(base.data_ptr(), {1,2,3}, [&](void*) { - isgone++; - }); + auto base = CUDA(kFloat).tensor({1, 2, 3}); + auto f2 = CUDA(kFloat).tensorFromBlob( + base.data_ptr(), {1, 2, 3}, [&](void*) { isgone++; }); } - EXPECT_EQ(isgone, 1); + ASSERT_EQ(isgone, 1); } } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 361d24b5a6b76f..791d80b1f42f95 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -19,36 +19,35 @@ using namespace at; using Catch::Matchers::StartsWith; -static void test(Type & type) { - CATCH_SECTION( "resize" ) { +static void test(Type& type) { + CATCH_SECTION("resize") { auto a = at::empty({0}, type.options()); - a.resize_({3,4}); + a.resize_({3, 4}); CATCH_REQUIRE(a.numel() == 12); a.resize_({5, 7}); CATCH_REQUIRE(a.numel() == 35); - } - CATCH_SECTION( "ones and dot" ) { + CATCH_SECTION("ones and dot") { Tensor b0 = ones({1, 1}, type); - CATCH_REQUIRE(2 == (b0+b0).sum().item()); + CATCH_REQUIRE(2 == (b0 + b0).sum().item()); Tensor b1 = ones({1, 2}, type); - CATCH_REQUIRE(4 == (b1+b1).sum().item()); + CATCH_REQUIRE(4 == (b1 + b1).sum().item()); Tensor b = ones({3, 4}, type); - CATCH_REQUIRE(24 == (b+b).sum().item()); + CATCH_REQUIRE(24 == (b + b).sum().item()); CATCH_REQUIRE(12 == b.numel()); CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).item() == 12); } - CATCH_SECTION( "rand" ) { - for(auto i = 0; i < 10; i++) { - Tensor a = rand({3,4}, type.toScalarType(i % 2 == 0 ? kFloat : kDouble)); + CATCH_SECTION("rand") { + for (auto i = 0; i < 10; i++) { + Tensor a = rand({3, 4}, type.toScalarType(i % 2 == 0 ? 
kFloat : kDouble)); } } - CATCH_SECTION( "sort" ) { + CATCH_SECTION("sort") { Tensor b = rand({3, 4}, type); auto z = b.sort(1); @@ -57,93 +56,101 @@ static void test(Type & type) { CATCH_REQUIRE(z_sorted[0][0].item() < z_sorted[0][1].item()); } - if(type.backend() != Backend::CUDA) - CATCH_SECTION( "randperm" ) { - Tensor b = randperm(15, type); - Tensor rv, ri; - std::tie(rv, ri) = sort(b, 0); - CATCH_REQUIRE(rv[0].item() <= rv[1].item()); - } + if (type.backend() != Backend::CUDA) + CATCH_SECTION("randperm") { + Tensor b = randperm(15, type); + Tensor rv, ri; + std::tie(rv, ri) = sort(b, 0); + CATCH_REQUIRE(rv[0].item() <= rv[1].item()); + } - CATCH_SECTION( "context" ) { + CATCH_SECTION("context") { std::stringstream ss; ss << "context: " << std::hex << (int64_t)&globalContext() << std::endl; } - CATCH_SECTION( "add" ) { + CATCH_SECTION("add") { Tensor a = rand({3, 4}, type); Tensor b = rand({3, 4}, type); Tensor c = add(a, add(a, b)); - //TODO:0-dim Tensor d(3.f); + // TODO:0-dim Tensor d(3.f); Scalar d = 3.f; - CATCH_REQUIRE( add(c, d).allclose(a + a + b + d) ); + CATCH_REQUIRE(add(c, d).allclose(a + a + b + d)); } - CATCH_SECTION( "loads of adds" ) { + CATCH_SECTION("loads of adds") { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); - for(auto i = 0; i < 100000; i++) { + for (auto i = 0; i < 100000; i++) { add_out(r, r, d); } auto end = std::chrono::high_resolution_clock::now(); - //TODO TEST PERF? - std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); + // TODO TEST PERF? + std::cout << std::dec << " " + << std::chrono::duration_cast( + end - begin) + .count() + << " ms" << std::endl; + CATCH_REQUIRE(norm(100000 * d).item() == norm(r).item()); } - CATCH_SECTION( "loads of adds (with copy)" ) { + CATCH_SECTION("loads of adds (with copy)") { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); - for(auto i = 0; i < 100000; i++) { + for (auto i = 0; i < 100000; i++) { r = add(r, d); } auto end = std::chrono::high_resolution_clock::now(); - //TODO TEST PERF? - std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); + // TODO TEST PERF? 
+ std::cout << std::dec << " " + << std::chrono::duration_cast( + end - begin) + .count() + << " ms" << std::endl; + CATCH_REQUIRE(norm(100000 * d).item() == norm(r).item()); } - CATCH_SECTION( "isContiguous" ) { + CATCH_SECTION("isContiguous") { Tensor a = rand({3, 4}, type); CATCH_REQUIRE(a.is_contiguous()); a = a.transpose(0, 1); CATCH_REQUIRE(!a.is_contiguous()); } - CATCH_SECTION( "permute" ) { + CATCH_SECTION("permute") { Tensor a = rand({3, 4, 5}, type); Tensor b = a.permute({1, 2, 0}); CATCH_REQUIRE(b.sizes().equals({4, 5, 3})); CATCH_REQUIRE(b.strides().equals({5, 1, 20})); } - CATCH_SECTION( "mm" ) { + CATCH_SECTION("mm") { Tensor a = rand({3, 4}, type); Tensor b = rand({4}, type); Tensor c = mv(a, b); CATCH_REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); } - CATCH_SECTION( "squeeze" ) { + CATCH_SECTION("squeeze") { Tensor a = rand({2, 1}, type); Tensor b = squeeze(a); CATCH_REQUIRE(b.dim() == 1); a = rand({1}, type); b = squeeze(a); - //TODO 0-dim squeeze + // TODO 0-dim squeeze CATCH_REQUIRE(a[0].equal(b)); } - CATCH_SECTION( "copy" ) { + CATCH_SECTION("copy") { Tensor a = zeros({4, 3}, type); Tensor e = rand({4, 3}, type); a.copy_(e); CATCH_REQUIRE(a.equal(e)); } - CATCH_SECTION( "copy (broadcasting)" ) { + CATCH_SECTION("copy (broadcasting)") { Tensor a = zeros({4, 3}, type); Tensor e = rand({3}, type); a.copy_(e); @@ -152,12 +159,12 @@ static void test(Type & type) { } } - CATCH_SECTION( "abs(value)" ) { + CATCH_SECTION("abs(value)") { Tensor r = at::abs(type.scalarTensor(-3)); CATCH_REQUIRE(r.item() == 3); } -//TODO(zach): operator overloads +// TODO(zach): operator overloads #if 0 { std::cout << "eq (value):" << std::endl; @@ -168,60 +175,60 @@ static void test(Type & type) { } #endif - CATCH_SECTION( "adding a value with a scalar" ) { + CATCH_SECTION("adding a value with a scalar") { Tensor a = rand({4, 3}, type); - CATCH_REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); + CATCH_REQUIRE((ones({4, 3}, type) + a).equal(add(a, 1))); } - CATCH_SECTION( "select" ) { + CATCH_SECTION("select") { Tensor a = rand({3, 7}, type); auto a_13 = select(a, 1, 3); auto a_13_02 = select(select(a, 1, 3), 0, 2); - CATCH_REQUIRE( a[0][3].equal(a_13[0]) ); - CATCH_REQUIRE( a[2][3].equal(a_13_02) ); + CATCH_REQUIRE(a[0][3].equal(a_13[0])); + CATCH_REQUIRE(a[2][3].equal(a_13_02)); } - CATCH_SECTION( "zero-dim" ) { - Tensor a = type.scalarTensor(4); //rand(type, {1}); + CATCH_SECTION("zero-dim") { + Tensor a = type.scalarTensor(4); // rand(type, {1}); - Tensor b = rand({3,4}, type); + Tensor b = rand({3, 4}, type); CATCH_REQUIRE((a + a).dim() == 0); CATCH_REQUIRE((1 + a).dim() == 0); CATCH_REQUIRE((b + a).dim() == 2); CATCH_REQUIRE((a + b).dim() == 2); - auto c = rand({3,4}, type); + auto c = rand({3, 4}, type); CATCH_REQUIRE(c[1][2].dim() == 0); - auto f = rand({3,4}, type); + auto f = rand({3, 4}, type); f[2] = zeros({4}, type); f[1][0] = -1; CATCH_REQUIRE(f[2][0].item() == 0); } - CATCH_SECTION( "tensor from TH" ) { + CATCH_SECTION("tensor from TH") { int a = 4; - THFloatTensor *t = THFloatTensor_newWithSize2d(a, a); + THFloatTensor* t = THFloatTensor_newWithSize2d(a, a); THFloatTensor_fill(t, a); - Tensor tt = CPU(kFloat).unsafeTensorFromTH(t,false); + Tensor tt = CPU(kFloat).unsafeTensorFromTH(t, false); CATCH_REQUIRE_NOTHROW(tt); } - CATCH_SECTION( "item" ) { - Tensor a = zeros({3,4}); - Tensor b = ones({3,7}); - Tensor c = cat({a,b},1); + CATCH_SECTION("item") { + Tensor a = zeros({3, 4}); + Tensor b = ones({3, 7}); + Tensor c = cat({a, b}, 1); CATCH_REQUIRE(c.size(1) == 11); Tensor 
e = rand({}); CATCH_REQUIRE(*e.data() == e.sum().item()); } - CATCH_SECTION( "to string" ) { - Tensor b = ones({3,7})*.0000001f; + CATCH_SECTION("to string") { + Tensor b = ones({3, 7}) * .0000001f; std::stringstream s; s << b << "\n"; std::string expect = "1e-07 *"; - CATCH_REQUIRE(s.str().substr(0,expect.size()) == expect); + CATCH_REQUIRE(s.str().substr(0, expect.size()) == expect); } CATCH_SECTION("indexing by Scalar") { Tensor tensor = arange(0, 10, kInt); @@ -243,8 +250,7 @@ static void test(Type & type) { } CATCH_REQUIRE_THROWS_WITH( tensor[Scalar(3.14)].equal(one), - StartsWith( - "Can only index tensors with integral scalars")); + StartsWith("Can only index tensors with integral scalars")); } CATCH_SECTION("indexing by zero-dim tensor") { Tensor tensor = arange(0, 10, kInt); @@ -254,8 +260,7 @@ static void test(Type & type) { } CATCH_REQUIRE_THROWS_WITH( tensor[ones({}) * 3.14].equal(one), - StartsWith( - "Can only index tensors with integral scalars")); + StartsWith("Can only index tensors with integral scalars")); CATCH_REQUIRE_THROWS_WITH( tensor[Tensor()].equal(one), StartsWith("Can only index with tensors that are defined")); @@ -275,16 +280,16 @@ static void test(Type & type) { } } -CATCH_TEST_CASE( "basic tests CPU", "[cpu]" ) { +CATCH_TEST_CASE("basic tests CPU", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -CATCH_TEST_CASE( "basic tests GPU", "[cuda]" ) { +CATCH_TEST_CASE("basic tests GPU", "[cuda]") { manual_seed(123, at::kCUDA); - if(at::hasCUDA()) { + if (at::hasCUDA()) { test(CUDA(kFloat)); } } diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp index 822a1d79df1bda..8bebb7d8fdd907 100644 --- a/aten/src/ATen/test/broadcast_test.cpp +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -1,154 +1,192 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" + +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -CATCH_TEST_CASE( "broadcast", "[]" ) { +// can't expand empty tensor +void TestEmptyTensor(Type& T) { + auto empty = randn({0}, T); + ASSERT_ANY_THROW(empty.expand({3})); +} + +// out-place function with 2 args +void TestOut2Basic(Type& T) { + auto a = randn({3, 1}, T); + auto b = randn({5}, T); + std::vector expanded_sizes = {3, 5}; + ASSERT_TRUE( + (a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); +} + +// with scalar +void TestOut2WithScalar(Type& T) { + auto aScalar = ones({1}, T); + aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + auto b = randn({3, 5}, T); + ASSERT_TRUE( + (aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); +} +// old fallback behavior yields error +void TestOut2OldFallback(Type& T) { + auto a = randn({3, 5}, T); + auto b = randn({5, 3}, T); + ASSERT_ANY_THROW(a + b); +} + +// with mismatched sizes +void TestOut2MismatchedSizes(Type& T) { + auto a = randn({3, 5}, T); + auto b = randn({7, 5}, T); + ASSERT_ANY_THROW(a + b); +} + +// out-place function with 3 args +void TestOut3Basic(Type& T) { + auto a = randn({3, 1, 1}, T); + auto b = randn({1, 2, 1}, T); + auto c = randn({1, 1, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + ASSERT_TRUE((a + b + c).equal( + a.expand(expanded_sizes) + b.expand(expanded_sizes) + + c.expand(expanded_sizes))); +} + +// with scalar +void TestOut3WithScalar(Type& T) { + auto aTensorScalar = ones({1}, T); + aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + auto b = randn({3, 2, 1}, T); + auto c = randn({1, 2, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + 
ASSERT_TRUE(aTensorScalar.addcmul(b, c).equal( + aTensorScalar.expand(expanded_sizes) + .addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); +} + +// old fallback behavior yields error +void TestOut3OldFallback(Type& T) { + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 3, 2}, T); + ASSERT_ANY_THROW(a.addcmul(b, c)); +} + +// with mismatched sizes +void TestOut3MismatchedSizes(Type& T) { + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 5, 5}, T); + ASSERT_ANY_THROW(a.addcmul(b, c)); +} + +// in-place function with 2 args +void TestIn2Basic(Type& T) { + auto a = randn({3, 5}, T); + auto b = randn({3, 1}, T); + ASSERT_TRUE((a + b).equal(a + b.expand({3, 5}))); +} + +// with scalar +void TestIn2WithScalar(Type& T) { + auto a = randn({3, 5}, T); + auto bScalar = ones({1}, T); + bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); +} + +// error: would have to expand inplace arg +void TestIn2ExpandError(Type& T) { + auto a = randn({1, 5}, T); + auto b = randn({3, 1}, T); + ASSERT_ANY_THROW(a.add_(b)); +} + +// in-place function with 3 args +void TestIn3Basic(Type& T) { + auto a = randn({3, 5, 2}, T); + auto b = randn({3, 1, 2}, T); + auto c = randn({1, 5, 1}, T); + auto aClone = a.clone(); + ASSERT_TRUE(a.addcmul_(b, c).equal( + aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); +} + +// with scalar +void TestIn3WithScalar(Type& T) { + auto a = randn({3, 5, 2}, T); + auto b = randn({3, 1, 2}, T); + auto c = randn({1, 5, 1}, T); + auto aClone = a.clone(); + auto bScalar = ones({1}, T); + bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE(a.addcmul_(bScalar, c) + .equal(aClone.addcmul_( + bScalar.expand(a.sizes()), c.expand(a.sizes())))); +} + +// error: would have to expand inplace arg +void TestIn3ExpandError(Type& T) { + auto a = randn({1, 3, 5}, T); + auto b = randn({4, 1, 1}, T); + auto c = randn({1, 3, 1}, T); + ASSERT_ANY_THROW(a.addcmul_(b, c)); +} + +// explicit dim specification +void TestExplicitDimBasic(Type& T) { + auto a = randn({1}, T); + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + ASSERT_TRUE(a.addmm(b, c).equal(a.expand({5, 7}).addmm(b, c))); +} + +// with scalar +void TestExplicitDimWithScalar(Type& T) { + auto a = randn({1}, T); + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + Tensor aScalar = ones({1}, T); + aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); +} + +// with mismatched sizes +void TestExplicitDimWithMismatchedSizes(Type& T) { + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + auto a = randn({3, 3}, T); + ASSERT_ANY_THROW(a.addmm(b, c)); +} + +TEST(BroadcastTest, Broadcast) { manual_seed(123, at::kCPU); + Type& T = CPU(kFloat); + + TestEmptyTensor(T); + + TestOut2Basic(T); + TestOut2WithScalar(T); + TestOut2OldFallback(T); + TestOut2MismatchedSizes(T); + + TestOut3Basic(T); + TestOut3WithScalar(T); + TestOut3OldFallback(T); + TestOut3MismatchedSizes(T); + + TestIn2Basic(T); + TestIn2WithScalar(T); + TestIn2ExpandError(T); + + TestIn3Basic(T); + TestIn3WithScalar(T); + TestIn3ExpandError(T); - Type & T = CPU(kFloat); - - // 0) pre-req tests: - CATCH_SECTION( "can't expand empty tensor" ) { - auto empty = randn({0}, T); - _CATCH_REQUIRE_THROWS(empty.expand({3})); - } - - // 1) out-place function with 2 args - CATCH_SECTION( "out-place function with 2 args" ) { - - CATCH_SECTION( 
"basic" ) { - auto a = randn({3, 1}, T); - auto b = randn({5}, T); - std::vector expanded_sizes = {3, 5}; - CATCH_REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); - } - - CATCH_SECTION( "with scalar" ) { - auto aScalar = ones({1}, T); - aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - auto b = randn({3, 5}, T); - CATCH_REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); - } - - CATCH_SECTION( "old fallback behavior yields error" ) { - auto a = randn({3, 5}, T); - auto b = randn({5, 3}, T); - _CATCH_REQUIRE_THROWS(a + b); - } - - CATCH_SECTION( "with mismatched sizes" ) { - auto a = randn({3, 5}, T); - auto b = randn({7, 5}, T); - _CATCH_REQUIRE_THROWS(a + b); - } - } - - CATCH_SECTION( "out-place function with 3 args" ) { - - CATCH_SECTION( "basic" ) { - auto a = randn({3, 1, 1}, T); - auto b = randn({1, 2, 1}, T); - auto c = randn({1, 1, 5}, T); - std::vector expanded_sizes = {3, 2, 5}; - CATCH_REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); - } - - CATCH_SECTION( "with scalar" ) { - auto aTensorScalar = ones({1}, T); - aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - auto b = randn({3, 2, 1}, T); - auto c = randn({1, 2, 5}, T); - std::vector expanded_sizes = {3, 2, 5}; - CATCH_REQUIRE(aTensorScalar.addcmul(b, c).equal( - aTensorScalar.expand(expanded_sizes).addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); - } - - CATCH_SECTION( "old fallback behavior yields error" ) { - auto a = randn({3, 2, 5}, T); - auto b = randn({2, 3, 5}, T); - auto c = randn({5, 3, 2}, T); - _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); - } - - CATCH_SECTION( "with mismatched sizes" ){ - auto a = randn({3, 2, 5}, T); - auto b = randn({2, 3, 5}, T); - auto c = randn({5, 5, 5}, T); - _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); - } - } - - CATCH_SECTION( "in-place function with 2 args" ) { - CATCH_SECTION( "basic" ) { - auto a = randn({3, 5}, T); - auto b = randn({3, 1}, T); - CATCH_REQUIRE((a + b).equal(a + b.expand({3, 5}))); - } - - CATCH_SECTION( "with scalar" ) { - auto a = randn({3, 5}, T); - auto bScalar = ones({1}, T); - bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); - } - - CATCH_SECTION( "error: would have to expand inplace arg" ) { - auto a = randn({1, 5}, T); - auto b = randn({3, 1}, T); - _CATCH_REQUIRE_THROWS(a.add_(b)); - } - } - - CATCH_SECTION( "in-place function with 3 args" ) { - - auto a = randn({3, 5, 2}, T); - auto b = randn({3, 1, 2}, T); - auto c = randn({1, 5, 1}, T); - - CATCH_SECTION( "basic" ) { - auto aClone = a.clone(); - CATCH_REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); - } - - CATCH_SECTION( "with scalar" ) { - auto aClone = a.clone(); - auto bScalar = ones({1}, T); - bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); - } - - CATCH_SECTION( "error: would have to expand inplace arg" ) { - auto a = randn({1, 3, 5}, T); - auto b = randn({4, 1, 1}, T); - auto c = randn({1, 3, 1}, T); - _CATCH_REQUIRE_THROWS(a.addcmul_(b, c)); - } - } - - CATCH_SECTION( "explicit dim specification" ) { - - auto a = randn({1}, T); - auto b = randn({5, 3}, T); - auto c = randn({3, 7}, T); - - CATCH_SECTION( "basic" ) { - CATCH_REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); - } - - CATCH_SECTION( "with scalar" ) { - Tensor aScalar = 
ones({1}, T); - aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); - } - - CATCH_SECTION( "with mismatched sizes" ) { - auto a = randn({3, 3}, T); - _CATCH_REQUIRE_THROWS(a.addmm(b, c)); - } - } + TestExplicitDimBasic(T); + TestExplicitDimWithScalar(T); + TestExplicitDimWithMismatchedSizes(T); } diff --git a/aten/src/ATen/test/catch_utils.hpp b/aten/src/ATen/test/catch_utils.hpp index b9b0a87990a9ce..9e7696b1372263 100644 --- a/aten/src/ATen/test/catch_utils.hpp +++ b/aten/src/ATen/test/catch_utils.hpp @@ -3,6 +3,8 @@ #define CATCH_CONFIG_PREFIX_ALL #include -// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning; -// define our own version that doesn't warn. -#define _CATCH_REQUIRE_THROWS( ... ) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ ) +// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes +// warning; define our own version that doesn't warn. +#define _CATCH_REQUIRE_THROWS(...) \ + INTERNAL_CATCH_THROWS( \ + "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__) diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu index cce267100589e1..56ca901931384d 100644 --- a/aten/src/ATen/test/cuda_half_test.cu +++ b/aten/src/ATen/test/cuda_half_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -12,7 +11,6 @@ using namespace at; __device__ void test(){ - // test half construction and implicit conversions in device assert(Half(3) == Half(3.0f)); assert(static_cast(3.0f) == Half(3.0f)); @@ -24,7 +22,7 @@ __device__ void test(){ __half c = a - Half(b); assert(static_cast(c) == Half(1.0)); - // asserting if the functions used on + // asserting if the functions used on // half types give almost equivalent results when using // functions on double. // The purpose of these asserts are to test the device side @@ -61,17 +59,18 @@ __device__ void test(){ assert(::abs(::abs(Half(-3.0)) - ::abs(-3.0f)) <= threshold); assert(::abs(::round(Half(2.3)) - ::round(2.3f)) <= threshold); assert(::abs(::pow(Half(2.0), Half(10.0)) - ::pow(2.0f, 10.0f)) <= threshold); - assert(::abs(::atan2(Half(7.0), Half(0.0)) - ::atan2(7.0f, 0.0f)) <= threshold); + assert( + ::abs(::atan2(Half(7.0), Half(0.0)) - ::atan2(7.0f, 0.0f)) <= threshold); // note: can't use namespace on isnan and isinf in device code - #ifdef _MSC_VER - // Windows requires this explicit conversion. The reason is unclear - // related issue with clang: https://reviews.llvm.org/D37906 - assert(::abs(::isnan((float)Half(0.0)) - ::isnan(0.0f)) <= threshold); - assert(::abs(::isinf((float)Half(0.0)) - ::isinf(0.0f)) <= threshold); - #else - assert(::abs(::isnan(Half(0.0)) - ::isnan(0.0f)) <= threshold); - assert(::abs(::isinf(Half(0.0)) - ::isinf(0.0f)) <= threshold); - #endif +#ifdef _MSC_VER + // Windows requires this explicit conversion. 
The reason is unclear + // related issue with clang: https://reviews.llvm.org/D37906 + assert(::abs(::isnan((float)Half(0.0)) - ::isnan(0.0f)) <= threshold); + assert(::abs(::isinf((float)Half(0.0)) - ::isinf(0.0f)) <= threshold); +#else + assert(::abs(::isnan(Half(0.0)) - ::isnan(0.0f)) <= threshold); + assert(::abs(::isinf(Half(0.0)) - ::isinf(0.0f)) <= threshold); +#endif } __global__ void kernel(){ @@ -79,12 +78,13 @@ __global__ void kernel(){ } void launch_function(){ - kernel<<<1,1>>>(); + kernel<<<1, 1>>>(); } -CATCH_TEST_CASE( "half common math functions tests in device", "[cuda]" ) { +// half common math functions tests in device +TEST(HalfCuda, HalfCuda) { launch_function(); cudaError_t err = cudaDeviceSynchronize(); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + ASSERT_TRUE(isEQ); } - diff --git a/aten/src/ATen/test/cuda_optional_test.cu b/aten/src/ATen/test/cuda_optional_test.cu index b64c530b355914..128e1cf5f5147e 100644 --- a/aten/src/ATen/test/cuda_optional_test.cu +++ b/aten/src/ATen/test/cuda_optional_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/optional.h" @@ -8,15 +7,15 @@ using namespace at; -CATCH_TEST_CASE( "optional in cuda files", "[cuda]" ) { +// optional in cuda files +TEST(OptionalTest, OptionalTestCUDA) { at::optional trivially_destructible; at::optional> non_trivially_destructible; - CATCH_REQUIRE(!trivially_destructible.has_value()); - CATCH_REQUIRE(!non_trivially_destructible.has_value()); + ASSERT_FALSE(trivially_destructible.has_value()); + ASSERT_FALSE(non_trivially_destructible.has_value()); trivially_destructible = {5}; non_trivially_destructible = std::vector{5, 10}; - CATCH_REQUIRE(trivially_destructible.has_value()); - CATCH_REQUIRE(non_trivially_destructible.has_value()); + ASSERT_TRUE(trivially_destructible.has_value()); + ASSERT_TRUE(non_trivially_destructible.has_value()); } - diff --git a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu index a529f38d748a1b..32f5f410bb2eb5 100644 --- a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu +++ b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -10,9 +9,10 @@ using namespace at; -__global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor resa, - PackedTensorAccessor t1a, - PackedTensorAccessor t2a){ +__global__ void test_tensor_packed_accessor_kernel( + PackedTensorAccessor resa, + PackedTensorAccessor t1a, + PackedTensorAccessor t2a) { for (int64_t i = 0; i < resa.size(0); i++) { float val = 0.0f; for (int64_t j = 0; j < t1a.size(1); j++) { @@ -22,7 +22,8 @@ __global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor(); auto stream = at::cuda::getCurrentCUDAStream(); - + test_tensor_packed_accessor_kernel<<<1, 1, 0, stream>>>(resa, t1a, t2a); cudaError_t err = cudaDeviceSynchronize(); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + ASSERT_TRUE(isEQ); auto expected = mv(t1, t2); - CATCH_REQUIRE(res.allclose(expected)); + ASSERT_TRUE(res.allclose(expected)); } diff --git a/aten/src/ATen/test/cuda_rng_test.cpp b/aten/src/ATen/test/cuda_rng_test.cpp index 7b14174d3baeb3..f5645a7978c11f 100644 --- a/aten/src/ATen/test/cuda_rng_test.cpp +++ b/aten/src/ATen/test/cuda_rng_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN 
-#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "cuda.h" @@ -21,7 +20,6 @@ void testCudaRNGMultithread() { } }; -CATCH_TEST_CASE( "CUDA RNG test", "[cuda]" ) { - CATCH_SECTION( "multithread" ) - testCudaRNGMultithread(); +TEST(Cuda_RNGTest, MultithreadRNGTest) { + testCudaRNGMultithread(); } diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp index 4391867d166772..54da9420ff60a1 100644 --- a/aten/src/ATen/test/cudnn_test.cpp +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/cudnn/Descriptors.h" @@ -9,7 +8,7 @@ using namespace at; using namespace at::native; -CATCH_TEST_CASE( "cudnn", "[cuda]" ) { +TEST(CUDNNTest, CUDNNTestCUDA) { manual_seed(123, at::kCUDA); #if CUDNN_VERSION < 7000 @@ -17,9 +16,12 @@ CATCH_TEST_CASE( "cudnn", "[cuda]" ) { DropoutDescriptor desc1, desc2; desc1.initialize_rng(at::CUDA(kByte), handle, 0.5, 42); desc2.set(handle, 0.5, desc1.state); - - CATCH_REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); - CATCH_REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); - CATCH_REQUIRE(desc1.desc()->states == desc2.desc()->states); + bool isEQ; + isEQ = (desc1.desc()->dropout == desc2.desc()->dropout); + ASSERT_TRUE(isEQ); + isEQ = (desc1.desc()->nstates == desc2.desc()->nstates); + ASSERT_TRUE(isEQ); + isEQ = (desc1.desc()->states == desc2.desc()->states); + ASSERT_TRUE(isEQ); #endif } diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp index bf0cf93f7c4064..71a8d535d01e4e 100644 --- a/aten/src/ATen/test/dlconvertor_test.cpp +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -10,18 +9,13 @@ #include "test_seed.h" using namespace at; - -CATCH_TEST_CASE( "dlconvertor", "[cpu]" ) { - +TEST(TestDlconvertor, TestDlconvertor) { manual_seed(123, at::kCPU); - CATCH_INFO( "convert ATen to DLTensor" ); - - Tensor a = rand({3,4}); + Tensor a = rand({3, 4}); DLManagedTensor* dlMTensor = toDLPack(a); - CATCH_INFO( "convert DLTensor to ATen" ); Tensor b = fromDLPack(dlMTensor); - CATCH_REQUIRE(a.equal(b)); + ASSERT_TRUE(a.equal(b)); } diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp index 32177705a2f883..5aa062f125b21a 100644 --- a/aten/src/ATen/test/half_test.cpp +++ b/aten/src/ATen/test/half_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include #include @@ -12,53 +11,53 @@ using namespace at; -CATCH_TEST_CASE( "half arithmetic", "[]" ) { +TEST(TestHalf, Arithmetic) { Half zero = 0; Half one = 1; - CATCH_REQUIRE(zero + one == one); - CATCH_REQUIRE(zero + zero == zero); - CATCH_REQUIRE(zero * one == zero); - CATCH_REQUIRE(one * one == one); - CATCH_REQUIRE(one / one == one); - CATCH_REQUIRE(one - one == zero); - CATCH_REQUIRE(one - zero == one); - CATCH_REQUIRE(zero - one == -one); - CATCH_REQUIRE(one + one == Half(2)); - CATCH_REQUIRE(one + one == 2); + ASSERT_EQ(zero + one, one); + ASSERT_EQ(zero + zero, zero); + ASSERT_EQ(zero * one, zero); + ASSERT_EQ(one * one, one); + ASSERT_EQ(one / one, one); + ASSERT_EQ(one - one, zero); + ASSERT_EQ(one - zero, one); + ASSERT_EQ(zero - one, -one); + ASSERT_EQ(one + one, Half(2)); + ASSERT_EQ(one + one, 2); } -CATCH_TEST_CASE( "half comparisons", "[]" ) { +TEST(TestHalf, 
Comparisions) { Half zero = 0; Half one = 1; - CATCH_REQUIRE(zero < one); - CATCH_REQUIRE(zero < 1); - CATCH_REQUIRE(1 > zero); - CATCH_REQUIRE(0 >= zero); - CATCH_REQUIRE(0 != one); - CATCH_REQUIRE(zero == 0); - CATCH_REQUIRE(zero == zero); - CATCH_REQUIRE(zero == -zero); + ASSERT_LT(zero, one); + ASSERT_LT(zero, 1); + ASSERT_GT(1, zero); + ASSERT_GE(0, zero); + ASSERT_NE(0, one); + ASSERT_EQ(zero, 0); + ASSERT_EQ(zero, zero); + ASSERT_EQ(zero, -zero); } -CATCH_TEST_CASE( "half cast", "[]" ) { +TEST(TestHalf, Cast) { Half value = 1.5f; - CATCH_REQUIRE((int)value == 1); - CATCH_REQUIRE((short)value == 1); - CATCH_REQUIRE((long long)value == 1LL); - CATCH_REQUIRE((float)value == 1.5f); - CATCH_REQUIRE((double)value == 1.5); - CATCH_REQUIRE((bool)value == true); - CATCH_REQUIRE((bool)Half(0.0f) == false); + ASSERT_EQ((int)value, 1); + ASSERT_EQ((short)value, 1); + ASSERT_EQ((long long)value, 1LL); + ASSERT_EQ((float)value, 1.5f); + ASSERT_EQ((double)value, 1.5); + ASSERT_EQ((bool)value, true); + ASSERT_EQ((bool)Half(0.0f), false); } -CATCH_TEST_CASE( "half construction", "[]" ) { - CATCH_REQUIRE(Half((short)3) == Half(3.0f)); - CATCH_REQUIRE(Half((unsigned short)3) == Half(3.0f)); - CATCH_REQUIRE(Half(3) == Half(3.0f)); - CATCH_REQUIRE(Half(3U) == Half(3.0f)); - CATCH_REQUIRE(Half(3LL) == Half(3.0f)); - CATCH_REQUIRE(Half(3ULL) == Half(3.0f)); - CATCH_REQUIRE(Half(3.5) == Half(3.5f)); +TEST(TestHalf, Construction) { + ASSERT_EQ(Half((short)3), Half(3.0f)); + ASSERT_EQ(Half((unsigned short)3), Half(3.0f)); + ASSERT_EQ(Half(3), Half(3.0f)); + ASSERT_EQ(Half(3U), Half(3.0f)); + ASSERT_EQ(Half(3LL), Half(3.0f)); + ASSERT_EQ(Half(3ULL), Half(3.0f)); + ASSERT_EQ(Half(3.5), Half(3.5f)); } static std::string to_string(const Half& h) { @@ -67,31 +66,31 @@ static std::string to_string(const Half& h) { return ss.str(); } -CATCH_TEST_CASE( "half to string", "[]" ) { - CATCH_REQUIRE(to_string(Half(3.5f)) == "3.5"); - CATCH_REQUIRE(to_string(Half(-100.0f)) == "-100"); +TEST(TestHalf, Half2String) { + ASSERT_EQ(to_string(Half(3.5f)), "3.5"); + ASSERT_EQ(to_string(Half(-100.0f)), "-100"); } -CATCH_TEST_CASE( "half numeric limits", "[]" ) { +TEST(TestHalf, HalfNumericLimits) { using limits = std::numeric_limits; - CATCH_REQUIRE(limits::lowest() == -65504.0f); - CATCH_REQUIRE(limits::max() == 65504.0f); - CATCH_REQUIRE(limits::min() > 0); - CATCH_REQUIRE(limits::min() < 1); - CATCH_REQUIRE(limits::denorm_min() > 0); - CATCH_REQUIRE(limits::denorm_min() / 2 == 0); - CATCH_REQUIRE(limits::infinity() == std::numeric_limits::infinity()); - CATCH_REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); - CATCH_REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); + ASSERT_EQ(limits::lowest(), -65504.0f); + ASSERT_EQ(limits::max(), 65504.0f); + ASSERT_GT(limits::min(), 0); + ASSERT_LT(limits::min(), 1); + ASSERT_GT(limits::denorm_min(), 0); + ASSERT_EQ(limits::denorm_min() / 2, 0); + ASSERT_EQ(limits::infinity(), std::numeric_limits::infinity()); + ASSERT_NE(limits::quiet_NaN(), limits::quiet_NaN()); + ASSERT_NE(limits::signaling_NaN(), limits::signaling_NaN()); } // Check the declared type of members of numeric_limits matches // the declared type of that member on numeric_limits -#define ASSERT_SAME_TYPE(name) \ - static_assert( \ - std::is_same< \ - decltype(std::numeric_limits::name), \ +#define ASSERT_SAME_TYPE(name) \ + static_assert( \ + std::is_same< \ + decltype(std::numeric_limits::name), \ decltype(std::numeric_limits::name)>::value, \ "decltype(" #name ") differs") @@ -119,7 +118,7 @@ 
ASSERT_SAME_TYPE(max_exponent10); ASSERT_SAME_TYPE(traps); ASSERT_SAME_TYPE(tinyness_before); -CATCH_TEST_CASE( "half common math functions test", "[]" ) { +TEST(TestHalf, CommonMath) { float threshold = 0.00001; assert(std::abs(std::lgamma(Half(10.0)) - std::lgamma(10.0f)) <= threshold); assert(std::abs(std::exp(Half(1.0)) - std::exp(1.0f)) <= threshold); @@ -147,14 +146,22 @@ CATCH_TEST_CASE( "half common math functions test", "[]" ) { assert(std::abs(std::erfc(Half(10.0)) - std::erfc(10.0f)) <= threshold); assert(std::abs(std::abs(Half(-3.0)) - std::abs(-3.0f)) <= threshold); assert(std::abs(std::round(Half(2.3)) - std::round(2.3f)) <= threshold); - assert(std::abs(std::pow(Half(2.0), Half(10.0)) - std::pow(2.0f, 10.0f)) <= threshold); - assert(std::abs(std::atan2(Half(7.0), Half(0.0)) - std::atan2(7.0f, 0.0f)) <= threshold); - #ifdef __APPLE__ - // @TODO: can macos do implicit conversion of Half? - assert(std::abs(std::isnan(static_cast(Half(0.0))) - std::isnan(0.0f)) <= threshold); - assert(std::abs(std::isinf(static_cast(Half(0.0))) - std::isinf(0.0f)) <= threshold); - #else - assert(std::abs(std::isnan(Half(0.0)) - std::isnan(0.0f)) <= threshold); - assert(std::abs(std::isinf(Half(0.0)) - std::isinf(0.0f)) <= threshold); - #endif -} \ No newline at end of file + assert( + std::abs(std::pow(Half(2.0), Half(10.0)) - std::pow(2.0f, 10.0f)) <= + threshold); + assert( + std::abs(std::atan2(Half(7.0), Half(0.0)) - std::atan2(7.0f, 0.0f)) <= + threshold); +#ifdef __APPLE__ + // @TODO: can macos do implicit conversion of Half? + assert( + std::abs(std::isnan(static_cast(Half(0.0))) - std::isnan(0.0f)) <= + threshold); + assert( + std::abs(std::isinf(static_cast(Half(0.0))) - std::isinf(0.0f)) <= + threshold); +#else + assert(std::abs(std::isnan(Half(0.0)) - std::isnan(0.0f)) <= threshold); + assert(std::abs(std::isinf(Half(0.0)) - std::isinf(0.0f)) <= threshold); +#endif +} diff --git a/aten/src/ATen/test/integer_divider_test.cu b/aten/src/ATen/test/integer_divider_test.cu index d09a423d7ca72d..21169e9ee30625 100644 --- a/aten/src/ATen/test/integer_divider_test.cu +++ b/aten/src/ATen/test/integer_divider_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" // Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or // (b-1), so it takes a few minutes to run. 
@@ -20,24 +19,25 @@ struct TestCase { int steps; TestCase(Value dividend, int divisor_idx, int steps) - : dividend(dividend), divisor_idx(divisor_idx), steps(steps) { } + : dividend(dividend), divisor_idx(divisor_idx), steps(steps) {} }; -template -__global__ void testIntDivider(const IntDivider *dividers, - const TestCase *testCases, - int numCases) -{ +template +__global__ void testIntDivider( + const IntDivider* dividers, + const TestCase* testCases, + int numCases) { int index = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = index; i < numCases; i += stride) { - const TestCase &tc = testCases[i]; + const TestCase& tc = testCases[i]; Value dividend = tc.dividend; - const IntDivider ÷r = dividers[tc.divisor_idx]; + const IntDivider& divider = dividers[tc.divisor_idx]; Value divisor = divider.divisor; for (int j = 0; j < tc.steps; j++) { - if (sizeof(Value) == 4 && dividend > INT32_MAX) return; + if (sizeof(Value) == 4 && dividend > INT32_MAX) + return; DivMod qr = divider.divmod(dividend); assert(qr.div == dividend / divisor && qr.mod == dividend % divisor); @@ -62,18 +62,22 @@ class IntDividerTester { cudaError_t err; err = cudaMalloc(÷rsBuf_, NUM_CASES * sizeof(IntDivider)); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + EXPECT_TRUE(isEQ); err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase)); - CATCH_REQUIRE(err == cudaSuccess); + isEQ = err == cudaSuccess; + EXPECT_TRUE(isEQ); } ~IntDividerTester() { cudaError_t err; err = cudaFree(dividersBuf_); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + EXPECT_TRUE(isEQ); err = cudaFree(testCasesBuf_); - CATCH_REQUIRE(err == cudaSuccess); + isEQ = err == cudaSuccess; + EXPECT_TRUE(isEQ); } void addTestCase(Value dividend, Value divisor, int steps) { @@ -85,29 +89,39 @@ class IntDividerTester { testCases_.emplace_back(dividend, dividers_.size() - 1, steps); // Launch the test kernel if the buffer is full. 
- if (testCases_.size() == NUM_CASES) flush(); + if (testCases_.size() == NUM_CASES) + flush(); } void flush() { cudaError_t err; - - if (testCases_.empty()) return; - CATCH_REQUIRE(!dividers_.empty()); - - CATCH_REQUIRE(dividers_.size() <= NUM_CASES); - CATCH_REQUIRE(testCases_.size() <= NUM_CASES); - err = cudaMemcpy(dividersBuf_, dividers_.data(), - dividers_.size() * sizeof(IntDivider), - cudaMemcpyHostToDevice); - CATCH_REQUIRE(err == cudaSuccess); - err = cudaMemcpy(testCasesBuf_, testCases_.data(), - testCases_.size() * sizeof(TestCase), - cudaMemcpyHostToDevice); - CATCH_REQUIRE(err == cudaSuccess); + bool isTrue; + if (testCases_.empty()) + return; + + ASSERT_FALSE(dividers_.empty()); + + isTrue = dividers_.size() <= NUM_CASES; + ASSERT_TRUE(isTrue); + isTrue = testCases_.size() <= NUM_CASES; + ASSERT_TRUE(isTrue); + err = cudaMemcpy( + dividersBuf_, + dividers_.data(), + dividers_.size() * sizeof(IntDivider), + cudaMemcpyHostToDevice); + isTrue = err == cudaSuccess; + ASSERT_TRUE(isTrue); + err = cudaMemcpy( + testCasesBuf_, + testCases_.data(), + testCases_.size() * sizeof(TestCase), + cudaMemcpyHostToDevice); + isTrue = err == cudaSuccess; + ASSERT_TRUE(isTrue); int numCases = testCases_.size(); - testIntDivider<<<512, 512>>>( - dividersBuf_, testCasesBuf_, numCases); + testIntDivider<<<512, 512>>>(dividersBuf_, testCasesBuf_, numCases); dividers_.clear(); testCases_.clear(); @@ -117,8 +131,8 @@ class IntDividerTester { vector> dividers_; vector> testCases_; - IntDivider *dividersBuf_; - TestCase *testCasesBuf_; + IntDivider* dividersBuf_; + TestCase* testCasesBuf_; }; static void testUint32Divider() @@ -128,15 +142,18 @@ static void testUint32Divider() IntDividerTester tester; for (uint64_t divisor = 1; divisor <= INT32_MAX; divisor++) { - if (divisor < 1000000 && divisor % 10000 == 0) fprintf(stderr, "."); - if (divisor % 10000000 == 0) fprintf(stderr, "-"); + if (divisor < 1000000 && divisor % 10000 == 0) + fprintf(stderr, "."); + if (divisor % 10000000 == 0) + fprintf(stderr, "-"); // In order to save time, we only test when the remainder is zero or // (divisor - 1). 
uint64_t dividend = 0; while (dividend <= INT32_MAX) { uint64_t steps = (INT32_MAX - dividend) / divisor + 1; - if (steps > MAX_STEPS) steps = MAX_STEPS; + if (steps > MAX_STEPS) + steps = MAX_STEPS; tester.addTestCase(dividend, divisor, steps); tester.addTestCase(dividend + divisor - 1, divisor, steps); @@ -180,11 +197,11 @@ static void testUint64Divider() tester.flush(); } -CATCH_TEST_CASE( "CUDA integer divider", "[cuda]" ) { - +TEST(TestCUDAIntegerDivider, IntegerDivider) { testUint64Divider(); testUint32Divider(); cudaError_t err = cudaDeviceSynchronize(); - CATCH_REQUIRE(err == cudaSuccess); + bool isTrue = err == cudaSuccess; + ASSERT_TRUE(isTrue); } diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index 4c57b7d8ee1d96..6721c69b0e0f36 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -1,192 +1,222 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -using Catch::Matchers::StartsWith; +#define ASSERT_EQUAL(t1, t2) ASSERT_TRUE(t1.equal(t2)); -#define REQUIRE_EQUAL(t1, t2) \ - CATCH_REQUIRE(t1.equal(t2)); +#define ASSERT_ALLCLOSE(t1, t2) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2)); -#define REQUIRE_ALLCLOSE(t1, t2) \ - CATCH_REQUIRE(t1.is_same_size(t2)); \ - CATCH_REQUIRE(t1.allclose(t2)); - -#define REQUIRE_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ - CATCH_REQUIRE(t1.is_same_size(t2)); \ - CATCH_REQUIRE(t1.allclose(t2, atol, rtol)); +#define ASSERT_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2, atol, rtol)); void requireEqualTensorList(TensorList t1, TensorList t2) { - CATCH_REQUIRE(t1.size() == t2.size()); + ASSERT_EQ(t1.size(), t2.size()); for (size_t i = 0; i < t1.size(); ++i) { - REQUIRE_EQUAL(t1[ i ], t2[ i ]); + ASSERT_EQUAL(t1[i], t2[i]); } } -void test(Type & T, Type & AccT) { - auto t = randn({3, 3}, T); - - CATCH_SECTION( "split: test method, type, namespace give same result" ) { - auto splitMethod = t.split(1, 0); - auto splitType = T.split(t, 1, 0); - auto splitNs = at::split(t, 1, 0); - requireEqualTensorList(splitMethod, splitType); - requireEqualTensorList(splitMethod, splitNs); +// split: test method, type, namespace give same result +void TestSplit(Type& T, Tensor& t) { + auto splitMethod = t.split(1, 0); + auto splitType = T.split(t, 1, 0); + auto splitNs = at::split(t, 1, 0); + requireEqualTensorList(splitMethod, splitType); + requireEqualTensorList(splitMethod, splitNs); - // test rebuilding with cat - REQUIRE_EQUAL(at::cat(splitMethod, 0), t); - } + // test rebuilding with cat + ASSERT_EQUAL(at::cat(splitMethod, 0), t); +} - CATCH_SECTION( "chunk: test method, type, namespace give same result" ) { - // test method, type, namespace give same result - auto chunkMethod = t.chunk(3, 0); - auto chunkType = T.chunk(t, 3, 0); - auto chunkNs = at::chunk(t, 3, 0); - requireEqualTensorList(chunkMethod, chunkType); - requireEqualTensorList(chunkMethod, chunkNs); +// chunk: test method, type, namespace give same result +void TestChunk(Type& T, Tensor& t) { + // test method, type, namespace give same result + auto chunkMethod = t.chunk(3, 0); + auto chunkType = T.chunk(t, 3, 0); + auto chunkNs = at::chunk(t, 3, 0); + requireEqualTensorList(chunkMethod, chunkType); + requireEqualTensorList(chunkMethod, chunkNs); + + // test rebuilding with cat + ASSERT_EQUAL(at::cat(chunkMethod, 0), t); +} - // test rebuilding with 
cat - REQUIRE_EQUAL(at::cat(chunkMethod, 0), t); +void TestStack(Type& T, Tensor& t) { + auto x = rand({2, 3, 4}); + auto y = rand({2, 3, 4}); + auto z = rand({2, 3, 4}); + for (int64_t dim = 0; dim < 4; ++dim) { + auto res = at::stack({x, y, z}, dim); + auto res_neg = at::stack({x, y, z}, dim - 4); + std::vector expected_size; + expected_size.insert( + expected_size.end(), x.sizes().begin(), x.sizes().begin() + dim); + expected_size.insert(expected_size.end(), 3); + expected_size.insert( + expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); + + ASSERT_EQUAL(res, res_neg); + ASSERT_TRUE(res.sizes().equals(expected_size)); + ASSERT_EQUAL(res.select(dim, 0), x); + ASSERT_EQUAL(res.select(dim, 1), y); + ASSERT_EQUAL(res.select(dim, 2), z); } +} - // stack - CATCH_SECTION( "stack" ) { - auto x = rand({2, 3, 4}); - auto y = rand({2, 3, 4}); - auto z = rand({2, 3, 4}); - for (int64_t dim = 0; dim < 4; ++dim) { - auto res = at::stack({x, y, z}, dim); - auto res_neg = at::stack({x, y, z}, dim - 4); - std::vector expected_size; - expected_size.insert(expected_size.end(), x.sizes().begin(), x.sizes().begin() + dim); - expected_size.insert(expected_size.end(), 3); - expected_size.insert(expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); - - REQUIRE_EQUAL(res, res_neg); - CATCH_REQUIRE(res.sizes().equals(expected_size)); - REQUIRE_EQUAL(res.select(dim, 0), x); - REQUIRE_EQUAL(res.select(dim, 1), y); - REQUIRE_EQUAL(res.select(dim, 2), z); - } - } +// size / stride +void TestSize(Type& T, Tensor& t) { + auto scalar = randn({}, T); + // Throw StartsWith("dimension specified as 0 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.size(0)); + // Throw StartsWith("dimension specified as -1 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.size(-1)); + // Throw StartsWith("dimension specified as 0 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.stride(0)); + // Throw StartsWith("dimension specified as -1 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.stride(-1)); + + auto empty = randn({0}, T); + ASSERT_EQ(empty.size(0), 0); + ASSERT_EQ(empty.size(-1), 0); + ASSERT_EQ(empty.stride(0), 1); + ASSERT_EQ(empty.stride(-1), 1); +} - CATCH_SECTION( "size / stride" ) { - auto scalar = randn({}, T); - CATCH_REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - CATCH_REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - CATCH_REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - CATCH_REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - - auto empty = randn({0}, T); - CATCH_REQUIRE(empty.size(0) == 0); - CATCH_REQUIRE(empty.size(-1) == 0); - CATCH_REQUIRE(empty.stride(0) == 1); - CATCH_REQUIRE(empty.stride(-1) == 1); - } +void TestMatmul(Type& T, Tensor& t, Type& AccT) { + auto scalar = randn({}, T); + auto d1 = randn({3}, T); + auto d2 = randn({2, 3}, T); + + // 0-d + // Throw StartsWith("both arguments to matmul need to be at least 1D") + ASSERT_ANY_THROW(scalar.matmul(d2)); + // Throw StartsWith("both arguments to matmul need to be at least 1D") + ASSERT_ANY_THROW(d2.matmul(scalar)); + + // 1-d + ASSERT_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); + ASSERT_ALLCLOSE(d2.matmul(d1), d2.mv(d1)); + auto d1o = randn({2}, T); + ASSERT_ALLCLOSE(d1o.matmul(d2), d1o.unsqueeze(0).mm(d2).squeeze(0)); + + // 2-d + auto d2o = randn({3, 5}, T); + 
ASSERT_ALLCLOSE(d2.matmul(d2o), d2.mm(d2o)); + + // > 2-d, 1-d + auto d3 = randn({5, 2, 3}, T); + ASSERT_ALLCLOSE( + d3.matmul(d1), d3.bmm(d1.view({1, 3, 1}).expand({5, 3, 1})).view({5, 2})); + ASSERT_ALLCLOSE(d1o.matmul(d3), d1o.expand({5, 1, 2}).bmm(d3).view({5, 3})); + + auto d5 = randn({3, 2, 4, 2, 3}, T); + ASSERT_ALLCLOSE( + d5.matmul(d1), + d5.view({24, 2, 3}) + .bmm(d1.view({1, 3, 1}).expand({24, 3, 1})) + .view({3, 2, 4, 2})); + ASSERT_ALLCLOSE( + d1o.matmul(d5), + d1o.expand({24, 1, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 3})); + + // > 2-d, 2-d + // we use a "folding" algorithm in this case of matmul, so the direct + // comparison to bmm doesn't work; instead, compare to the higher precision + // computation (technically, we should always do this). Tolerances are + // selected empirically. + double atol = 1e-04; + double rtol = 1e-06; + d2 = randn({3, 4}, T); + d2o = randn({4, 2}, T); + auto result = d5.matmul(d2).toType(AccT); + + auto d5Acc = d5.toType(AccT); + auto d2Acc = d2.toType(AccT); + auto acc_result = d5Acc.view({24, 2, 3}) + .bmm(d2Acc.expand({24, 3, 4})) + .view({3, 2, 4, 2, 4}); + ASSERT_ALLCLOSE_TOLERANCES(result, acc_result, atol, rtol); + ASSERT_ALLCLOSE( + d2o.matmul(d5), + d2o.expand({24, 4, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 4, 3})); + + // > 2-d, > 2-d + auto d5o = randn({2, 1, 2, 4, 3, 2}, T); + auto d5_bmm_view = + d5.expand({2, 3, 2, 4, 2, 3}).contiguous().view({48, 2, 3}); + auto d5o_bmm_view = + d5o.expand({2, 3, 2, 4, 3, 2}).contiguous().view({48, 3, 2}); + ASSERT_ALLCLOSE( + d5.matmul(d5o), d5_bmm_view.bmm(d5o_bmm_view).view({2, 3, 2, 4, 2, 2})); + + // non-expandable case + auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); + // Throw Contains("must match the size") + ASSERT_ANY_THROW(d5.matmul(d5wrong)); +} - // matmul - CATCH_SECTION( "matmul" ) { - auto scalar = randn({}, T); - auto d1 = randn({3}, T); - auto d2 = randn({2, 3}, T); - - // 0-d - CATCH_REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - CATCH_REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - - // 1-d - REQUIRE_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); - REQUIRE_ALLCLOSE(d2.matmul(d1), d2.mv(d1)); - auto d1o = randn({2}, T); - REQUIRE_ALLCLOSE(d1o.matmul(d2), d1o.unsqueeze(0).mm(d2).squeeze(0)); - - // 2-d - auto d2o = randn({3, 5}, T); - REQUIRE_ALLCLOSE(d2.matmul(d2o), d2.mm(d2o)); - - // > 2-d, 1-d - auto d3 = randn({5, 2, 3}, T); - REQUIRE_ALLCLOSE(d3.matmul(d1), d3.bmm(d1.view({1, 3, 1}).expand({5, 3, 1})).view({5, 2})); - REQUIRE_ALLCLOSE(d1o.matmul(d3), d1o.expand({5, 1, 2}).bmm(d3).view({5, 3})); - - auto d5 = randn({3, 2, 4, 2, 3}, T); - REQUIRE_ALLCLOSE(d5.matmul(d1), d5.view({24, 2, 3}).bmm(d1.view({1, 3, 1}).expand({24, 3, 1})).view({3, 2, 4, 2})); - REQUIRE_ALLCLOSE(d1o.matmul(d5), d1o.expand({24, 1, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 3})); - - // > 2-d, 2-d - // we use a "folding" algorithm in this case of matmul, so the direct comparison to bmm doesn't work; - // instead, compare to the higher precision computation (technically, we should always do this). - // Tolerances are selected empirically. 
- double atol = 1e-04; - double rtol = 1e-06; - d2 = randn({3, 4}, T); - d2o = randn({4, 2}, T); - auto result = d5.matmul(d2).toType(AccT); - - auto d5Acc = d5.toType(AccT); - auto d2Acc = d2.toType(AccT); - auto acc_result = d5Acc.view({24, 2, 3}).bmm(d2Acc.expand({24, 3, 4})).view({3, 2, 4, 2, 4}); - REQUIRE_ALLCLOSE_TOLERANCES(result, acc_result, atol, rtol); - REQUIRE_ALLCLOSE(d2o.matmul(d5), d2o.expand({24, 4, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 4, 3})); - - // > 2-d, > 2-d - auto d5o = randn({2, 1, 2, 4, 3, 2}, T); - auto d5_bmm_view = d5.expand({2, 3, 2, 4, 2, 3}).contiguous().view({48, 2, 3}); - auto d5o_bmm_view = d5o.expand({2, 3, 2, 4, 3, 2}).contiguous().view({48, 3, 2}); - REQUIRE_ALLCLOSE(d5.matmul(d5o), d5_bmm_view.bmm(d5o_bmm_view).view({2, 3, 2, 4, 2, 2})); - - // non-expandable case - auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); - CATCH_REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); - } +void TestStandardGammaGrad(Type& T, Tensor& t) { + // check empty + auto empty = ones({0}, T); + ASSERT_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); + + // check scalar equals one element + auto one_scalar = ones({}, T).mul(5); + auto one_with_dim = ones({1}, T).mul(5); + ASSERT_ALLCLOSE( + at::_standard_gamma_grad(one_scalar, one_scalar), + at::_standard_gamma_grad(one_with_dim, one_with_dim).sum()); + + // check mixing types + auto t1 = randn({3, 4}, T); + auto t2 = randn({3, 4}, T).toType(kDouble); + // Throw StartsWith("expected scalar type") + ASSERT_ANY_THROW(at::_standard_gamma_grad(t1, t2)); +} - // _standard_gamma_grad - CATCH_SECTION( "_standard_gamma_grad" ) { - // check empty - auto empty = ones({0}, T); - REQUIRE_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); - - // check scalar equals one element - auto one_scalar = ones({}, T).mul(5); - auto one_with_dim = ones({1}, T).mul(5); - REQUIRE_ALLCLOSE(at::_standard_gamma_grad(one_scalar, one_scalar), - at::_standard_gamma_grad(one_with_dim, one_with_dim).sum()); - - // check mixing types - auto t1 = randn({3, 4}, T); - auto t2 = randn({3, 4}, T).toType(kDouble); - CATCH_REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); - } +void TestWhere(Type& T, Tensor& t) { + // empty + auto empty = ones({0}, T); + auto& bT = T.toScalarType(ScalarType::Byte); + auto empty_byte = ones({0}, bT); + ASSERT_EQUAL(empty, at::where(empty_byte, empty, empty)); + + // check scalar equals one element + auto x_scalar = ones({}, T).mul(5); + auto y_scalar = ones({}, T).mul(7); + auto cond_scalar = zeros({}, bT); + auto x_1d = x_scalar.unsqueeze(0); + auto y_1d = y_scalar.unsqueeze(0); + auto cond_1d = cond_scalar.unsqueeze(0); + ASSERT_ALLCLOSE( + at::where(cond_scalar, x_scalar, y_scalar).unsqueeze(0), + at::where(cond_1d, x_1d, y_1d)); +} - CATCH_SECTION( "where" ) { - // empty - auto empty = ones({0}, T); - auto &bT = T.toScalarType(ScalarType::Byte); - auto empty_byte = ones({0}, bT); - REQUIRE_EQUAL(empty, at::where(empty_byte, empty, empty)); - - // check scalar equals one element - auto x_scalar = ones({}, T).mul(5); - auto y_scalar = ones({}, T).mul(7); - auto cond_scalar = zeros({}, bT); - auto x_1d = x_scalar.unsqueeze(0); - auto y_1d = y_scalar.unsqueeze(0); - auto cond_1d = cond_scalar.unsqueeze(0); - REQUIRE_ALLCLOSE(at::where(cond_scalar, x_scalar, y_scalar).unsqueeze(0), - at::where(cond_1d, x_1d, y_1d)); - } +void test(Type& T, Type& AccT) { + auto t = randn({3, 3}, T); + TestSplit(T, t); + TestChunk(T, t); + TestStack(T, t); + 
TestSize(T, t); + TestMatmul(T, t, AccT); + TestStandardGammaGrad(T, t); + TestWhere(T, t); } -CATCH_TEST_CASE( "native test CPU", "[cpu]" ) { +TEST(TestNative, NativeTestCPU) { manual_seed(123, at::kCPU); test(CPU(kFloat), CPU(kDouble)); } -CATCH_TEST_CASE( "native test CUDA", "[cuda]" ) { +TEST(TestNative, NativeTestGPU) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index a89ca81da017f7..5bb3aafaff9247 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -9,31 +8,33 @@ using namespace at; -#define TRY_CATCH_ELSE(fn, catc, els) \ - { \ - /* avoid mistakenly passing if els code throws exception*/ \ - bool _passed = false; \ - try { \ - fn; \ - _passed = true; \ - els; \ - } catch (std::exception &e) { \ - CATCH_REQUIRE(!_passed); \ - catc; \ - } \ +#define TRY_CATCH_ELSE(fn, catc, els) \ + { \ + /* avoid mistakenly passing if els code throws exception*/ \ + bool _passed = false; \ + try { \ + fn; \ + _passed = true; \ + els; \ + } catch (std::exception & e) { \ + ASSERT_FALSE(_passed); \ + catc; \ + } \ } void require_equal_size_dim(const Tensor &lhs, const Tensor &rhs) { - CATCH_REQUIRE(lhs.dim() == rhs.dim()); - CATCH_REQUIRE(lhs.sizes().equals(rhs.sizes())); + ASSERT_EQ(lhs.dim(), rhs.dim()); + ASSERT_TRUE(lhs.sizes().equals(rhs.sizes())); } bool should_expand(const IntList &from_size, const IntList &to_size) { - if(from_size.size() > to_size.size()) { + if (from_size.size() > to_size.size()) { return false; } - for (auto from_dim_it = from_size.rbegin(); from_dim_it != from_size.rend(); ++from_dim_it) { - for (auto to_dim_it = to_size.rbegin(); to_dim_it != to_size.rend(); ++to_dim_it) { + for (auto from_dim_it = from_size.rbegin(); from_dim_it != from_size.rend(); + ++from_dim_it) { + for (auto to_dim_it = to_size.rbegin(); to_dim_it != to_size.rend(); + ++to_dim_it) { if (*from_dim_it != 1 && *from_dim_it != *to_dim_it) { return false; } @@ -43,21 +44,22 @@ bool should_expand(const IntList &from_size, const IntList &to_size) { } void test(Type &T) { - std::vector > sizes = { {}, {0}, {1}, {1, 1}, {2}}; + std::vector> sizes = {{}, {0}, {1}, {1, 1}, {2}}; // single-tensor/size tests for (auto s = sizes.begin(); s != sizes.end(); ++s) { // verify that the dim, sizes, strides, etc match what was requested. 
auto t = ones(*s, T); - CATCH_REQUIRE((size_t)t.dim() == s->size()); - CATCH_REQUIRE((size_t)t.ndimension() == s->size()); - CATCH_REQUIRE(t.sizes().equals(*s)); - CATCH_REQUIRE(t.strides().size() == s->size()); - auto numel = std::accumulate(s->begin(), s->end(), 1, std::multiplies()); - CATCH_REQUIRE(t.numel() == numel); + ASSERT_EQ((size_t)t.dim(), s->size()); + ASSERT_EQ((size_t)t.ndimension(), s->size()); + ASSERT_TRUE(t.sizes().equals(*s)); + ASSERT_EQ(t.strides().size(), s->size()); + auto numel = + std::accumulate(s->begin(), s->end(), 1, std::multiplies()); + ASSERT_EQ(t.numel(), numel); // verify we can output std::stringstream ss; - CATCH_REQUIRE_NOTHROW(ss << t << std::endl); + ASSERT_NO_THROW(ss << t << std::endl); // set_ auto t2 = ones(*s, T); @@ -65,22 +67,22 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze - CATCH_REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + ASSERT_EQ(t.unsqueeze(0).dim(), t.dim() + 1); // unsqueeze_ { auto t2 = ones(*s, T); auto r = t2.unsqueeze_(0); - CATCH_REQUIRE(r.dim() == t.dim() + 1); + ASSERT_EQ(r.dim(), t.dim() + 1); } // squeeze (with dimension argument) if (t.dim() == 0 || t.sizes()[0] == 1) { - CATCH_REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(t.squeeze(0).dim(), std::max(t.dim() - 1, 0)); } else { - // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; - // in NumPy this is an error. - CATCH_REQUIRE(t.squeeze(0).dim() == t.dim()); + // In PyTorch, it is a no-op to try to squeeze a dimension that has size + // != 1; in NumPy this is an error. + ASSERT_EQ(t.squeeze(0).dim(), t.dim()); } // squeeze (with no dimension argument) @@ -98,12 +100,12 @@ void test(Type &T) { { // squeeze_ (with dimension argument) auto t2 = ones(*s, T); - if (t2.dim() == 0 || t2.sizes()[0] == 1) { - CATCH_REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); + if (t2.dim() == 0 || t2.sizes()[0] == 1) { + ASSERT_EQ(t2.squeeze_(0).dim(), std::max(t.dim() - 1, 0)); } else { - // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; - // in NumPy this is an error. - CATCH_REQUIRE(t2.squeeze_(0).dim() == t.dim()); + // In PyTorch, it is a no-op to try to squeeze a dimension that has size + // != 1; in NumPy this is an error. 
+ ASSERT_EQ(t2.squeeze_(0).dim(), t.dim()); } } @@ -122,154 +124,156 @@ void test(Type &T) { // reduce (with dimension argument and with 1 return argument) if (t.numel() != 0) { - CATCH_REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(t.sum(0).dim(), std::max(t.dim() - 1, 0)); } else { - CATCH_REQUIRE(t.sum(0).equal(at::zeros({}, T))); + ASSERT_TRUE(t.sum(0).equal(at::zeros({}, T))); } // reduce (with dimension argument and with 2 return arguments) if (t.numel() != 0) { auto ret = t.min(0); - CATCH_REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); - CATCH_REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(std::get<0>(ret).dim(), std::max(t.dim() - 1, 0)); + ASSERT_EQ(std::get<1>(ret).dim(), std::max(t.dim() - 1, 0)); } else { - _CATCH_REQUIRE_THROWS(t.min(0)); + ASSERT_ANY_THROW(t.min(0)); } // simple indexing if (t.dim() > 0 && t.numel() != 0) { - CATCH_REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(t[0].dim(), std::max(t.dim() - 1, 0)); } else { - _CATCH_REQUIRE_THROWS(t[0]); + ASSERT_ANY_THROW(t[0]); } // fill_ (argument to fill_ can only be a 0-dim tensor) - TRY_CATCH_ELSE(t.fill_(t.sum(0)), - CATCH_REQUIRE(t.dim() > 1), - CATCH_REQUIRE(t.dim() <= 1)); + TRY_CATCH_ELSE( + t.fill_(t.sum(0)), ASSERT_GT(t.dim(), 1), ASSERT_LE(t.dim(), 1)); } for (auto lhs_it = sizes.begin(); lhs_it != sizes.end(); ++lhs_it) { for (auto rhs_it = sizes.begin(); rhs_it != sizes.end(); ++rhs_it) { // is_same_size should only match if they are the same shape { - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - if(*lhs_it != *rhs_it) { - CATCH_REQUIRE(!lhs.is_same_size(rhs)); - CATCH_REQUIRE(!rhs.is_same_size(lhs)); - } - } - // forced size functions (resize_, resize_as, set_) - { - // resize_ - { - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - lhs.resize_(*rhs_it); - require_equal_size_dim(lhs, rhs); - } - // resize_as_ - { - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - lhs.resize_as_(rhs); - require_equal_size_dim(lhs, rhs); - } - // set_ - { - { - // with tensor - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - lhs.set_(rhs); - require_equal_size_dim(lhs, rhs); - } - { - // with storage - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - auto storage = T.storage(rhs.numel(), false); - lhs.set_(storage); - // should not be dim 0 because an empty storage is dim 1; all other storages aren't scalars - CATCH_REQUIRE(lhs.dim() != 0); - } - { - // with storage, offset, sizes, strides - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - auto storage = T.storage(rhs.numel(), false); - lhs.set_(storage, rhs.storage_offset(), rhs.sizes(), rhs.strides()); - require_equal_size_dim(lhs, rhs); - } + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + if (*lhs_it != *rhs_it) { + ASSERT_FALSE(lhs.is_same_size(rhs)); + ASSERT_FALSE(rhs.is_same_size(lhs)); } } - - // view + // forced size functions (resize_, resize_as, set_) + {// resize_ + {auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_(*rhs_it); + require_equal_size_dim(lhs, rhs); + } + // resize_as_ + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_as_(rhs); + require_equal_size_dim(lhs, rhs); + } + // set_ + { { + // with tensor auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); - auto rhs_size = *rhs_it; - TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), - CATCH_REQUIRE(lhs.numel() != rhs.numel()), - CATCH_REQUIRE(lhs.numel() == rhs.numel()); 
require_equal_size_dim(result, rhs);); + lhs.set_(rhs); + require_equal_size_dim(lhs, rhs); } - - // take { + // with storage auto lhs = ones(*lhs_it, T); - auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); - TRY_CATCH_ELSE(auto result = lhs.take(rhs), - CATCH_REQUIRE(lhs.numel() == 0); CATCH_REQUIRE(rhs.numel() != 0), - require_equal_size_dim(result, rhs)); + auto rhs = ones(*rhs_it, T); + auto storage = T.storage(rhs.numel(), false); + lhs.set_(storage); + // should not be dim 0 because an empty storage is dim 1; all other + // storages aren't scalars + ASSERT_NE(lhs.dim(), 0); } - - - // ger { + // with storage, offset, sizes, strides auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); - TRY_CATCH_ELSE(auto result = lhs.ger(rhs), - CATCH_REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), - [&]() { - int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); - int64_t dim1 = rhs.dim() == 0 ? 1 : rhs.size(0); - require_equal_size_dim(result, at::empty({dim0, dim1}, result.options())); - }();); + auto storage = T.storage(rhs.numel(), false); + lhs.set_(storage, rhs.storage_offset(), rhs.sizes(), rhs.strides()); + require_equal_size_dim(lhs, rhs); } + } + } - // expand - { - auto lhs = ones(*lhs_it, T); - auto lhs_size = *lhs_it; - auto rhs = ones(*rhs_it, T); - auto rhs_size = *rhs_it; - bool should_pass = should_expand(lhs_size, rhs_size); - TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), - CATCH_REQUIRE(!should_pass), - CATCH_REQUIRE(should_pass); require_equal_size_dim(result, rhs);); + // view + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), + ASSERT_NE(lhs.numel(), rhs.numel()), + ASSERT_EQ(lhs.numel(), rhs.numel()); + require_equal_size_dim(result, rhs);); + } - // in-place functions (would be good if we can also do a non-broadcasting one, b/c - // broadcasting functions will always end up operating on tensors of same size; - // is there an example of this outside of assign_ ?) - { - bool should_pass_inplace = should_expand(rhs_size, lhs_size); - TRY_CATCH_ELSE(lhs.add_(rhs), - CATCH_REQUIRE(!should_pass_inplace), - CATCH_REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); - } - } + // take + { + auto lhs = ones(*lhs_it, T); + auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); + TRY_CATCH_ELSE(auto result = lhs.take(rhs), ASSERT_EQ(lhs.numel(), 0); + ASSERT_NE(rhs.numel(), 0), + require_equal_size_dim(result, rhs)); + } + + // ger + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + TRY_CATCH_ELSE(auto result = lhs.ger(rhs), + ASSERT_TRUE( + (lhs.numel() == 0 || rhs.numel() == 0 || + lhs.dim() != 1 || rhs.dim() != 1)), + [&]() { + int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); + int64_t dim1 = rhs.dim() == 0 ? 1 : rhs.size(0); + require_equal_size_dim( + result, at::empty({dim0, dim1}, result.options())); + }();); + } + + // expand + { + auto lhs = ones(*lhs_it, T); + auto lhs_size = *lhs_it; + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + bool should_pass = should_expand(lhs_size, rhs_size); + TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), + ASSERT_FALSE(should_pass), + ASSERT_TRUE(should_pass); + require_equal_size_dim(result, rhs);); + + // in-place functions (would be good if we can also do a non-broadcasting + // one, b/c broadcasting functions will always end up operating on tensors + // of same size; is there an example of this outside of assign_ ?) 
+ { + bool should_pass_inplace = should_expand(rhs_size, lhs_size); + TRY_CATCH_ELSE(lhs.add_(rhs), + ASSERT_FALSE(should_pass_inplace), + ASSERT_TRUE(should_pass_inplace); + require_equal_size_dim(lhs, ones(*lhs_it, T));); } } } +} +} -CATCH_TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { +TEST(TestScalarTensor, TestScalarTensorCPU) { manual_seed(123, at::kCPU); - test(CPU(kFloat)); } -CATCH_TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { +TEST(TestScalarTensor, TestScalarTensorCUDA) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 10ffa9afc326ff..b188146f213f56 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include // define constants like M_PI and C keywords for MSVC @@ -33,26 +32,25 @@ struct Foo { void test_overflow() { auto s1 = Scalar(M_PI); - CATCH_REQUIRE(s1.toFloat() == static_cast(M_PI)); + ASSERT_EQ(s1.toFloat(), static_cast(M_PI)); s1.toHalf(); s1 = Scalar(100000); - CATCH_REQUIRE(s1.toFloat() == 100000.0); - CATCH_REQUIRE(s1.toInt() == 100000); + ASSERT_EQ(s1.toFloat(), 100000.0); + ASSERT_EQ(s1.toInt(), 100000); - CATCH_REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); + ASSERT_THROW(s1.toHalf(), std::domain_error); s1 = Scalar(NAN); - CATCH_REQUIRE(std::isnan(s1.toFloat())); - CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + ASSERT_TRUE(std::isnan(s1.toFloat())); + ASSERT_THROW(s1.toInt(), std::domain_error); s1 = Scalar(INFINITY); - CATCH_REQUIRE(std::isinf(s1.toFloat())); - CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + ASSERT_TRUE(std::isinf(s1.toFloat())); + ASSERT_THROW(s1.toInt(), std::domain_error); } -CATCH_TEST_CASE( "scalar test", "[]" ) { - +TEST(TestScalar, TestScalar) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); @@ -60,54 +58,57 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { Scalar bar = 3.0; Half h = bar.toHalf(); Scalar h2 = h; - cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; - Generator & gen = at::globalContext().defaultGenerator(at::kCPU); - CATCH_REQUIRE_NOTHROW(gen.seed()); - auto && C = at::globalContext(); - if(at::hasCUDA()) { - auto t2 = zeros({4,4}, at::kCUDA); + cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " + << bar.toDouble() << " " << what.isIntegral() << "\n"; + Generator& gen = at::globalContext().defaultGenerator(at::kCPU); + ASSERT_NO_THROW(gen.seed()); + auto&& C = at::globalContext(); + if (at::hasCUDA()) { + auto t2 = zeros({4, 4}, at::kCUDA); cout << &t2 << "\n"; } - auto t = ones({4,4}); + auto t = ones({4, 4}); - auto wha2 = zeros({4,4}).add(t).sum(); - CATCH_REQUIRE( wha2.item() == 16.0 ); + auto wha2 = zeros({4, 4}).add(t).sum(); + ASSERT_EQ(wha2.item(), 16.0); - CATCH_REQUIRE( t.sizes()[0] == 4 ); - CATCH_REQUIRE( t.sizes()[1] == 4 ); - CATCH_REQUIRE( t.strides()[0] == 4 ); - CATCH_REQUIRE( t.strides()[1] == 1 ); + ASSERT_EQ(t.sizes()[0], 4); + ASSERT_EQ(t.sizes()[1], 4); + ASSERT_EQ(t.strides()[0], 4); + ASSERT_EQ(t.strides()[1], 1); - Type & T = CPU(Float); - Tensor x = randn({1,10}, T); - Tensor prev_h = randn({1,20}, T); - Tensor W_h = randn({20,20}, T); - Tensor W_x = randn({20,10}, T); + Type& T = CPU(Float); + Tensor x = randn({1, 10}, T); + Tensor prev_h = randn({1, 20}, T); + Tensor W_h = randn({20, 20}, T); + Tensor W_x = randn({20, 10}, T); Tensor i2h = at::mm(W_x, 
x.t()); Tensor h2h = at::mm(W_h, prev_h.t()); Tensor next_h = i2h.add(h2h); next_h = next_h.tanh(); - _CATCH_REQUIRE_THROWS(at::_local_scalar(Tensor{})); + ASSERT_ANY_THROW(at::_local_scalar(Tensor{})); test_overflow(); - if(at::hasCUDA()) { + if (at::hasCUDA()) { auto r = CUDA(Float).copy(next_h); - CATCH_REQUIRE(CPU(Float).copy(r).equal(next_h)); + ASSERT_TRUE(CPU(Float).copy(r).equal(next_h)); } - CATCH_REQUIRE_NOTHROW(randn({10,10,2}, T)); + ASSERT_NO_THROW(randn({10, 10, 2}, T)); // check Scalar.toTensor on Scalars backed by different data types - CATCH_REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); - CATCH_REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); - CATCH_REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); + ASSERT_EQ(scalar_to_tensor(bar).type().scalarType(), kDouble); + ASSERT_EQ(scalar_to_tensor(what).type().scalarType(), kLong); + ASSERT_EQ( + scalar_to_tensor(ones({})._local_scalar()).type().scalarType(), kDouble); if (x.type().scalarType() != ScalarType::Half) { AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] { scalar_t s = 1; std::stringstream ss; - CATCH_REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); + ASSERT_NO_THROW( + ss << "hello, dispatch" << x.type().toString() << s << "\n"); auto data = (scalar_t*)x.data_ptr(); (void)data; }); @@ -115,11 +116,11 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { // test direct C-scalar type conversions { - auto x = ones({1,2}, T); - _CATCH_REQUIRE_THROWS(x.item()); + auto x = ones({1, 2}, T); + ASSERT_ANY_THROW(x.item()); } auto float_one = ones({}, T); - CATCH_REQUIRE(float_one.item() == 1); - CATCH_REQUIRE(float_one.item() == 1); - CATCH_REQUIRE((float_one.item() == 1)); + ASSERT_EQ(float_one.item(), 1); + ASSERT_EQ(float_one.item(), 1); + ASSERT_EQ(float_one.item(), 1); } diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 8dc015dd1d06ae..5b6ec01a1577c4 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAGuard.h" @@ -11,12 +10,23 @@ #include #include +#define ASSERT_EQ_CUDA(X, Y) \ + { \ + bool isTRUE = X == Y; \ + ASSERT_TRUE(isTRUE); \ + } + +#define ASSERT_NE_CUDA(X, Y) \ + { \ + bool isFALSE = X == Y; \ + ASSERT_FALSE(isFALSE); \ + } + /* -Tests related to ATen streams. -*/ -CATCH_TEST_CASE( - "Copying and Moving Streams", - "Verifies streams are live through copying and moving") { + Tests related to ATen streams. 
+ */ +// Verifies streams are live through copying and moving +TEST(TestStream, CopyAndMoveTest) { int32_t device = -1; cudaStream_t cuda_stream; @@ -29,14 +39,14 @@ CATCH_TEST_CASE( copyStream = s; - CATCH_REQUIRE(copyStream.internals() == s.internals()); - CATCH_REQUIRE(copyStream.device() == device); - CATCH_REQUIRE(copyStream.stream() == cuda_stream); + ASSERT_EQ_CUDA(copyStream.internals(), s.internals()); + ASSERT_EQ_CUDA(copyStream.device(), device); + ASSERT_EQ_CUDA(copyStream.stream(), cuda_stream); } - CATCH_REQUIRE(copyStream.internals()); - CATCH_REQUIRE(copyStream.device() == device); - CATCH_REQUIRE(copyStream.stream() == cuda_stream); + ASSERT_TRUE(copyStream.internals()); + ASSERT_EQ_CUDA(copyStream.device(), device); + ASSERT_EQ_CUDA(copyStream.stream(), cuda_stream); // Tests that moving works as expected and preserves the stream at::cuda::CUDAStream moveStream; @@ -47,43 +57,43 @@ CATCH_TEST_CASE( moveStream = std::move(s); - CATCH_REQUIRE(moveStream.device() == device); - CATCH_REQUIRE(moveStream.stream() == cuda_stream); + ASSERT_EQ_CUDA(moveStream.device(), device); + ASSERT_EQ_CUDA(moveStream.stream(), cuda_stream); } - CATCH_REQUIRE(moveStream.internals()); - CATCH_REQUIRE(moveStream.device() == device); - CATCH_REQUIRE(moveStream.stream() == cuda_stream); + ASSERT_TRUE(moveStream.internals()); + ASSERT_EQ_CUDA(moveStream.device(), device); + ASSERT_EQ_CUDA(moveStream.stream(), cuda_stream); } -CATCH_TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { +// Verifies streams are set properly +TEST(TestStream, GetAndSetTest) { at::cuda::CUDAStream myStream = at::cuda::createCUDAStream(); // Sets and gets at::cuda::setCurrentCUDAStream(myStream); at::cuda::CUDAStream curStream = at::cuda::getCurrentCUDAStream(); - CATCH_REQUIRE(myStream == curStream); + ASSERT_EQ_CUDA(myStream, curStream); // Gets, sets, and gets default stream at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); at::cuda::setCurrentCUDAStream(defaultStream); curStream = at::cuda::getCurrentCUDAStream(); - CATCH_REQUIRE(defaultStream != myStream); - CATCH_REQUIRE(curStream == defaultStream); + ASSERT_NE_CUDA(defaultStream, myStream); + ASSERT_EQ_CUDA(curStream, defaultStream); } void thread_fun(at::cuda::CUDAStream& cur_thread_stream) { auto new_stream = at::cuda::createCUDAStream(); at::cuda::setCurrentCUDAStream(new_stream); cur_thread_stream = at::cuda::getCurrentCUDAStream(); - CATCH_REQUIRE(cur_thread_stream == new_stream); + ASSERT_EQ_CUDA(cur_thread_stream, new_stream); } -CATCH_TEST_CASE( - "Multithread Getting and Setting", - "Ensures streams are thread local") { +// Ensures streams are thread local +TEST(TestStream, MultithreadGetAndSetTest) { at::cuda::CUDAStream s0, s1; std::thread t0{thread_fun, std::ref(s0)}; @@ -94,25 +104,25 @@ CATCH_TEST_CASE( at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream(); at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream(); - CATCH_REQUIRE(cur_stream == default_stream); - CATCH_REQUIRE(cur_stream != s0); - CATCH_REQUIRE(cur_stream != s1); - CATCH_REQUIRE(s0 != s1); + ASSERT_EQ_CUDA(cur_stream, default_stream); + ASSERT_NE_CUDA(cur_stream, s0); + ASSERT_NE_CUDA(cur_stream, s1); + ASSERT_NE_CUDA(s0, s1); } -CATCH_TEST_CASE("CUDAGuard") { +// CUDA Guard +TEST(TestStream, CUDAGuardTest) { if (at::cuda::getNumGPUs() < 2) { return; } // -- begin setup - CATCH_REQUIRE(at::cuda::current_device() == 0); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); std::vector streams0 = { - 
at::cuda::getDefaultCUDAStream(), - at::cuda::createCUDAStream()}; - CATCH_REQUIRE(streams0[0].device() == 0); - CATCH_REQUIRE(streams0[1].device() == 0); + at::cuda::getDefaultCUDAStream(), at::cuda::createCUDAStream()}; + ASSERT_EQ_CUDA(streams0[0].device(), 0); + ASSERT_EQ_CUDA(streams0[1].device(), 0); at::cuda::setCurrentCUDAStream(streams0[0]); std::vector streams1; @@ -121,47 +131,46 @@ CATCH_TEST_CASE("CUDAGuard") { streams1.push_back(at::cuda::getDefaultCUDAStream()); streams1.push_back(at::cuda::createCUDAStream()); } - CATCH_REQUIRE(streams1[0].device() == 1); - CATCH_REQUIRE(streams1[1].device() == 1); + ASSERT_EQ_CUDA(streams1[0].device(), 1); + ASSERT_EQ_CUDA(streams1[1].device(), 1); at::cuda::setCurrentCUDAStream(streams1[0]); - CATCH_REQUIRE(at::cuda::current_device() == 0); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); // -- end setup // Test that all original streams are recorded. { at::cuda::CUDAGuard guard; - CATCH_REQUIRE(guard.original_streams().empty()); + ASSERT_TRUE(guard.original_streams().empty()); guard.set_stream(streams0[0]); - CATCH_REQUIRE( - guard.original_streams().size() == at::cuda::getNumGPUs()); - CATCH_REQUIRE(guard.original_streams()[0] == streams0[0]); - CATCH_REQUIRE(guard.original_streams()[1] == streams1[0]); + ASSERT_EQ_CUDA(guard.original_streams().size(), at::cuda::getNumGPUs()); + ASSERT_EQ_CUDA(guard.original_streams()[0], streams0[0]); + ASSERT_EQ_CUDA(guard.original_streams()[1], streams1[0]); } // Setting a stream changes the current device and the stream on that device { at::cuda::CUDAGuard guard(streams1[1]); - CATCH_REQUIRE(guard.last_device() == 1); - CATCH_REQUIRE(at::cuda::current_device() == 1); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); + ASSERT_EQ_CUDA(guard.last_device(), 1); + ASSERT_EQ_CUDA(at::cuda::current_device(), 1); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[1]); } // Device and stream are now reset - CATCH_REQUIRE(at::cuda::current_device() == 0); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]); // Setting only the device changes only the current device and not the stream { at::cuda::CUDAGuard guard(/*device=*/1); - CATCH_REQUIRE(guard.last_device() == 1); - CATCH_REQUIRE(at::cuda::current_device() == 1); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + ASSERT_EQ_CUDA(guard.last_device(), 1); + ASSERT_EQ_CUDA(at::cuda::current_device(), 1); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]); } - CATCH_REQUIRE(at::cuda::current_device() == 0); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(0), streams0[0]); // Setting the stream first, and then the device, first changes the devices // back, and then resets the stream on the initial device. 
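ASSERT_EQ_CUDA and ASSERT_NE_CUDA, defined near the top of this file, first collapse the comparison into a plain bool and only then hand that bool to ASSERT_TRUE / ASSERT_FALSE, so GoogleTest never has to compare or pretty-print CUDA-side types such as at::cuda::CUDAStream itself. A stripped-down sketch of the same idiom; FakeStream and ASSERT_EQ_SKETCH are hypothetical names used only for illustration.

#include "gtest/gtest.h"

// Hypothetical handle type standing in for at::cuda::CUDAStream; it is
// equality-comparable but has no operator<< for printing.
struct FakeStream {
  int id;
};

inline bool operator==(const FakeStream& a, const FakeStream& b) {
  return a.id == b.id;
}

// Same shape as ASSERT_EQ_CUDA above: evaluate the comparison into a bool
// first, then assert on that bool.
#define ASSERT_EQ_SKETCH(X, Y)  \
  {                             \
    bool is_equal = (X) == (Y); \
    ASSERT_TRUE(is_equal);      \
  }

TEST(SketchSuite, HandlesCompareEqual) {
  FakeStream a{1};
  FakeStream b{1};
  ASSERT_EQ_SKETCH(a, b);
}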
@@ -171,12 +180,13 @@ CATCH_TEST_CASE("CUDAGuard") { guard.set_device(1); } - CATCH_REQUIRE(at::cuda::current_device() == 0); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(0), streams0[0]); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]); } -CATCH_TEST_CASE("CUDAGuardIsMovable") { +// CUDAGuardIsMovable +TEST(TestStream, CUDAGuardMovableTest) { if (at::cuda::getNumGPUs() < 2) { return; } @@ -185,17 +195,18 @@ CATCH_TEST_CASE("CUDAGuardIsMovable") { at::cuda::CUDAGuard first(stream); first.set_device(1); at::cuda::CUDAGuard second(std::move(first)); - CATCH_REQUIRE(second.original_streams().size() == device_count); - CATCH_REQUIRE(second.original_device() == 0); - CATCH_REQUIRE(second.last_device() == 1); + ASSERT_EQ_CUDA(second.original_streams().size(), device_count); + ASSERT_EQ_CUDA(second.original_device(), 0); + ASSERT_EQ_CUDA(second.last_device(), 1); at::cuda::CUDAGuard third; third = std::move(second); - CATCH_REQUIRE(third.original_streams().size() == device_count); - CATCH_REQUIRE(third.original_device() == 0); - CATCH_REQUIRE(third.last_device() == 1); + ASSERT_EQ_CUDA(third.original_streams().size(), device_count); + ASSERT_EQ_CUDA(third.original_device(), 0); + ASSERT_EQ_CUDA(third.last_device(), 1); } -CATCH_TEST_CASE("Streampool Round Robin") { +// Streampool Round Robin +TEST(TestStream, StreamPoolTest) { std::vector streams{}; for (int i = 0; i < 200; ++i) { streams.emplace_back(at::cuda::detail::CUDAStream_createStream()); @@ -206,14 +217,17 @@ CATCH_TEST_CASE("Streampool Round Robin") { for (auto i = decltype(streams.size()){0}; i < streams.size(); ++i) { cudaStream_t cuda_stream = streams[i]; auto result_pair = stream_set.insert(cuda_stream); - if (!result_pair.second) hasDuplicates = true; + if (!result_pair.second) + hasDuplicates = true; } - CATCH_REQUIRE(hasDuplicates); + ASSERT_TRUE(hasDuplicates); } -CATCH_TEST_CASE("Multi-GPU") { - if (at::cuda::getNumGPUs() < 2) return; +// Multi-GPU +TEST(TestStream, MultiGPUTest) { + if (at::cuda::getNumGPUs() < 2) + return; at::cuda::CUDAStream s0 = at::cuda::createCUDAStream(true, 0); at::cuda::CUDAStream s1 = at::cuda::createCUDAStream(false, 1); @@ -221,17 +235,18 @@ CATCH_TEST_CASE("Multi-GPU") { at::cuda::setCurrentCUDAStream(s0); at::cuda::setCurrentCUDAStream(s1); - CATCH_REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); + ASSERT_EQ_CUDA(s0, at::cuda::getCurrentCUDAStream()); at::DeviceGuard device_guard{1}; - CATCH_REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); + ASSERT_EQ_CUDA(s1, at::cuda::getCurrentCUDAStream()); } -CATCH_TEST_CASE("CUDAEvent Syncs") { +// CUDAEvent Syncs +TEST(TestStream, CUDAEventSyncTest) { const auto stream = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event; - CATCH_REQUIRE(!event.happened()); + ASSERT_FALSE(event.happened()); event.recordOnce(stream); @@ -242,11 +257,13 @@ CATCH_TEST_CASE("CUDAEvent Syncs") { wait_stream1.synchronize_with(event); cudaStreamSynchronize(wait_stream0); - CATCH_REQUIRE(event.happened()); + ASSERT_TRUE(event.happened()); } -CATCH_TEST_CASE("Cross-Device Events") { - if (at::cuda::getNumGPUs() < 2) return; +// Cross-Device Events +TEST(TestStream, CrossDeviceTest) { + if (at::cuda::getNumGPUs() < 2) + return; const auto stream0 = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event0; @@ -257,13 +274,13 @@ CATCH_TEST_CASE("Cross-Device Events") { 
event0.record(stream0); event1.record(stream1); - + event0 = std::move(event1); - - CATCH_REQUIRE(event0.device() == 1); + + ASSERT_EQ_CUDA(event0.device(), 1); stream0.synchronize_with(event0); - + cudaStreamSynchronize(stream0); - CATCH_REQUIRE(event0.happened()); + ASSERT_TRUE(event0.happened()); } diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp index 81701733b53693..99421ca225a361 100644 --- a/aten/src/ATen/test/test_parallel.cpp +++ b/aten/src/ATen/test/test_parallel.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,12 +10,11 @@ using namespace at; -CATCH_TEST_CASE( "parallel", "[cpu]" ) { - +TEST(TestParallel, TestParallel) { manual_seed(123, at::kCPU); set_num_threads(1); - Tensor a = rand({1,3}); + Tensor a = rand({1, 3}); a[0][0] = 1; a[0][1] = 0; a[0][2] = 0; @@ -24,5 +22,5 @@ CATCH_TEST_CASE( "parallel", "[cpu]" ) { as[0] = 1; as[1] = 0; as[2] = 0; - CATCH_REQUIRE(a.sum(0).equal(as)); + ASSERT_TRUE(a.sum(0).equal(as)); } diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index c01dff2d0038b1..8518c4f4358365 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/core/UndefinedTensorImpl.h" @@ -8,7 +7,7 @@ using namespace at; -CATCH_TEST_CASE( "undefined tensor test", "[]" ) { +TEST(TestUndefined, UndefinedTest) { manual_seed(123, at::kCPU); // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. @@ -17,36 +16,36 @@ CATCH_TEST_CASE( "undefined tensor test", "[]" ) { std::stringstream ss; ss << und << std::endl; - CATCH_REQUIRE(!und.defined()); - CATCH_REQUIRE(std::string("UndefinedType") == und.toString()); - - _CATCH_REQUIRE_THROWS(und.strides()); - _CATCH_REQUIRE_THROWS(und.dim()); - _CATCH_REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); - _CATCH_REQUIRE_THROWS(und.add(und)); - _CATCH_REQUIRE_THROWS(und.add(ft)); - _CATCH_REQUIRE_THROWS(ft.add(und)); - _CATCH_REQUIRE_THROWS(und.add(5)); - _CATCH_REQUIRE_THROWS(und.mm(und)); + ASSERT_FALSE(und.defined()); + ASSERT_EQ(std::string("UndefinedType"), und.toString()); + + ASSERT_ANY_THROW(und.strides()); + ASSERT_ANY_THROW(und.dim()); + ASSERT_ANY_THROW([]() { return Tensor(); }() = Scalar(5)); + ASSERT_ANY_THROW(und.add(und)); + ASSERT_ANY_THROW(und.add(ft)); + ASSERT_ANY_THROW(ft.add(und)); + ASSERT_ANY_THROW(und.add(5)); + ASSERT_ANY_THROW(und.mm(und)); und.toType(und.type()); - _CATCH_REQUIRE_THROWS(und.toType(ft.type())); - _CATCH_REQUIRE_THROWS(ft.toType(und.type())); + ASSERT_ANY_THROW(und.toType(ft.type())); + ASSERT_ANY_THROW(ft.toType(und.type())); und.toType(ScalarType::Undefined); - _CATCH_REQUIRE_THROWS(und.toType(ScalarType::Float)); - _CATCH_REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); + ASSERT_ANY_THROW(und.toType(ScalarType::Float)); + ASSERT_ANY_THROW(ft.toType(ScalarType::Undefined)); // copy_ - _CATCH_REQUIRE_THROWS(und.copy_(und)); - _CATCH_REQUIRE_THROWS(und.copy_(ft)); - _CATCH_REQUIRE_THROWS(ft.copy_(und)); + ASSERT_ANY_THROW(und.copy_(und)); + ASSERT_ANY_THROW(und.copy_(ft)); + ASSERT_ANY_THROW(ft.copy_(und)); und.toBackend(Backend::Undefined); - _CATCH_REQUIRE_THROWS(und.toBackend(Backend::CPU)); - _CATCH_REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); + 
ASSERT_ANY_THROW(und.toBackend(Backend::CPU)); + ASSERT_ANY_THROW(ft.toBackend(Backend::Undefined)); Tensor to_move = ones({1}, CPU(kFloat)); Tensor m(std::move(to_move)); - CATCH_REQUIRE(!to_move.defined()); - CATCH_REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); + ASSERT_FALSE(to_move.defined()); + ASSERT_EQ(to_move.unsafeGetTensorImpl(), UndefinedTensorImpl::singleton()); } diff --git a/aten/src/ATen/test/weakref_test.cpp b/aten/src/ATen/test/weakref_test.cpp index 42c9f61b19b5e1..3539db77d65517 100644 --- a/aten/src/ATen/test/weakref_test.cpp +++ b/aten/src/ATen/test/weakref_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" @@ -10,53 +9,55 @@ using at::Tensor; using at::WeakTensor; -CATCH_TEST_CASE( "Weak pointer tests", "" ) { - CATCH_SECTION("gets invalidated") { - Tensor a = at::ones({2, 2}); +// Weak pointer tests +// gets invalidated +TEST(TestWeakPointer, WeakPointerGetsInvalidated) { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + a.reset(); + ASSERT_FALSE(b.lock().defined()); +} + +// can successfully lock +TEST(TestWeakPointer, WeakPointerLock) { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + auto c = b.lock(); + ASSERT_TRUE(c.defined()); + + a.reset(); + ASSERT_TRUE(b.lock().defined()); + c.reset(); + ASSERT_FALSE(b.lock().defined()); +} + +// updates refcounts correctly +TEST(TestWeakPointer, WeakUpdatesRefcountsTest) { + Tensor a = at::ones({2, 2}); + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 1); + { WeakTensor b = a; - a.reset(); - CATCH_REQUIRE_FALSE(b.lock().defined()); + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 2); } - - CATCH_SECTION("can successfully lock") { - Tensor a = at::ones({2, 2}); + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 1); + { WeakTensor b = a; - auto c = b.lock(); - CATCH_REQUIRE(c.defined()); - - a.reset(); - CATCH_REQUIRE(b.lock().defined()); - c.reset(); - CATCH_REQUIRE_FALSE(b.lock().defined()); + ASSERT_EQ(a.use_count(), 1); + auto locked = b.lock(); + ASSERT_TRUE(locked.defined()); + ASSERT_EQ(a.use_count(), 2); } - - CATCH_SECTION("updates refcounts correctly") { - Tensor a = at::ones({2, 2}); - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 1); - { - WeakTensor b = a; - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 2); - } - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 1); - { - WeakTensor b = a; - CATCH_REQUIRE(a.use_count() == 1); - auto locked = b.lock(); - CATCH_REQUIRE(locked.defined()); - CATCH_REQUIRE(a.use_count() == 2); - } - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 1); - { - WeakTensor b = a; - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 2); - a.reset(); - CATCH_REQUIRE(b.use_count() == 0); - CATCH_REQUIRE(b.weak_use_count() == 1); - } + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 1); + { + WeakTensor b = a; + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 2); + a.reset(); + ASSERT_EQ(b.use_count(), 0); + ASSERT_EQ(b.weak_use_count(), 1); } } diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp index f76dac212a0921..f08071424625b3 100644 --- a/aten/src/ATen/test/wrapdim_test.cpp +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -1,43 +1,45 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include 
"test_seed.h" using namespace at; +void TestSimpleCase(Type& T) { + auto a = randn({2, 3, 4, 5}, T); + ASSERT_TRUE(a.prod(-4).equal(a.prod(0))); + ASSERT_TRUE(a.prod(3).equal(a.prod(-1))); +} + +void TestExpressionSpecification(Type& T) { + auto a = randn({2, 3, 4, 5}, T); + ASSERT_TRUE(a.unsqueeze(-5).equal(a.unsqueeze(0))); + ASSERT_TRUE(a.unsqueeze(4).equal(a.unsqueeze(-1))); + + // can unsqueeze scalar + auto b = randn(1, T); + b.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE(b.unsqueeze(0).equal(b.unsqueeze(-1))); +} + +void TestEmptyTensor(Type& T) { + auto a = randn(0, T); + ASSERT_TRUE(a.prod(0).equal(at::ones({}, T))); +} + +void TestScalarVs1Dim1Size(Type& T) { + auto a = randn(1, T); + ASSERT_TRUE(a.prod(0).equal(a.prod(-1))); + a.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_EQ(a.dim(), 0); + ASSERT_TRUE(a.prod(0).equal(a.prod(-1))); +} -CATCH_TEST_CASE( "wrapdim test", "[]" ) { +TEST(TestWrapdim, TestWrapdim) { manual_seed(123, at::kCPU); + Type& T = CPU(kFloat); - Type & T = CPU(kFloat); - - CATCH_SECTION( "simple case" ) { - auto a = randn({2, 3, 4, 5}, T); - CATCH_REQUIRE(a.prod(-4).equal(a.prod(0))); - CATCH_REQUIRE(a.prod(3).equal(a.prod(-1))); - } - - CATCH_SECTION( "expression specification" ) { - auto a = randn({2, 3, 4, 5}, T); - CATCH_REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); - CATCH_REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); - - // can unsqueeze scalar - auto b = randn(1, T); - b.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); - } - - CATCH_SECTION( "empty tensor" ) { - auto a = randn(0, T); - CATCH_REQUIRE(a.prod(0).equal(at::ones({}, T))); - } - - CATCH_SECTION( "scalar vs 1-dim, 1-size" ) { - auto a = randn(1, T); - CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); - a.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(a.dim() == 0); - CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); - } + TestSimpleCase(T); + TestEmptyTensor(T); + TestScalarVs1Dim1Size(T); + TestExpressionSpecification(T); } diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index 756fa0f905ac13..8752b0df458cfd 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -96,19 +96,24 @@ static inline __device__ void atomicAdd(int64_t *address, int64_t val) { } static inline __device__ void atomicAdd(at::Half *address, at::Half val) { - unsigned int * address_as_ui = - (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int assumed; + #if ((CUDA_VERSION < 10000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + do { + assumed = old; + at::Half hsum; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + hsum = THCNumerics::add(hsum, val); + old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + #else + atomicAdd(reinterpret_cast<__half*>(address), val); + #endif - do { - assumed = old; - at::Half hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = THCNumerics::add(hsum, val); - old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; - old = atomicCAS(address_as_ui, assumed, old); - } while (assumed != old); } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 157a324f6e45b8..27ec95adbaa82e 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -1,6 +1,7 @@ #ifndef THC_NUMERICS_INC #define THC_NUMERICS_INC +#include #include #include #include diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index f481a6292c7f56..ecbae477282c15 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -81,8 +81,8 @@ void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) { : backend == "cuda" ? "CUDA" : backend == "dnnlowp" ? "DNNLOWP" - : backend == "dnnlowp_16" - ? "DNNLOWP_16" + : backend == "dnnlowp_acc16" + ? "DNNLOWP_ACC16" : backend == "default" ? "" : "NONE"; CAFFE_ENFORCE(engine != "NONE", "Backend is not supported"); for (int i = 0; i < net_def->op_size(); i++) { diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 4b7bab4f42eeb9..490baa56a8acee 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -16,6 +16,9 @@ configure_file( # check with the core PyTorch developers as the dependendency will be # transitively passed on to all libraries dependent on PyTorch. file(GLOB_RECURSE C10_SRCS *.cpp) +# exclude test files +file(GLOB_RECURSE C10_ALL_TEST_FILES test/*.cpp) +exclude(C10_SRCS "${C10_SRCS}" ${C10_ALL_TEST_FILES}) file(GLOB_RECURSE C10_HEADERS *.h) add_library(c10 ${C10_SRCS} ${C10_HEADERS}) # If building shared library, set dllimport/dllexport proper. @@ -31,6 +34,8 @@ target_include_directories( $ $) +add_subdirectory(test) + # ---[ Installation # Note: for now, we will put all export path into one single Caffe2Targets group # to deal with the cmake deployment need. Inside the Caffe2Targets set, the diff --git a/c10/c10_dummy.cpp b/c10/c10_dummy.cpp deleted file mode 100644 index df4e73171da3ff..00000000000000 --- a/c10/c10_dummy.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "c10/c10_dummy.h" - -namespace c10 { -bool HasC10() { - return true; -} -} // namespace c10 diff --git a/c10/c10_dummy.h b/c10/c10_dummy.h deleted file mode 100644 index cf6c6b30c14bbf..00000000000000 --- a/c10/c10_dummy.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#include "c10/macros/Macros.h" - -namespace c10 { -C10_API bool HasC10(); -} diff --git a/c10/macros/Export.h b/c10/macros/Export.h index 8e593e0100bbf9..4527150c8f6803 100644 --- a/c10/macros/Export.h +++ b/c10/macros/Export.h @@ -1,3 +1,6 @@ +#ifndef C10_MACROS_EXPORT_H_ +#define C10_MACROS_EXPORT_H_ + /* Header file to define the common scaffolding for exported symbols. * * Export is by itself a quite tricky situation to deal with, and if you are @@ -9,8 +12,6 @@ * Do NOT include this file directly. Instead, use c10/macros/Macros.h */ -#pragma once - // You do not need to edit this part of file unless you are changing the core // pytorch export abstractions. // @@ -74,3 +75,5 @@ #else #define CAFFE2_API C10_IMPORT #endif + +#endif // C10_MACROS_MACROS_H_ diff --git a/c10/macros/Legacy.h b/c10/macros/Legacy.h deleted file mode 100644 index 86752a838acd32..00000000000000 --- a/c10/macros/Legacy.h +++ /dev/null @@ -1,7 +0,0 @@ -/* A centralized location to provide legacy macro support, and a warning about - * when this legacy compatibility symbol is going to removed in the future. - * - * Do NOT include this file directly. 
Instead, use c10/macros/Macros.h - */ - -#pragma once diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 2b438d670f00de..ad9fafd4ab8f55 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -1,11 +1,12 @@ +#ifndef C10_MACROS_MACROS_H_ +#define C10_MACROS_MACROS_H_ + /* Main entry for c10/macros. * * In your code, include c10/macros/Macros.h directly, instead of individual * files in this folder. */ -#pragma once - // For build systems that do not directly depend on CMake and directly build // from the source directory (such as Buck), one may not have a cmake_macros.h // file at all. In this case, the build system is responsible for providing @@ -28,5 +29,4 @@ classname(const classname&) = delete; \ classname& operator=(const classname&) = delete -// Finally, file that provides legacy support for macros -#include "c10/macros/Legacy.h" +#endif // C10_MACROS_MACROS_H_ diff --git a/c10/macros/cmake_macros.h.in b/c10/macros/cmake_macros.h.in index 73bc803f063551..c211c54bdd7af6 100644 --- a/c10/macros/cmake_macros.h.in +++ b/c10/macros/cmake_macros.h.in @@ -1,6 +1,9 @@ +#ifndef C10_MACROS_CMAKE_MACROS_H_ +#define C10_MACROS_CMAKE_MACROS_H_ + // Automatically generated header file for the C10 library. // Do not include this file directly. Instead, include c10/macros/Macros.h. -#pragma once - #cmakedefine C10_BUILD_SHARED_LIBS + +#endif // C10_MACROS_CMAKE_MACROS_H_ diff --git a/c10/test/CMakeLists.txt b/c10/test/CMakeLists.txt new file mode 100644 index 00000000000000..a2a29f59eb5bd8 --- /dev/null +++ b/c10/test/CMakeLists.txt @@ -0,0 +1,15 @@ +# ---[ Test binaries. + +file(GLOB C10_ALL_TEST_FILES *.cpp) +if (BUILD_TEST) + foreach(test_src ${C10_ALL_TEST_FILES}) + get_filename_component(test_file_name ${test_src} NAME_WE) + set(test_name "c10_${test_file_name}") + add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} c10 gtest_main) + add_test(NAME ${test_name} COMMAND $) + if (INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + endif() + endforeach() +endif() diff --git a/c10/test/registry_test.cpp b/c10/test/registry_test.cpp new file mode 100644 index 00000000000000..c6e7f620e602b5 --- /dev/null +++ b/c10/test/registry_test.cpp @@ -0,0 +1,49 @@ +#include +#include +#include + +#include "c10/util/Registry.h" + +// Note: we use a different namespace to test if the macros defined in +// Registry.h actuall works with a different namespace from c10. 
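The c10/test/CMakeLists.txt added above builds every `*.cpp` under `c10/test/` as its own `c10_<name>` gtest binary linked against `gtest_main`, so a test file needs no `main()`. A minimal sketch of such a file, not part of this diff (file name, test name, and the macro it checks are illustrative only):

```
#include <gtest/gtest.h>

#include "c10/macros/Macros.h"

// Hypothetical c10/test/macros_smoke_test.cpp: the glob above picks it up and
// builds it as the `c10_macros_smoke_test` executable; gtest_main supplies main().
TEST(MacrosSmokeTest, CmakeMacrosHeaderIsVisible) {
#ifdef C10_BUILD_SHARED_LIBS
  SUCCEED() << "c10 was configured as a shared library";
#else
  SUCCEED() << "c10 was configured as a static library";
#endif
}
```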
+namespace c10_test { + +class Foo { + public: + explicit Foo(int x) { + // LOG(INFO) << "Foo " << x; + } +}; + +C10_DECLARE_REGISTRY(FooRegistry, Foo, int); +C10_DEFINE_REGISTRY(FooRegistry, Foo, int); +#define REGISTER_FOO(clsname) C10_REGISTER_CLASS(FooRegistry, clsname, clsname) + +class Bar : public Foo { + public: + explicit Bar(int x) : Foo(x) { + // LOG(INFO) << "Bar " << x; + } +}; +REGISTER_FOO(Bar); + +class AnotherBar : public Foo { + public: + explicit AnotherBar(int x) : Foo(x) { + // LOG(INFO) << "AnotherBar " << x; + } +}; +REGISTER_FOO(AnotherBar); + +TEST(RegistryTest, CanRunCreator) { + std::unique_ptr bar(FooRegistry()->Create("Bar", 1)); + EXPECT_TRUE(bar != nullptr) << "Cannot create bar."; + std::unique_ptr another_bar(FooRegistry()->Create("AnotherBar", 1)); + EXPECT_TRUE(another_bar != nullptr); +} + +TEST(RegistryTest, ReturnNullOnNonExistingCreator) { + EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr); +} + +} // namespace c10_test diff --git a/c10/util/Registry.h b/c10/util/Registry.h new file mode 100644 index 00000000000000..9f310c73483263 --- /dev/null +++ b/c10/util/Registry.h @@ -0,0 +1,226 @@ +#ifndef C10_UTIL_REGISTRY_H_ +#define C10_UTIL_REGISTRY_H_ + +/** + * Simple registry implementation that uses static variables to + * register object creators during program initialization time. + */ + +// NB: This Registry works poorly when you have other namespaces. +// Make all macro invocations from inside the at namespace. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "c10/util/Type.h" + +namespace c10 { + +template +inline void PrintOffendingKey(const KeyType& /*key*/) { + printf("[key type printing not supported]\n"); +} + +template <> +inline void PrintOffendingKey(const std::string& key) { + printf("Offending key: %s.\n", key.c_str()); +} + +/** + * @brief A template class that allows one to register classes by keys. + * + * The keys are usually a std::string specifying the name, but can be anything + * that can be used in a std::map. + * + * You should most likely not use the Registry class explicitly, but use the + * helper macros below to declare specific registries as well as registering + * objects. + */ +template +class Registry { + public: + typedef std::function Creator; + + Registry() : registry_() {} + + void Register(const SrcType& key, Creator creator) { + std::lock_guard lock(register_mutex_); + // The if statement below is essentially the same as the following line: + // CHECK_EQ(registry_.count(key), 0) << "Key " << key + // << " registered twice."; + // However, CHECK_EQ depends on google logging, and since registration is + // carried out at static initialization time, we do not want to have an + // explicit dependency on glog's initialization function. + if (registry_.count(key) != 0) { + printf("Key already registered.\n"); + PrintOffendingKey(key); + std::exit(1); + } + registry_[key] = creator; + } + + void Register( + const SrcType& key, + Creator creator, + const std::string& help_msg) { + Register(key, creator); + help_message_[key] = help_msg; + } + + inline bool Has(const SrcType& key) { + return (registry_.count(key) != 0); + } + + ObjectPtrType Create(const SrcType& key, Args... args) { + if (registry_.count(key) == 0) { + // Returns nullptr if the key is not registered. + return nullptr; + } + return registry_[key](args...); + } + + /** + * Returns the keys currently registered as a std::vector. 
+ */ + std::vector Keys() const { + std::vector keys; + for (const auto& it : registry_) { + keys.push_back(it.first); + } + return keys; + } + + inline const std::unordered_map& HelpMessage() const { + return help_message_; + } + + const char* HelpMessage(const SrcType& key) const { + auto it = help_message_.find(key); + if (it == help_message_.end()) { + return nullptr; + } + return it->second.c_str(); + } + + private: + std::unordered_map registry_; + std::unordered_map help_message_; + std::mutex register_mutex_; + + C10_DISABLE_COPY_AND_ASSIGN(Registry); +}; + +template +class Registerer { + public: + Registerer( + const SrcType& key, + Registry* registry, + typename Registry::Creator creator, + const std::string& help_msg = "") { + registry->Register(key, creator, help_msg); + } + + template + static ObjectPtrType DefaultCreator(Args... args) { + return ObjectPtrType(new DerivedType(args...)); + } +}; + +/** + * C10_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + */ +#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 +#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) +#else +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) +#endif + +/** + * C10_DECLARE_TYPED_REGISTRY is a macro that expands to a function + * declaration, as well as creating a convenient typename for its corresponding + * registerer. + */ +// Note on C10_IMPORT and C10_EXPORT below: we need to explicitly mark DECLARE +// as import and DEFINE as export, because these registry macros will be used +// in downstream shared libraries as well, and one cannot use *_API - the API +// macro will be defined on a per-shared-library basis. Semantically, when one +// declares a typed registry it is always going to be IMPORT, and when one +// defines a registry (which should happen ONLY ONCE and ONLY IN SOURCE FILE), +// the instantiation unit is always going to be exported. +// +// The only unique condition is when in the same file one does DECLARE and +// DEFINE - in Windows compilers, this generates a warning that dllimport and +// dllexport are mixed, but the warning is fine and linker will be properly +// exporting the symbol. Same thing happens in the gflags flag declaration and +// definition caes. +#define C10_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + C10_IMPORT ::c10::Registry, ##__VA_ARGS__>* \ + RegistryName(); \ + typedef ::c10::Registerer, ##__VA_ARGS__> \ + Registerer##RegistryName + +#define C10_DEFINE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + C10_EXPORT ::c10::Registry, ##__VA_ARGS__>* \ + RegistryName() { \ + static ::c10::Registry, ##__VA_ARGS__>* \ + registry = new ::c10:: \ + Registry, ##__VA_ARGS__>(); \ + return registry; \ + } + +// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated +// creator with comma in its templated arguments. +#define C10_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ + static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, RegistryName(), ##__VA_ARGS__); + +#define C10_REGISTER_TYPED_CLASS(RegistryName, key, ...) 
\ + static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, \ + RegistryName(), \ + Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ + ::c10::demangle_type<__VA_ARGS__>()); + +// C10_DECLARE_REGISTRY and C10_DEFINE_REGISTRY are hard-wired to use +// std::string as the key type, because that is the most commonly used cases. +#define C10_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) + +#define C10_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) + +#define C10_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) + +#define C10_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) + +// C10_REGISTER_CREATOR and C10_REGISTER_CLASS are hard-wired to use std::string +// as the key +// type, because that is the most commonly used cases. +#define C10_REGISTER_CREATOR(RegistryName, key, ...) \ + C10_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) + +#define C10_REGISTER_CLASS(RegistryName, key, ...) \ + C10_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) + +} // namespace c10 + +#endif // C10_UTIL_REGISTRY_H_ diff --git a/c10/util/Type.cpp b/c10/util/Type.cpp new file mode 100644 index 00000000000000..3e00055c699104 --- /dev/null +++ b/c10/util/Type.cpp @@ -0,0 +1,59 @@ +#include "c10/util/Type.h" + +#include +#include +#include + +#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) +#define HAS_DEMANGLE 0 +#elif defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) +#define HAS_DEMANGLE 0 +#else +#define HAS_DEMANGLE 1 +#endif + +#if HAS_DEMANGLE + +#include +#include + +namespace c10 { + +std::string demangle(const char* name) { + int status = -1; + + // This function will demangle the mangled function name into a more human + // readable format, e.g. _Z1gv -> g(). + // More information: + // https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/libsupc%2B%2B/cxxabi.h + // NOTE: `__cxa_demangle` returns a malloc'd string that we have to free + // ourselves. + std::unique_ptr> demangled( + abi::__cxa_demangle( + name, + /*__output_buffer=*/nullptr, + /*__length=*/0, + &status), + /*deleter=*/free); + + // Demangling may fail, for example when the name does not follow the + // standard C++ (Itanium ABI) mangling scheme. This is the case for `main` + // or `clone` for example, so the mangled name is a fine default. + if (status == 0) { + return demangled.get(); + } else { + return name; + } +} + +} // namespace c10 + +#else // HAS_DEMANGLE +namespace c10 { +std::string demangle(const char* name) { + return std::string(name); +} +} // namespace c10 + +#endif // HAS_DEMANGLE diff --git a/c10/util/Type.h b/c10/util/Type.h new file mode 100644 index 00000000000000..ddaa0c258753a7 --- /dev/null +++ b/c10/util/Type.h @@ -0,0 +1,28 @@ +#ifndef C10_UTIL_TYPE_H_ +#define C10_UTIL_TYPE_H_ + +#include +#include +#include + +#include "c10/macros/Macros.h" + +namespace c10 { + +/// Utility to demangle a C++ symbol name. +C10_API std::string demangle(const char* name); + +/// Returns the printable name of the type. 
+template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const std::string name = demangle(typeid(T).name()); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +} // namespace c10 + +#endif // C10_UTIL_TYPE_H_ diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 885ca028fb2464..07f69d9f7bab98 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -400,9 +400,6 @@ if (BUILD_TEST) target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) - target_compile_features(${test_name} PRIVATE cxx_range_for) - endif() add_test(NAME ${test_name} COMMAND $) if (INSTALL_TEST) install(TARGETS ${test_name} DESTINATION test) @@ -416,9 +413,6 @@ if (BUILD_TEST) target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) - target_compile_features(${test_name} PRIVATE cxx_range_for) - endif() add_test(NAME ${test_name} COMMAND $) if (INSTALL_TEST) install(TARGETS ${test_name} DESTINATION test) @@ -434,9 +428,6 @@ if (BUILD_TEST) target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) - target_compile_features(${test_name} PRIVATE cxx_range_for) - endif() add_test(NAME ${test_name} COMMAND $) if (INSTALL_TEST) install(TARGETS ${test_name} DESTINATION test) diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h index 96bc720ccd59d1..aa41595ae06b66 100644 --- a/caffe2/core/allocator.h +++ b/caffe2/core/allocator.h @@ -1,6 +1,7 @@ #ifndef CAFFE2_CORE_ALLOCATOR_H_ #define CAFFE2_CORE_ALLOCATOR_H_ +#include #include #include "caffe2/core/logging.h" diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index e09a54cbd2df56..06f278aac2ae86 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -21,13 +21,13 @@ inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) { return false; } const Tensor* tensor = &blob.Get(); - return tensor && tensor->GetDeviceType() == device_type; + return tensor && *tensor && tensor->GetDeviceType() == device_type; } inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) { if (blob->IsType()) { Tensor* tensor = blob->GetMutable(); - if (tensor->GetDeviceType() == device_type) { + if (*tensor && tensor->GetDeviceType() == device_type) { return tensor; } } diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index d4ef19db69ce4f..8126b3d59425a1 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -196,7 +196,7 @@ void TensorSerializer::Serialize( const TensorProto::DataType data_type = TypeMetaToDataType(input.meta()); proto.set_data_type(data_type); StoreDeviceDetail(input, &proto); - auto uniq_ptr = input.GetStaticContext()->CreateContext(); + auto uniq_ptr = CreateContext(input.GetDevice()); // A lot of copypaste is error prone. Should we create a macro for this? 
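The `c10::demangle`/`c10::demangle_type` helpers introduced in c10/util/Type.h above can be exercised on their own. A minimal sketch, not part of this diff, assuming RTTI is enabled (the demangled spelling is compiler-dependent; without `__GXX_RTTI`, `demangle_type()` returns a placeholder string):

```
#include <cstdio>
#include <typeinfo>

#include "c10/util/Type.h"

namespace demo {
struct Widget {};  // hypothetical type, used only to have something to demangle
}  // namespace demo

int main() {
  // demangle_type<T>() caches the demangled name and returns a C string,
  // e.g. "demo::Widget" with GCC/Clang.
  std::printf("%s\n", c10::demangle_type<demo::Widget>());
  // demangle() accepts any mangled name, e.g. straight from typeid().
  std::printf("%s\n", c10::demangle(typeid(demo::Widget).name()).c_str());
  return 0;
}
```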
switch (data_type) { case TensorProto_DataType_FLOAT: @@ -322,13 +322,13 @@ void TensorSerializer::StoreDeviceDetail( input.ExtractDeviceOption(proto->mutable_device_detail()); } // The actual serialization registry objects. -CAFFE_DEFINE_TYPED_REGISTRY( +C10_DEFINE_TYPED_REGISTRY( BlobSerializerRegistry, TypeIdentifier, BlobSerializerBase, std::unique_ptr); -CAFFE_DEFINE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); +C10_DEFINE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); void DeserializeBlob(const string& content, Blob* result) { BlobProto blob_proto; @@ -371,8 +371,7 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) { void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) { // We create a local context for deserializing. Since Caffe2 contexts are // usually lightweight, this should not involve too much overhead. - auto uniq_ptr = - tensor->GetStaticContext()->CreateContext(proto.device_detail()); + auto uniq_ptr = CreateContext(OptionToDevice(proto.device_detail())); auto context = uniq_ptr.get(); context->SwitchToDevice(0); vector dims; diff --git a/caffe2/core/blob_serializer_base.h b/caffe2/core/blob_serializer_base.h index b51f3da21a30f4..4e0e3e4d6d18fe 100644 --- a/caffe2/core/blob_serializer_base.h +++ b/caffe2/core/blob_serializer_base.h @@ -3,8 +3,8 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" namespace caffe2 { @@ -57,13 +57,13 @@ class BlobSerializerBase { }; // The Blob serialization registry and serializer creator functions. -CAFFE_DECLARE_TYPED_REGISTRY( +C10_DECLARE_TYPED_REGISTRY( BlobSerializerRegistry, TypeIdentifier, BlobSerializerBase, std::unique_ptr); #define REGISTER_BLOB_SERIALIZER(id, ...) \ - CAFFE_REGISTER_TYPED_CLASS(BlobSerializerRegistry, id, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(BlobSerializerRegistry, id, __VA_ARGS__) // Creates an operator with the given operator definition. inline unique_ptr CreateSerializer(TypeIdentifier id) { return BlobSerializerRegistry()->Create(id); @@ -82,9 +82,9 @@ class CAFFE2_API BlobDeserializerBase { virtual void Deserialize(const BlobProto& proto, Blob* blob) = 0; }; -CAFFE_DECLARE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); +C10_DECLARE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); #define REGISTER_BLOB_DESERIALIZER(name, ...) \ - CAFFE_REGISTER_CLASS(BlobDeserializerRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(BlobDeserializerRegistry, name, __VA_ARGS__) // Creates an operator with the given operator definition. 
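The serializer registries above show the mechanical part of the rename; downstream registries migrate the same way, keeping their arguments and `std::string` keys. A hedged sketch, not part of this diff, using a purely hypothetical registry:

```
#include <memory>
#include <string>

#include "c10/util/Registry.h"

// Hypothetical base class and implementation, for illustration only.
struct Codec {
  virtual ~Codec() {}
};
struct GzipCodec : Codec {};

// CAFFE_DECLARE_REGISTRY / CAFFE_DEFINE_REGISTRY / CAFFE_REGISTER_CLASS become
// their C10_* counterparts with identical arguments.
C10_DECLARE_REGISTRY(CodecRegistry, Codec);  // normally in a header
C10_DEFINE_REGISTRY(CodecRegistry, Codec);   // in exactly one .cc file
C10_REGISTER_CLASS(CodecRegistry, GzipCodec, GzipCodec);

std::unique_ptr<Codec> MakeGzipCodec() {
  // Create() returns nullptr for unknown keys instead of throwing.
  return CodecRegistry()->Create("GzipCodec");
}
```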
inline unique_ptr CreateDeserializer(const string& type) { return BlobDeserializerRegistry()->Create(type); diff --git a/caffe2/core/blob_stats.h b/caffe2/core/blob_stats.h index 67f9e88e2edc62..5c9f80f518f91c 100644 --- a/caffe2/core/blob_stats.h +++ b/caffe2/core/blob_stats.h @@ -1,7 +1,7 @@ #pragma once +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" -#include "caffe2/core/registry.h" #include "caffe2/core/typeid.h" #include @@ -33,7 +33,7 @@ struct BlobStatRegistry { #define REGISTER_BLOB_STAT_GETTER(Type, BlobStatGetterClass) \ static BlobStatRegistry::Registrar \ - CAFFE_ANONYMOUS_VARIABLE(BlobStatRegistry) + C10_ANONYMOUS_VARIABLE(BlobStatRegistry) namespace BlobStat { diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index bb2f4ba6a91818..d856655433aa3e 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -3,6 +3,7 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/blob_serialization.h" #include "caffe2/core/common.h" @@ -11,7 +12,6 @@ #include "caffe2/core/operator.h" #include "caffe2/core/qtensor.h" #include "caffe2/core/qtensor_serialization.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/types.h" #include "caffe2/core/workspace.h" @@ -967,7 +967,7 @@ CAFFE_KNOWN_TYPE(DummyType); namespace { REGISTER_BLOB_SERIALIZER((TypeMeta::Id()), DummyTypeSerializer); -CAFFE_REGISTER_TYPED_CLASS( +C10_REGISTER_TYPED_CLASS( BlobDeserializerRegistry, "DummyType", DummyTypeDeserializer); diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 93bbf341b5061a..d1803a6a2d2812 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -24,16 +24,13 @@ // Macros used during the build of this caffe2 instance. This header file // is automatically generated by the cmake script during build. +#include "caffe2/core/common.h" #include "caffe2/core/macros.h" #include "c10/macros/Macros.h" namespace caffe2 { -// Data type for caffe2 Index/Size. We use size_t to be safe here as well as for -// large matrices that are common in sparse math. -typedef int64_t TIndex; - // Note(Yangqing): NVCC does not play well with unordered_map on some platforms, // forcing us to use std::map instead of unordered_map. 
This may affect speed // in some cases, but in most of the computation code we do not access map very diff --git a/caffe2/core/common_gpu.cc b/caffe2/core/common_gpu.cc index 9e39a85721186f..e2794bbd39d92f 100644 --- a/caffe2/core/common_gpu.cc +++ b/caffe2/core/common_gpu.cc @@ -2,6 +2,7 @@ #include #include +#include #include #include "caffe2/core/asan.h" diff --git a/caffe2/core/context.cc b/caffe2/core/context.cc index 30819afdc4ce3f..94047eb71ee0b6 100644 --- a/caffe2/core/context.cc +++ b/caffe2/core/context.cc @@ -5,6 +5,10 @@ #include #endif +namespace at { + +REGISTER_CONTEXT(DeviceType::CPU, caffe2::CPUContext); +} // namespace at namespace caffe2 { uint32_t RandomNumberSeed() { diff --git a/caffe2/core/context.h b/caffe2/core/context.h index aff66534d22198..af66396af72c44 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -50,6 +50,8 @@ class CAFFE2_API CPUContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_CPU); } + explicit CPUContext(const at::Device& device) + : CPUContext(DeviceToOption(device)) {} ~CPUContext() noexcept override {} @@ -192,15 +194,6 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { return data_and_deleter; } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return CPU; } diff --git a/caffe2/core/context_base.cc b/caffe2/core/context_base.cc index b61b73cbad1cb5..99996d9e165b9b 100644 --- a/caffe2/core/context_base.cc +++ b/caffe2/core/context_base.cc @@ -1,4 +1,5 @@ #include "context_base.h" namespace caffe2 { + } // namespace caffe2 diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 1eaa579ee0cdbe..0d9e2686212a1e 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -57,6 +57,11 @@ CAFFE2_DEFINE_int( 128, "The threshold in MB on how frequently to report memory changes"); +namespace at { + +REGISTER_CONTEXT(DeviceType::CUDA, caffe2::CUDAContext); +} // namespace at + namespace caffe2 { ThreadLocalCUDAObjects& CUDAContext::getCudaObjects() { diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 5fcdb98b100794..ce73f5f942828b 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -142,6 +142,8 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { // The default cuda context constructor. 
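With `REGISTER_CONTEXT` and the `at::Device`-based constructors added above, a context can be built straight from a device instead of asking the static context, which is what the serialization code now does via `CreateContext(tensor.GetDevice())`. A minimal CPU-side sketch, not part of this diff (the helper name is hypothetical):

```
#include "caffe2/core/context.h"

// Hypothetical helper exercising the new explicit CPUContext(const at::Device&)
// constructor.
void TouchCpuContext() {
  caffe2::CPUContext ctx(at::Device(at::DeviceType::CPU));
  ctx.SwitchToDevice(0);           // trivial for CPU, kept for API symmetry
  ctx.FinishDeviceComputation();   // likewise a no-op on the CPU backend
}
```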
explicit CUDAContext(const int gpu_id = -1); explicit CUDAContext(const DeviceOption& option); + explicit CUDAContext(const at::Device& device) + : CUDAContext(DeviceToOption(device)) {} ~CUDAContext() override { if (curand_generator_) { @@ -385,19 +387,6 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - - std::unique_ptr CreateContext(int gpu_id = -1) { - return caffe2::make_unique(gpu_id); - } - DeviceType GetDeviceType() override { return CUDA; } diff --git a/caffe2/core/db.cc b/caffe2/core/db.cc index 720c2dcaa46de1..c0031cb0661ec8 100644 --- a/caffe2/core/db.cc +++ b/caffe2/core/db.cc @@ -12,7 +12,7 @@ CAFFE_KNOWN_TYPE(db::Cursor); namespace db { -CAFFE_DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); +C10_DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); // Below, we provide a bare minimum database "minidb" as a reference // implementation as well as a portable choice to store data. diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 39f8b6f3f02b0d..f6044ff35f8273 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -3,8 +3,8 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/blob_serialization.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" namespace caffe2 { @@ -104,9 +104,9 @@ class CAFFE2_API DB { // Database classes are registered by their names so we can do optional // dependencies. -CAFFE_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); +C10_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); #define REGISTER_CAFFE2_DB(name, ...) \ - CAFFE_REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__) /** * Returns a database object of the given database type, source and mode. 
The diff --git a/caffe2/core/flags.cc b/caffe2/core/flags.cc index a84d298466dc03..43131d8beebd27 100644 --- a/caffe2/core/flags.cc +++ b/caffe2/core/flags.cc @@ -1,6 +1,7 @@ #include "caffe2/core/flags.h" #include +#include #include #include "caffe2/core/logging.h" @@ -33,8 +34,7 @@ C10_EXPORT bool CommandLineFlagsHasBeenParsed() { #else // CAFFE2_USE_GFLAGS - -CAFFE_DEFINE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); +C10_DEFINE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); namespace { static bool gCommandLineFlagsParsed = false; diff --git a/caffe2/core/flags.h b/caffe2/core/flags.h index 4e39c7bdebf137..98b137c2f723ef 100644 --- a/caffe2/core/flags.h +++ b/caffe2/core/flags.h @@ -20,7 +20,8 @@ #ifndef CAFFE2_CORE_FLAGS_H_ #define CAFFE2_CORE_FLAGS_H_ -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" +#include "caffe2/core/common.h" namespace caffe2 { /** @@ -142,7 +143,7 @@ class CAFFE2_API Caffe2FlagParser { bool success_; }; -CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); +C10_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); } // namespace caffe2 diff --git a/caffe2/core/hip/context_hip.cc b/caffe2/core/hip/context_hip.cc index 0fabb20a642c94..3eadaf0e71b118 100644 --- a/caffe2/core/hip/context_hip.cc +++ b/caffe2/core/hip/context_hip.cc @@ -50,6 +50,11 @@ CAFFE2_DEFINE_int(caffe2_gpu_memory_report_interval_mb, 128, "The threshold in MB on how frequently to report memory changes"); +namespace at { + +REGISTER_CONTEXT(DeviceType::HIP, caffe2::HIPContext); +} // namespace at + namespace caffe2 { thread_local ThreadLocalHIPObjects HIPContext::hip_objects_; @@ -408,13 +413,12 @@ void HIPStaticContext::Delete(void* ptr) { g_hip_device_affiliation.erase(it); break; } - case HipMemoryPoolType::THC: - { - HIP_ENFORCE(g_thc_allocator->Free(ptr)); - if (FLAGS_caffe2_gpu_memory_tracking) { - g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); - } - break; + case HipMemoryPoolType::THC: { + HIP_ENFORCE(g_thc_allocator->Free(ptr)); + if (FLAGS_caffe2_gpu_memory_tracking) { + g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); + } + break; } } } diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index 5a7613cf934fd0..fb04336354e704 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -127,6 +127,8 @@ class HIPContext final : public BaseContext { // The default HIP context constructor. 
explicit HIPContext(const int gpu_id = -1); explicit HIPContext(const DeviceOption& option); + explicit HIPContext(const at::Device& device) + : HIPContext(DeviceToOption(device)) {} ~HIPContext() override { if (hiprand_generator_) { @@ -374,19 +376,6 @@ class HIPStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - - std::unique_ptr CreateContext(int gpu_id = -1) { - return caffe2::make_unique(gpu_id); - } - DeviceType GetDeviceType() override { return HIP; } diff --git a/caffe2/core/hip/net_async_hip_thread_pool_hip.cc b/caffe2/core/hip/net_async_hip_thread_pool_hip.cc index e1b4ff2aebb0fc..3ad9336e6d2aee 100644 --- a/caffe2/core/hip/net_async_hip_thread_pool_hip.cc +++ b/caffe2/core/hip/net_async_hip_thread_pool_hip.cc @@ -55,6 +55,6 @@ GetAsyncNetHIPThreadPool(int hip_gpu_id, int pool_size, bool create_new) { } } -CAFFE_REGISTER_CREATOR(ThreadPoolRegistry, HIP, GetAsyncNetHIPThreadPool); +C10_REGISTER_CREATOR(ThreadPoolRegistry, HIP, GetAsyncNetHIPThreadPool); } // namespace caffe2 diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc index cd057444d31cf4..30603888ad1c44 100644 --- a/caffe2/core/logging.cc +++ b/caffe2/core/logging.cc @@ -3,6 +3,7 @@ #include #include +#include #include // Common code that we use regardless of whether we use glog or not. @@ -12,6 +13,11 @@ CAFFE2_DEFINE_bool(caffe2_use_fatal_for_enforce, false, "of throwing an exception."); namespace caffe2 { +namespace enforce_detail { +/* implicit */ EnforceFailMessage::EnforceFailMessage(std::string&& msg) { + msg_ = new std::string(std::move(msg)); +} +} // namespace enforce_detail size_t ReplaceAll(string& s, const char* from, const char* to) { CAFFE_ENFORCE(from && *from); diff --git a/caffe2/core/logging.h b/caffe2/core/logging.h index 288c34afd5dbe7..859ee4765683a1 100644 --- a/caffe2/core/logging.h +++ b/caffe2/core/logging.h @@ -187,9 +187,8 @@ class CAFFE2_API EnforceFailMessage { "like `Equals`. Use CAFFE_ENFORCE for simple boolean checks."); } - /* implicit */ EnforceFailMessage(std::string&& msg) { - msg_ = new std::string(std::move(msg)); - } + /* implicit */ EnforceFailMessage(std::string&& msg); + inline bool bad() const { return msg_ != nullptr; } diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc index 77934f6be12d45..c72c34e37e8c00 100644 --- a/caffe2/core/net.cc +++ b/caffe2/core/net.cc @@ -19,7 +19,7 @@ CAFFE2_DEFINE_string( namespace caffe2 { -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( NetRegistry, NetBase, const std::shared_ptr&, diff --git a/caffe2/core/net.h b/caffe2/core/net.h index 57fd53f1de4f12..30ef4bde50cab7 100644 --- a/caffe2/core/net.h +++ b/caffe2/core/net.h @@ -9,12 +9,12 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" @@ -134,15 +134,15 @@ class CAFFE2_API ExecutorHelper { virtual ~ExecutorHelper() {} }; -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( NetRegistry, NetBase, const std::shared_ptr&, Workspace*); #define REGISTER_NET_CREATOR(key, ...) 
\ - CAFFE_REGISTER_CREATOR(NetRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(NetRegistry, key, __VA_ARGS__) #define REGISTER_NET(name, ...) \ - CAFFE_REGISTER_CLASS(NetRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(NetRegistry, name, __VA_ARGS__) /** * @brief Creates a network, accessing / creating blobs in the given workspace. diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index b40a8fa33778a7..fe4b57cd3326d4 100644 --- a/caffe2/core/net_async_base.cc +++ b/caffe2/core/net_async_base.cc @@ -119,7 +119,7 @@ bool AsyncNetBase::RunAsync() { return DoRunAsync(); } -TaskThreadPool* AsyncNetBase::pool_getter( +TaskThreadPool* AsyncNetBase::poolGetter( PoolsMap& pools, int device_type, int device_id, @@ -136,7 +136,7 @@ TaskThreadPool* AsyncNetBase::pool_getter( TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { if (use_single_pool_) { - return pool_getter(cpu_pools_, PROTO_CPU, -1, num_workers_); + return poolGetter(cpu_pools_, PROTO_CPU, -1, num_workers_); } static const std::unordered_set cpu_types{ PROTO_CPU, @@ -155,13 +155,13 @@ TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { FLAGS_caffe2_net_async_max_numa_nodes, "Invalid NUMA node id: ", numa_node_id); - return pool_getter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); + return poolGetter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); } else if (device_option.device_type() == PROTO_CUDA) { auto gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE( gpu_id >= 0 && gpu_id < FLAGS_caffe2_net_async_max_gpus, "Invalid GPU id: " + caffe2::to_string(gpu_id)); - return pool_getter(gpu_pools_, PROTO_CUDA, gpu_id, num_workers_); + return poolGetter(gpu_pools_, PROTO_CUDA, gpu_id, num_workers_); } else { CAFFE_THROW( "Unsupported device type " + @@ -281,10 +281,20 @@ bool AsyncNetBase::testAndSetScheduled(int task_id) { return !task_op_node.scheduled_.test_and_set(); } -int AsyncNetBase::num_ops(int task_id) const { +int AsyncNetBase::numOps(int task_id) const { return chains_[task_id].size(); } +const OperatorBase* AsyncNetBase::firstTaskOp(int task_id) const { + auto op_id = chains_[task_id].front(); + return operator_nodes_[op_id].operator_.get(); +} + +const OperatorBase* AsyncNetBase::lastTaskOp(int task_id) const { + auto op_id = chains_[task_id].back(); + return operator_nodes_[op_id].operator_.get(); +} + void AsyncNetBase::asyncWait( int task_id, int stream_id, @@ -408,14 +418,9 @@ void AsyncNetBase::finalizeEvents() { AsyncNetBase::~AsyncNetBase() {} -CAFFE_DEFINE_SHARED_REGISTRY( - ThreadPoolRegistry, - TaskThreadPool, - int, - int, - bool); +C10_DEFINE_SHARED_REGISTRY(ThreadPoolRegistry, TaskThreadPool, int, int, bool); -CAFFE_REGISTER_CREATOR(ThreadPoolRegistry, CPU, GetAsyncNetCPUThreadPool); +C10_REGISTER_CREATOR(ThreadPoolRegistry, CPU, GetAsyncNetCPUThreadPool); /* static */ std::shared_ptr diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h index 502233e7f045b4..30948853dfb410 100644 --- a/caffe2/core/net_async_base.h +++ b/caffe2/core/net_async_base.h @@ -1,11 +1,11 @@ #ifndef CAFFE2_CORE_NET_ASYNC_BASE_H_ #define CAFFE2_CORE_NET_ASYNC_BASE_H_ +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" #include "caffe2/core/net_async_base.h" #include "caffe2/core/net_dag_utils.h" -#include "caffe2/core/registry.h" #include "caffe2/core/stats.h" #include "caffe2/core/timer.h" #include "caffe2/core/workspace.h" @@ -65,7 +65,9 @@ class CAFFE2_API AsyncNetBase : public NetBase { int 
updateParentCount(int child_id); int getParentCount(int child_id); bool testAndSetScheduled(int task_id); - int num_ops(int task_id) const; + int numOps(int task_id) const; + const OperatorBase* firstTaskOp(int task_id) const; + const OperatorBase* lastTaskOp(int task_id) const; void asyncWait( int task_id, @@ -131,7 +133,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { void storeExceptionPtr(); TaskThreadPool* - pool_getter(PoolsMap& pools, int device_type, int device_id, int pool_size); + poolGetter(PoolsMap& pools, int device_type, int device_id, int pool_size); std::unique_ptr helper_; @@ -139,12 +141,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { friend class tracing::Tracer; }; -CAFFE_DECLARE_SHARED_REGISTRY( - ThreadPoolRegistry, - TaskThreadPool, - int, - int, - bool); +C10_DECLARE_SHARED_REGISTRY(ThreadPoolRegistry, TaskThreadPool, int, int, bool); class AsyncNetExecutorHelper : public ExecutorHelper { public: diff --git a/caffe2/core/net_async_gpu_thread_pool_gpu.cc b/caffe2/core/net_async_gpu_thread_pool_gpu.cc index ca3f691bc49764..dc0bf118ab7956 100644 --- a/caffe2/core/net_async_gpu_thread_pool_gpu.cc +++ b/caffe2/core/net_async_gpu_thread_pool_gpu.cc @@ -6,7 +6,7 @@ CAFFE2_DEFINE_int(caffe2_threads_per_gpu, 1, "Number of CPU threads per GPU"); namespace caffe2 { -CAFFE_REGISTER_CREATOR(ThreadPoolRegistry, CUDA, GetAsyncNetGPUThreadPool); +C10_REGISTER_CREATOR(ThreadPoolRegistry, CUDA, GetAsyncNetGPUThreadPool); std::shared_ptr GetAsyncNetGPUThreadPool(int gpu_id, int pool_size, bool create_new) { diff --git a/caffe2/core/net_async_scheduling.cc b/caffe2/core/net_async_scheduling.cc index 7feb3631abfd66..80d5807295f75a 100644 --- a/caffe2/core/net_async_scheduling.cc +++ b/caffe2/core/net_async_scheduling.cc @@ -35,6 +35,17 @@ void AsyncSchedulingNet::Wait() { } } +bool AsyncSchedulingNet::isInlineTask(int parent_id, int child_id) const { + if (!use_dfs_scheduling_) { + return false; + } + const auto* last_parent_op = lastTaskOp(parent_id); + const auto* first_child_op = firstTaskOp(child_id); + // check that we do not cross device boundary + return IsSameDevice( + last_parent_op->device_option(), first_child_op->device_option()); +} + void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { if (!testAndSetScheduled(task_id)) { return; @@ -63,7 +74,7 @@ void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { canSchedule(child_id)) { // if DFS scheduling is enabled, run children inline, // ignore DFS scheduling in callbacks - schedule(child_id, use_dfs_scheduling_); + schedule(child_id, isInlineTask(task_id, child_id)); } else { bool parent_failed = false; bool parent_needs_polling = false; @@ -102,7 +113,7 @@ void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { if (parent_failed) { // one of parents failed, set failure flag and wrap up execution success_ = false; - schedule(child_id, use_dfs_scheduling_); + schedule(child_id, isInlineTask(task_id, child_id)); } else if (parent_needs_polling) { // some parents are blocking us from scheduling a child and don't // support callbacks, using polling @@ -119,7 +130,7 @@ void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { } } else { // we're ready to schedule a child - schedule(child_id, use_dfs_scheduling_); + schedule(child_id, isInlineTask(task_id, child_id)); } } } diff --git a/caffe2/core/net_async_scheduling.h b/caffe2/core/net_async_scheduling.h index 4fcdf4b7316818..69563c4f20b325 100644 --- a/caffe2/core/net_async_scheduling.h +++ b/caffe2/core/net_async_scheduling.h 
@@ -22,6 +22,7 @@ class CAFFE2_API AsyncSchedulingNet : public AsyncNetBase { void reset() override; virtual void finishRun(); void parentCallback(int parent_id); + bool isInlineTask(int parent_id, int child_id) const; std::mutex running_mutex_; std::condition_variable running_cv_; diff --git a/caffe2/core/net_dag.h b/caffe2/core/net_dag.h index ab3ce0f6f3fa10..7c66217a23ec4d 100644 --- a/caffe2/core/net_dag.h +++ b/caffe2/core/net_dag.h @@ -9,6 +9,7 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" @@ -16,7 +17,6 @@ #include "caffe2/core/net_dag_utils.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/stats.h" #include "caffe2/core/tensor.h" #include "caffe2/core/timer.h" diff --git a/caffe2/core/net_dag_utils.h b/caffe2/core/net_dag_utils.h index 6debfbf7bd8053..0259f10f954652 100644 --- a/caffe2/core/net_dag_utils.h +++ b/caffe2/core/net_dag_utils.h @@ -10,13 +10,13 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/core/net_simple.h b/caffe2/core/net_simple.h index c114fd8d224f21..5b8bc29be4dfae 100644 --- a/caffe2/core/net_simple.h +++ b/caffe2/core/net_simple.h @@ -3,10 +3,10 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/core/net_simple_async.h b/caffe2/core/net_simple_async.h index ea5aae959870f6..abe16f2013789f 100644 --- a/caffe2/core/net_simple_async.h +++ b/caffe2/core/net_simple_async.h @@ -3,10 +3,10 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index e7a889980365c5..523f29225aa07b 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -354,6 +354,11 @@ std::vector nodeIterator(G& g) { return out; } +template +inline std::vector filter(NNModule& nn) { + return nodeIterator(nn.dataFlow); +} + template std::vector> dataIterator(G& g) { std::vector> out; diff --git a/caffe2/core/observer_test.cc b/caffe2/core/observer_test.cc index fa8aee6d818366..b21246a6611789 100644 --- a/caffe2/core/observer_test.cc +++ b/caffe2/core/observer_test.cc @@ -1,11 +1,11 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" #include "caffe2/core/net_dag.h" #include "caffe2/core/net_simple.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator.h" -#include "caffe2/core/registry.h" #include "caffe2/core/scope_guard.h" namespace caffe2 { diff --git a/caffe2/core/operator.cc 
b/caffe2/core/operator.cc index 5f3f653b5a4b21..79be08c03b2325 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -316,31 +316,32 @@ std::map* gDeviceTypeRegistry() { return &g_device_type_registry; } -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( CPUOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); CAFFE_REGISTER_DEVICE_TYPE(CPU, CPUOperatorRegistry); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( CUDAOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); CAFFE_REGISTER_DEVICE_TYPE(CUDA, CUDAOperatorRegistry); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( HIPOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); CAFFE_REGISTER_DEVICE_TYPE(HIP, HIPOperatorRegistry); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( GradientRegistry, GradientMakerBase, - const OperatorDef&, const vector&); + const OperatorDef&, + const vector&); GradientOpsMeta GetGradientForOp( const OperatorDef& def, const vector& g_output) { diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 1a968c4c3755fe..8208eb271bdc1b 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -9,13 +9,13 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_gradient.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/types.h" #include "caffe2/core/workspace.h" @@ -778,13 +778,13 @@ CAFFE2_DEFINE_TENSOR_TYPES_DISPATCHER( // registry function. // (2) Then, one can call the operator registry function to further create the // operators. -typedef Registry< +typedef c10::Registry< std::string, std::unique_ptr, const OperatorDef&, Workspace*> OperatorRegistry; -typedef Registry< +typedef c10::Registry< std::string, std::unique_ptr, const OperatorDef&, @@ -806,7 +806,7 @@ struct CAFFE2_API DeviceTypeRegisterer { #define CAFFE_REGISTER_DEVICE_TYPE(type, registry_function) \ namespace { \ - static DeviceTypeRegisterer CAFFE_ANONYMOUS_VARIABLE( \ + static DeviceTypeRegisterer C10_ANONYMOUS_VARIABLE( \ DeviceType)(type, ®istry_function); \ } @@ -817,69 +817,67 @@ struct CAFFE2_API DeviceTypeRegisterer { // not depend on specific cuda or cudnn libraries. This means that we will be // able to compile it even when there is no cuda available - we simply do not // link any cuda or cudnn operators. -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( CPUOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CPU_OPERATOR(name, ...) \ C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CPU##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ - CAFFE_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__) #define REGISTER_CPU_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(CPUOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(CPUOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_CPU_OPERATOR_WITH_ENGINE(name, engine, ...) 
\ - CAFFE_REGISTER_CLASS(CPUOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(CPUOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( CUDAOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR(name, ...) \ C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CUDA##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ - CAFFE_REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(CUDAOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(CUDAOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS( \ - CUDAOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(CUDAOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) // Macros for cudnn since we use it often #define REGISTER_CUDNN_OPERATOR(name, ...) \ REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, CUDNN, __VA_ARGS__) // Macros for HIP operators -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( HIPOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_HIP_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__) #define REGISTER_HIP_OPERATOR(name, ...) \ C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_HIP##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ - CAFFE_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__) #define REGISTER_HIP_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_HIP_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS( \ - HIPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(HIPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) #define REGISTER_MIOPEN_OPERATOR(name, ...) \ REGISTER_HIP_OPERATOR_WITH_ENGINE(name, MIOPEN, __VA_ARGS__) diff --git a/caffe2/core/operator_c10wrapper.cc b/caffe2/core/operator_c10wrapper.cc index 6fd62ec1cf63b4..523c467b170d58 100644 --- a/caffe2/core/operator_c10wrapper.cc +++ b/caffe2/core/operator_c10wrapper.cc @@ -2,7 +2,7 @@ namespace caffe2 { -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( C10OperatorRegistry, OperatorBase, const OperatorDef&, diff --git a/caffe2/core/operator_c10wrapper.h b/caffe2/core/operator_c10wrapper.h index 695319266901a8..57a3c370ba5e32 100644 --- a/caffe2/core/operator_c10wrapper.h +++ b/caffe2/core/operator_c10wrapper.h @@ -284,7 +284,7 @@ struct ParameterHelper final { } }; -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( C10OperatorRegistry, OperatorBase, const OperatorDef&, @@ -293,14 +293,14 @@ CAFFE_DECLARE_REGISTRY( // TODO Currently we only register the CPU variant. This is going to be fixed // once the tensor detemplatization lands. 
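For operator code the public macros keep their old names; only their expansions switch to the C10_* registry machinery, so a typical registration still reads as below. A hedged sketch, not part of this diff (the operator is a hypothetical no-op, shown only for shape):

```
#include "caffe2/core/operator.h"

namespace caffe2 {

// Hypothetical do-nothing operator used purely to illustrate registration.
class NoOpExampleOp final : public Operator<CPUContext> {
 public:
  NoOpExampleOp(const OperatorDef& def, Workspace* ws)
      : Operator<CPUContext>(def, ws) {}
  bool RunOnDevice() override {
    return true;  // a real operator would read Input(i) and fill Output(i)
  }
};

// REGISTER_CPU_OPERATOR now expands to C10_REGISTER_CLASS(CPUOperatorRegistry,
// ...) and OPERATOR_SCHEMA to a C10_ANONYMOUS_VARIABLE, but call sites are
// unchanged.
REGISTER_CPU_OPERATOR(NoOpExample, NoOpExampleOp);
OPERATOR_SCHEMA(NoOpExample).NumInputs(0).NumOutputs(0);

}  // namespace caffe2
```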
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH(OpSchemaDef, State, Name) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper>) #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_WITH_PARAMETERS( \ OpSchemaDef, State, Name, ...) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper< \ @@ -312,14 +312,14 @@ CAFFE_DECLARE_REGISTRY( #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_WITH_ARRAY_INPUT( \ OpSchemaDef, State, Name) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper>) #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_WITH_ARRAY_INPUT_AND_PARAMETERS( \ OpSchemaDef, State, Name, ...) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper< \ diff --git a/caffe2/core/operator_gradient.h b/caffe2/core/operator_gradient.h index 3eea164c21b840..2eb5b581092c30 100644 --- a/caffe2/core/operator_gradient.h +++ b/caffe2/core/operator_gradient.h @@ -1,8 +1,8 @@ #ifndef CAFFE2_CORE_OPERATOR_GRADIENT_H_ #define CAFFE2_CORE_OPERATOR_GRADIENT_H_ +#include "c10/util/Registry.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/proto_utils.h" @@ -295,16 +295,16 @@ struct GradientNotImplementedYet : public GradientMakerBase { } }; -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( GradientRegistry, GradientMakerBase, const OperatorDef&, const vector&); #define REGISTER_GRADIENT(name, ...) \ - CAFFE_REGISTER_CLASS(GradientRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(GradientRegistry, name, __VA_ARGS__) #define REGISTER_GRADIENT_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(GradientRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(GradientRegistry, str_name, __VA_ARGS__) // NO_GRADIENT means that the operator does not need any gradient computation. #define NO_GRADIENT(name) REGISTER_GRADIENT(name, NoGradient) diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index 54a6a17b8a0d24..a938d8f56afc93 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -9,9 +9,9 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/filler.h" @@ -578,14 +578,14 @@ OpSchema::Cost PointwiseCostInference( #define OPERATOR_SCHEMA(name) \ C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ + static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #else // CAFFE2_NO_OPERATOR_SCHEMA #define OPERATOR_SCHEMA(name) \ C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ + static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ 1 ? 
nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #endif // CAFFE2_NO_OPERATOR_SCHEMA diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 8e48b6b7beabca..51faaed9e7eec9 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -489,7 +489,9 @@ bool RunPlanOnWorkspace( NetDefMap net_defs; for (const NetDef& net_def : plan.network()) { - LOG(INFO) << "Processing net '" << net_def.name() << "'"; + LOG(INFO) << "Processing net '" << net_def.name() << "', type: '" + << net_def.type() << "', #ops: " << net_def.op_size() + << ", num_workers: " << net_def.num_workers(); CAFFE_ENFORCE( net_defs.count(net_def.name()) == 0, "Your plan contains networks of the same name \"", diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h deleted file mode 100644 index f026795b23c3e1..00000000000000 --- a/caffe2/core/registry.h +++ /dev/null @@ -1,207 +0,0 @@ -/** - * Simple registry implementation in Caffe2 that uses static variables to - * register object creators during program initialization time. - * - * WARNING: this registry is not entirely thread-safe, as reads to - * the registry are not protected by a mutex. The safest mode of use - * is to dlopen() *all* dynamic libraries that may write to the library - * and synchronize prior to performing any reads on the registry. - */ -#ifndef CAFFE2_CORE_REGISTRY_H_ -#define CAFFE2_CORE_REGISTRY_H_ - -#include -#include -#include -#include -#include -#include - -#include - -#include "caffe2/core/common.h" -#include "caffe2/core/typeid.h" - -namespace caffe2 { - -/** - * @brief A template class that allows one to register classes by keys. - * - * The keys are usually a string specifying the name, but can be anything that - * can be used in a std::map. - * - * You should most likely not use the Registry class explicitly, but use the - * helper macros below to declare specific registries as well as registering - * objects. - */ -template -class Registry { - public: - typedef std::function Creator; - - Registry() : registry_() {} - - void Register(const SrcType& key, Creator creator) { - // The if statement below is essentially the same as the following line: - // CHECK_EQ(registry_.count(key), 0) << "Key " << key - // << " registered twice."; - // However, CHECK_EQ depends on google logging, and since registration is - // carried out at static initialization time, we do not want to have an - // explicit dependency on glog's initialization function. - std::lock_guard lock(register_mutex_); - if (registry_.count(key) != 0) { - printf("Key already registered.\n"); - at::PrintOffendingKey(key); - std::exit(1); - } - registry_[key] = creator; - } - - void Register(const SrcType& key, Creator creator, const string& help_msg) { - Register(key, creator); - help_message_[key] = help_msg; - } - - inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } - - ObjectPtrType Create(const SrcType& key, Args... args) { - if (registry_.count(key) == 0) { - // Returns nullptr if the key is not registered. - return nullptr; - } - return registry_[key](args...); - } - - /** - * Returns the keys currently registered as a vector. 
- */ - vector Keys() { - vector keys; - for (const auto& it : registry_) { - keys.push_back(it.first); - } - return keys; - } - - const CaffeMap& HelpMessage() const { - return help_message_; - } - - const char* HelpMessage(const SrcType& key) const { - auto it = help_message_.find(key); - if (it == help_message_.end()) { - return nullptr; - } - return it->second.c_str(); - } - - private: - CaffeMap registry_; - CaffeMap help_message_; - std::mutex register_mutex_; - - C10_DISABLE_COPY_AND_ASSIGN(Registry); -}; - -template -class Registerer { - public: - Registerer( - const SrcType& key, - Registry* registry, - typename Registry::Creator creator, - const string& help_msg = "") { - registry->Register(key, creator, help_msg); - } - - template - static ObjectPtrType DefaultCreator(Args... args) { - // TODO(jiayq): old versions of NVCC does not handle make_unique well - // so we are forced to use a unique_ptr constructor here. Check if it is - // fine to use make_unique in the future. - // return make_unique(args...); - return ObjectPtrType(new DerivedType(args...)); - } -}; - -/** - * CAFFE_ANONYMOUS_VARIABLE(str) introduces an identifier starting with - * str and ending with a number that varies with the line. - * Pretty much a copy from 'folly/Preprocessor.h' - */ -#define CAFFE_CONCATENATE_IMPL(s1, s2) s1##s2 -#define CAFFE_CONCATENATE(s1, s2) CAFFE_CONCATENATE_IMPL(s1, s2) -#ifdef __COUNTER__ -#define CAFFE_ANONYMOUS_VARIABLE(str) CAFFE_CONCATENATE(str, __COUNTER__) -#else -#define CAFFE_ANONYMOUS_VARIABLE(str) CAFFE_CONCATENATE(str, __LINE__) -#endif - -/** - * CAFFE_DECLARE_TYPED_REGISTRY is a macro that expands to a function - * declaration, as well as creating a convenient typename for its corresponding - * registerer. - */ -#define CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - C10_EXPORT Registry, ##__VA_ARGS__>* \ - RegistryName(); \ - typedef Registerer, ##__VA_ARGS__> \ - Registerer##RegistryName; - -#define CAFFE_DEFINE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - C10_EXPORT Registry, ##__VA_ARGS__>* \ - RegistryName() { \ - static Registry, ##__VA_ARGS__>* registry = \ - new Registry, ##__VA_ARGS__>(); \ - return registry; \ - } - -// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated -// creator with comma in its templated arguments. -#define CAFFE_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ - namespace { \ - static Registerer##RegistryName CAFFE_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, RegistryName(), __VA_ARGS__); \ - } - -#define CAFFE_REGISTER_TYPED_CLASS(RegistryName, key, ...) \ - namespace { \ - static Registerer##RegistryName CAFFE_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, \ - RegistryName(), \ - Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - at::demangle_type<__VA_ARGS__>()); \ - } - -// CAFFE_DECLARE_REGISTRY and CAFFE_DEFINE_REGISTRY are hard-wired to use string -// as the key -// type, because that is the most commonly used cases. -#define CAFFE_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) - -#define CAFFE_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) - -#define CAFFE_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) 
\ - CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) - -#define CAFFE_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) - -// CAFFE_REGISTER_CREATOR and CAFFE_REGISTER_CLASS are hard-wired to use string -// as the key -// type, because that is the most commonly used cases. -#define CAFFE_REGISTER_CREATOR(RegistryName, key, ...) \ - CAFFE_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) - -#define CAFFE_REGISTER_CLASS(RegistryName, key, ...) \ - CAFFE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) - -} // namespace caffe2 -#endif // CAFFE2_CORE_REGISTRY_H_ diff --git a/caffe2/core/registry_test.cc b/caffe2/core/registry_test.cc deleted file mode 100644 index 7ad8ead553463a..00000000000000 --- a/caffe2/core/registry_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include - -#include "caffe2/core/registry.h" -#include -#include "caffe2/core/logging.h" - -namespace caffe2 { -namespace { - -class Foo { - public: - explicit Foo(int x) { LOG(INFO) << "Foo " << x; } -}; - -CAFFE_DECLARE_REGISTRY(FooRegistry, Foo, int); -CAFFE_DEFINE_REGISTRY(FooRegistry, Foo, int); -#define REGISTER_FOO(clsname) \ - CAFFE_REGISTER_CLASS(FooRegistry, clsname, clsname) - -class Bar : public Foo { - public: - explicit Bar(int x) : Foo(x) { LOG(INFO) << "Bar " << x; } -}; -REGISTER_FOO(Bar); - -class AnotherBar : public Foo { - public: - explicit AnotherBar(int x) : Foo(x) { - LOG(INFO) << "AnotherBar " << x; - } -}; -REGISTER_FOO(AnotherBar); - -TEST(RegistryTest, CanRunCreator) { - unique_ptr bar(FooRegistry()->Create("Bar", 1)); - EXPECT_TRUE(bar != nullptr) << "Cannot create bar."; - unique_ptr another_bar(FooRegistry()->Create("AnotherBar", 1)); - EXPECT_TRUE(another_bar != nullptr); -} - -TEST(RegistryTest, ReturnNullOnNonExistingCreator) { - EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr); -} -} -} // namespace caffe2 diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index 86c6827e3039a1..f037ca6e175606 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -350,8 +350,8 @@ _ScopeGuard ScopeGuard(T f) { ##__VA_ARGS__); \ } -#define CAFFE_DURATION(stats, field, ...) \ - if (auto g = detail::ScopeGuard([&](int64_t nanos) { \ - CAFFE_EVENT(stats, field, nanos, ##__VA_ARGS__); \ +#define CAFFE_DURATION(stats, field, ...) 
\ + if (auto g = ::caffe2::detail::ScopeGuard([&](int64_t nanos) { \ + CAFFE_EVENT(stats, field, nanos, ##__VA_ARGS__); \ })) } // namespace caffe2 diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index caa0ba9ea55f49..0e531c83fcb7ad 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -6,8 +6,6 @@ namespace caffe2 { CAFFE_DEFINE_KNOWN_TYPE(Tensor); -UndefinedTensorImpl UndefinedTensorImpl::singleton_; - TensorPrinter::TensorPrinter( const std::string& tensor_name, const std::string& file_name, diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 1e4cac2788b560..bb478e415a8ce6 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -5,34 +5,19 @@ #include "caffe2/core/tensor_impl.h" #include +#include namespace caffe2 { -class CAFFE2_API UndefinedTensorImpl final : public TensorImpl { - UndefinedTensorImpl() : TensorImpl(at::Storage()){}; - - public: - // Without this, we get: - // error: identifier "at::UndefinedTensor::_singleton" is undefined in device code - // (ostensibly because the constexpr tricks MSVC into trying to compile this - // function for device as well). -#ifdef _WIN32 - static inline TensorImpl * singleton() { -#else - static constexpr inline TensorImpl * singleton() { -#endif - return &singleton_; - } - - private: - static UndefinedTensorImpl singleton_; -}; +using at::UndefinedTensorImpl; /** * @brief Tensor class holds a shared pointer to the implementation TensorImpl, * redirects API calls to TensorImpl; * Copying of Tensor results in sharing the same underlying implementation * object + * + * NB: See TensorImpl for documentation on these methods. */ class CAFFE2_API Tensor final { protected: @@ -130,28 +115,52 @@ class CAFFE2_API Tensor final { return impl_.get()->GetStaticContext(); } - std::unique_ptr CreateContext() const { - return impl_.get()->CreateContext(); + DeviceType GetDeviceType() const { + return impl_->device_type(); } - DeviceType GetDeviceType() const { - return impl_.get()->GetDeviceType(); + at::Device GetDevice() const { + return impl_.get()->GetDevice(); } void CopyFrom(const Tensor& src, BaseContext* context = nullptr) const { impl_.get()->CopyFrom(*src.impl_.get(), context); } + /** + * @brief Extend the outer-most dimension of this tensor + * to dimension of `num`. + */ void ExtendTo(int64_t num, float growthPct, BaseContext* context) const { - impl_.get()->ExtendTo(num, growthPct, context); + CAFFE_ENFORCE_GE_WITH_CALLER(impl_->dim(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); + CAFFE_ENFORCE(context != nullptr, "Context must be provided."); + Extend(num - impl_->size(0), growthPct, context); } void Extend(int64_t num, float growthPct, BaseContext* context) const { impl_.get()->Extend(num, growthPct, context); } + /** + * @brief Shrinks the outer-most dimension to given size, keeping the data. + * + * This method guarantees that no re-allocations are carried out, which means + * that the extra capacity after the end of the shrunk tensor is maintained. + * Notably, this function does NOT respect caffe2_keep_on_shrink. 
+ */ void ShrinkTo(int64_t outer_dim) const { - impl_.get()->ShrinkTo(outer_dim); + CAFFE_ENFORCE_WITH_CALLER( + impl_->is_contiguous(), + "Right now ShrinkTo is only supported on contiguous Tensor."); + CAFFE_ENFORCE_WITH_CALLER(impl_->dim() >= 1, "Tensor must be at least 1D"); + CAFFE_ENFORCE_WITH_CALLER( + outer_dim <= impl_->size(0), + "New outer dimension must be smaller than current."); + CAFFE_ENFORCE( + impl_->storage().unique(), + "Can't call ShrinkTo on shared storage, please call Resize instead."); + impl_.get()->set_size(0, outer_dim); } template @@ -164,8 +173,18 @@ class CAFFE2_API Tensor final { impl_.get()->Resize(dim_source...); } + /** + * Resize the tensor like the source tensor. Note that this is just a + * sugar wrapper that essentially calls Resize(src_tensor.dims()). + * This method respects caffe2_keep_on_shrink. + */ inline void ResizeLike(const Tensor& src_tensor) const { - impl_.get()->ResizeLike(*src_tensor.impl_.get()); + CAFFE_ENFORCE_WITH_CALLER( + src_tensor.is_contiguous(), + "Right now ResizeLike is only supported for contiguous Tensor."); + if (impl_ != src_tensor.impl_) { + impl_.get()->Resize(src_tensor.dims()); + } } inline void Reshape(const vector& dims) const { @@ -173,15 +192,27 @@ class CAFFE2_API Tensor final { } inline void Reshape(const vector& dims) const { - impl_.get()->Reshape(dims); + impl_.get()->Reshape(ToVectorint64_t(dims)); } inline void FreeMemory() const { impl_.get()->FreeMemory(); } + /** + * A utility function to print the debug string for the tensor. Note that this + * is very slow since it involves quite some string operations, so do not use + * it in your performance-critical code. + */ string DebugString() const { - return impl_.get()->DebugString(); + std::stringstream ss; + ss << "A Tensor of item size " << impl_->storage().itemsize() << " and type " + << impl_->dtype().name() << " and dimension ("; + for (int d : impl_->sizes()) { + ss << d << ","; + } + ss << ")."; + return ss.str(); } // NB: a.swap(b) is not equivalent to std::swap(a, b); @@ -196,25 +227,42 @@ class CAFFE2_API Tensor final { impl_.get()->ShareData(*src.impl_.get()); } + /** + * @brief Shares the data with an externally managed pointer. + * + * This is similar to ShareData() but the source is a pointer with an advanced + * deleter option. In default, no deletion takes place, and one needs to make + * sure that the external memory is deallocated only after the tensor finishes + * using it. If a Deleter object is passed in, when this tensor is reallocated + * or freed, the deleter function is going to be called. 
+ */ template void ShareExternalPointer( T* src, size_t capacity = 0, MemoryDeleter d = nullptr) const { - impl_.get()->ShareExternalPointer(src, capacity, d); + ShareExternalPointer((void*)src, caffe2::TypeMeta::Make(), capacity, d); } template void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) const { - impl_.get()->ShareExternalPointer(std::move(data_ptr), capacity); + ShareExternalPointer(std::move(data_ptr), caffe2::TypeMeta::Make(), capacity); } void ShareExternalPointer( void* src, - const TypeMeta& meta, + const TypeMeta& data_type, size_t capacity = 0, MemoryDeleter d = nullptr) const { - impl_.get()->ShareExternalPointer(src, meta, capacity, d); + CAFFE_ENFORCE_WITH_CALLER( + impl_->is_contiguous(), + "Right now ShareExternalPointer is only supported for contiguous Tensor."); + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != caffe2::TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + impl_.get()->ShareExternalPointer( + at::DataPtr(src, src, d, impl_->device_type()), data_type, capacity); } void ShareExternalPointer( @@ -224,8 +272,12 @@ class CAFFE2_API Tensor final { impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, capacity); } + /** + * Returns a const raw void* pointer of the underlying storage. mutable_data() + * or raw_mutable_data() must have been called prior to this function call. + */ inline const void* raw_data() const { - return impl_.get()->raw_data(); + return impl_->data(); } template @@ -237,8 +289,22 @@ class CAFFE2_API Tensor final { return impl_.get()->raw_mutable_data(meta); } + /** + * Returns a mutable raw pointer of the underlying storage. This can only be + * used when you know for sure that the underlying storage of the tensor is + * already created via an earlier raw_mutable_data(meta) call or a + * mutable_data() call. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ inline void* raw_mutable_data() const { - return impl_.get()->raw_mutable_data(); + const auto& data_type = impl_->dtype(); + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != caffe2::TypeIdentifier::uninitialized(), + "Calling raw_mutable_data() without meta, but the current meta is " + "of unknown type."); + return raw_mutable_data(data_type); } template @@ -246,20 +312,34 @@ class CAFFE2_API Tensor final { return impl_.get()->mutable_data(); } + /** + * Returns the number of dimensions of the data. + */ inline int ndim() const { - return impl_.get()->ndim(); + return impl_->dim(); } + /** + * Returns the size (i.e. the number of items) of the tensor. + */ inline int64_t size() const { - return impl_.get()->size(); + return impl_->numel(); } + /** + * Return the number of bytes each item takes in the tensor. + */ inline size_t itemsize() const { - return impl_.get()->itemsize(); + return impl_->storage().itemsize(); } + /** + * Returns the total number of bytes of the storage. + * + * This is equivalent to calling size() * itemsize(). 
+ */ inline size_t nbytes() const { - return impl_.get()->nbytes(); + return impl_->numel() * itemsize(); } inline const vector& dims() const { @@ -267,26 +347,37 @@ class CAFFE2_API Tensor final { } inline int64_t size_from_dim(int k) const { - return impl_.get()->size_from_dim(k); + return size_from_dim_(k, impl_->sizes()); } inline int64_t size_to_dim(int k) const { - return impl_.get()->size_to_dim(k); + return size_to_dim_(k, impl_->sizes()); } inline int64_t size_between_dim(int k, int l) const { - return impl_.get()->size_between_dim(k, l); + return size_between_dim_(k, l, impl_->sizes()); } + /** + * Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param axis_index the axis index. + * If 0 <= index < dim(), return index. + * If -ndim <= index <= -1, return (dim() - (-index)), + * e.g., the last axis index (dim() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ inline int canonical_axis_index(int axis_index) const { - return impl_.get()->canonical_axis_index(axis_index); + return canonical_axis_index_(axis_index, impl_->dim()); } inline int64_t stride(int64_t dim) const { return impl_.get()->stride(dim); } - inline at::DimVector strides() { + inline at::IntList strides() { return impl_.get()->strides(); } @@ -294,25 +385,46 @@ class CAFFE2_API Tensor final { return impl_.get()->is_contiguous(); } + /** + * Checks if the tensor content is of the given data type. + */ template inline bool IsType() const { - return impl_.get()->IsType(); + return impl_->storage().IsType(); } + /** + * Returns the TypeMeta object associated with the current data type. + */ inline const TypeMeta& meta() const { - return impl_.get()->meta(); + return impl_->dtype(); } + /** + * Returns the i-th dimension of the tensor in int. + * + * This function returns an int value instead of int64_t, which depending on + * the typedef could be int64. If you want int64 dim values, make sure you + * call dim() instead. 
+ */ inline int dim32(const int i) const { - return impl_.get()->dim32(i); +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, static_cast(impl_->dim()), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + auto s = impl_->size(i); + CAFFE_ENFORCE_LT_WITH_CALLER(s, std::numeric_limits::max()); + return static_cast(s); } inline int64_t dim(const int i) const { - return impl_.get()->dim(i); + return impl_->size(i); } inline void ExtractDeviceOption(DeviceOption* device) const { - return impl_.get()->ExtractDeviceOption(device); + auto* context = GetStaticContext(); + CHECK(context); + context->ExtractDeviceOption(device, impl_->data()); } const Storage& storage() { diff --git a/caffe2/core/tensor_impl.cc b/caffe2/core/tensor_impl.cc index cff98c6101ea5d..dc8d666d6cb3a5 100644 --- a/caffe2/core/tensor_impl.cc +++ b/caffe2/core/tensor_impl.cc @@ -1,5 +1,4 @@ #include "caffe2/core/tensor_impl.h" - #include "caffe2/core/flags.h" CAFFE2_DEFINE_bool( diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 53c812f55e297b..2ee51f655e1e22 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -3,911 +3,13 @@ #include #include #include -#include - -#include "caffe2/core/allocator.h" -#include "caffe2/core/common.h" -#include "caffe2/core/flags.h" -#include "caffe2/core/logging.h" - -// A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to -// keep the memory allocated for its maximum capacity reshaped to so far. -CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); - -// Since we can have high variance in blob memory allocated across different -// inputs in the same run, we will shrink the blob only if the memory gain -// is larger than this flag in bytes. -CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); +#include namespace caffe2 { - -// Defined by protobuf -class DeviceOption; - -/** - * A utility function to convert vector to vector. - */ -inline std::vector ToVectorint64_t(const std::vector& src) { - return std::vector(src.begin(), src.end()); -} - -/** - * Return product of all dimensions starting from k - */ -inline int64_t size_from_dim_(int k, const std::vector& dims) { - int64_t r = 1; - for (size_t i = k; i < dims.size(); ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims up to k (not including dims[k]) -inline int64_t size_to_dim_(int k, const std::vector& dims) { - CAFFE_ENFORCE((unsigned)k <= dims.size()); - int64_t r = 1; - for (int i = 0; i < k; ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims between k and l (not including dims[k] and dims[l]) -inline int64_t size_between_dim_(int k, int l, const std::vector& dims) { - CAFFE_ENFORCE((unsigned)l < dims.size()); - int64_t r = 1; - if (k < l) { - for (int i = k + 1; i < l; ++i) { - r *= dims[i]; - } - } else { - for (int i = l + 1; i < k; ++i) { - r *= dims[i]; - } - } - return r; -} - -// Wrap around axis_index if it is negative, s.t., -1 is the last dim -inline int canonical_axis_index_(int axis_index, int ndims) { - CAFFE_ENFORCE_GE(axis_index, -ndims); - CAFFE_ENFORCE_LT(axis_index, ndims); - if (axis_index < 0) { - return axis_index + ndims; - } - return axis_index; -} - -/** - * @brief TensorImpl is the implementation of a tensor and the basic class - * in Caffe2 that stores a contiguous memory with its shape information. 
- * - * The TensorImpl class is essentially a wrapper around a device-specific memory - * (the device is specified by the Context template argument), and deals with - * the allocation and de-allocation of such memory. We make a simplified - * assumption that the memory is always contiguous. - */ -class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { - public: - TensorImpl() = delete; - - explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) { - data_type_ = storage_ ? storage_.dtype() : TypeMeta{}; - } - - TensorImpl(const TensorImpl&) = default; - TensorImpl& operator=(const TensorImpl&) = default; - TensorImpl(TensorImpl&&) = default; - TensorImpl& operator=(TensorImpl&&) = default; - - virtual ~TensorImpl() noexcept {} - - /* - * Since we removed template from tensor, we now store a static - * context pointer in tensor, which indicates the type of the tensor. - */ - at::BaseStaticContext* GetStaticContext() const { - auto device_type = GetDeviceType(); - return get_static_context(device_type); - } - - /* @brief - * Create a context that has the same device_type - * as the tensor. - * Note that this doesn't support passing in argument - * TODO(jerryzh): move this to a global registry - * that can create context for us - */ - std::unique_ptr CreateContext() const { - return GetStaticContext()->CreateContext(); - } - - at::DeviceType GetDeviceType() const { - return storage_.device_type(); - } - - /** - * @brief Copies the data from a source tensor, with a contex provided to - * carry out the underlying memcpy operation. - */ - void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { - if ((void*)&src == (void*)this) { - return; - } - if (data_type_ != src.meta()) { - CAFFE_ENFORCE_WITH_CALLER( - src.is_contiguous(), - "Right now only copy of contiguous source Tensor is supported."); - storage_ = at::Storage(GetDeviceType(), src.meta()); - data_type_ = src.meta(); - } - if (src.size() == -1) { - dims_.clear(); - numel_ = -1; - strides_.clear(); - is_contiguous_ = true; - storage_.reset(); - data_type_ = TypeMeta(); - return; - } - Resize(src.dims()); - if (size() > 0) { - if (data_type_.copy()) { - CAFFE_ENFORCE( - GetDeviceType() == ::at::DeviceType::CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - CAFFE_ENFORCE( - src.GetDeviceType() == ::at::DeviceType::CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); - } else { - // We'll need to use a non-CPU context to perform the copy if - // one of the context is not CPU since only non-CPU context - // knows how to copy between CPU and that context - if (src.GetDeviceType() != ::at::DeviceType::CPU || GetDeviceType() == ::at::DeviceType::CPU) { - if (!context) { - src.CreateContext()->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } else { - CAFFE_ENFORCE( - context->device_type() == src.GetDeviceType(), - "Type for provided context does not match the type of source"); - context->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } - } else { - // In case source context is CPU, and target context is non-CPU - // We'll have to create a Context from target and perform the - // copy using that context - CreateContext()->CopyBytesFromCPU( - nbytes(), src.raw_data(), raw_mutable_data()); - } - } - } - } - - /** - * @brief Extend the outer-most dimension of this tensor - * to dimension of 
`num`. - */ - void ExtendTo(int64_t num, float growthPct, at::BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); - CAFFE_ENFORCE(context != nullptr, "Context must be provided."); - Extend(num - dims_[0], growthPct, context); - } - - /** - * @brief Extends the outer-most dimension of this tensor by num elements, - * preserving the existing data. - * - * The underlying data may be reallocated in order to accommodate the new - * elements, in which case this tensors' capacity is grown at a factor of - * growthPct. This ensures that Extend runs on an amortized O(1) time - * complexity. - */ - void Extend(int64_t num, float growthPct, at::BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER( - num, 0, "`num` must be non-negative for Extend"); - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now Extend is only supported for contiguous Tensor."); - auto newDims = dims_; - newDims[0] += num; - if (!storage_.data()) { - Resize(newDims); - return; - } - auto newNumel = std::accumulate( - newDims.begin(), - newDims.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - dims_ = newDims; - numel_ = newNumel; - return; - } - auto newCapacity = dims_; - newCapacity[0] = std::max( - newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); - auto oldData = std::move(storage_.data_ptr()); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - auto* newData = raw_mutable_data(data_type_); - CAFFE_ENFORCE( - context != nullptr, "Context must be provided to Extend the tensor"); - context->CopyItemsSameDevice( - data_type_, oldSize, oldData.get(), newData); - reserved_ = true; - dims_ = newDims; - numel_ = newNumel; - } - - /** - * @brief Shrinks the outer-most dimension to given size, keeping the data. - * - * This method guarantees that no re-allocations are carried out, which means - * that the extra capacity after the end of the shurnk tensor is maintained. - */ - void ShrinkTo(int64_t outer_dim) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now ShrinkTo is only supported on contiguous Tensor."); - CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); - CAFFE_ENFORCE_WITH_CALLER( - outer_dim <= dims_[0], - "New outer dimension must be smaller than current."); - CAFFE_ENFORCE( - storage_.unique(), - "Can't call ShrinkTo on shared storage, please call Resize instead."); - dims_[0] = outer_dim; - numel_ = std::accumulate( - dims_.begin(), - dims_.end(), - static_cast(1), - std::multiplies()); - } - - /** - * @brief Reserve space for the underlying tensor. 
- * - * This must be called after Resize(), since we only specify the first - * dimension This does not copy over the old data to the newly allocated space - */ - template - void ReserveSpace(const T& outer_dim) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now ReserveSpace is only supported for contiguous Tensor."); - CAFFE_ENFORCE( - numel_ != -1, "size should be initialized before calling ReserveSpace"); - CAFFE_ENFORCE( - storage_.unique(), "Can't call ReserveSpace on shared storage."); - auto newCapacity = dims_; - newCapacity[0] = outer_dim; - auto newNumel = std::accumulate( - newCapacity.begin(), - newCapacity.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - return; - } - // Old data is discarded - storage_.data_ptr().clear(); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - // Allocate new memory but don't copy over the data - raw_mutable_data(data_type_); - dims_ = oldDims; - numel_ = oldSize; - reserved_ = true; - } - - /** - * @brief Resizes a tensor. - * - * Resize takes in a vector of ints specifying the dimensions of the tensor. - * You can pass in an empty vector to specify that it is a scalar (i.e. - * containing one single item). - * - * The underlying storage may be deleted after calling Resize: if the new - * shape leads to a different number of items in the tensor, the old memory - * is deleted and new memory will be allocated next time you call - * mutable_data(). However, if the shape is different but the total number of - * items is the same, the underlying storage is kept. - */ - template - void Resize(Ts... dim_source) { - bool is_init = numel_ == -1; - bool size_changed = SetDims(dim_source...); - if (size_changed) { - // If needed, we will free the data. the next mutable_data() call - // will create the data storage. - bool reset_tensor = false; - if (reserved_) { - // If tensor is reserved then don't claim its memeory unless capacity() - // is smaller than new size - reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); - } else { - reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || - !FLAGS_caffe2_keep_on_shrink || - storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > - FLAGS_caffe2_max_keep_on_shrink_memory; - } - - if (reset_tensor && !is_init) { - FreeMemory(); - } - } - } - - /** - * Resize the tensor like the source tensor. Note that this is just a - * sugar wrapper that essentially calls Resize(src_tensor.dims()). - */ - inline void ResizeLike(const TensorImpl& src_tensor) { - CAFFE_ENFORCE_WITH_CALLER( - src_tensor.is_contiguous(), - "Right now ResizeLike is only supported for contiguous Tensor."); - // Note: need casting for different context types. - if (static_cast(this) != static_cast(&src_tensor)) { - Resize(src_tensor.dims()); - } - } - - /** - * Resizes the tensor without touching underlying storage. - * This requires the total size of the tensor to remains constant. - */ - inline void Reshape(const std::vector& dims) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now Reshape is only supported for contiguous Tensor."); - int64_t new_size = 1; - for (auto d : dims) { - CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); - new_size *= d; - } - CAFFE_ENFORCE_WITH_CALLER( - new_size == numel_, - "New size and old size are not equal. You cannot use Reshape, " - "but should use Resize." - // TODO(jiayq): remove the following warning after pending diffs - // stabilize. 
- " The old caffe2 mixes Reshape and Resize but this behavior has " - "been changed. If you find this error, most likely you will need " - "to change corresponding code from Reshape to Resize."); - dims_ = dims; - } - - inline void Reshape(const std::vector& dims) { - Reshape(ToVectorint64_t(dims)); - } - - /** - * Release whatever memory the tensor was holding but keep size and type - * information. Subsequent call to mutable_data will trigger new memory - * allocation. - */ - inline void FreeMemory() { - // We'll detach from the old Storage and create a new one - storage_ = at::Storage(storage_.device_type(), data_type_); - storage_offset_ = 0; - } - - /** - * A utility function to print the debug string for the tensor. Note that this - * is very slow since it involves quite some string operations, so do not use - * it in your performance-critical code. - */ - std::string DebugString() const { - std::stringstream ss; - ss << "A Tensor of item size " << storage_.itemsize() << " and type " - << data_type_.name() << " and dimension ("; - for (int d : dims_) { - ss << d << ","; - } - ss << ")."; - return ss.str(); - } - - /** - * @brief Shares the data with another tensor. - * - * To share data between two tensors, the sizes of the two tensors must be - * equal already. The reason we do not implicitly do a Resize to make the two - * tensors have the same shape is that we want to allow tensors of different - * shapes but the same number of items to still be able to share data. This - * allows one to e.g. have a n-dimensional Tensor and a flattened version - * sharing the same underlying storage. - * - * The source tensor should already have its data allocated. - */ - void ShareData(const TensorImpl& src) { - // Right now, we are assuming the device_type are the same, since it is - // inherently the same in the non-templatized code. We should probably add - // an ENFORCE here which might affect perf a little bit. - CAFFE_ENFORCE_EQ_WITH_CALLER( - src.numel_, - numel_, - "Size mismatch - did you call reshape before sharing the data?"); - // It is possible that the source tensor hasn't called mutable_data() yet, - // in which case ShareData() doesn't make much sense since we don't really - // know what to share yet. - CAFFE_ENFORCE_WITH_CALLER( - src.storage_.data() || src.numel_ == 0, - "Source tensor has no content and has size > 0"); - // Finally, do sharing. - /* Since we create new Storage whenever we need to change data_type/capacity - * this still keeps the original semantics - */ - storage_ = src.storage(); - data_type_ = src.dtype(); - storage_offset_ = src.storage_offset(); - } - - /** - * @brief Shares the data with an externally managed pointer. - * - * This is similar to ShareData() but the source is a pointer with an advanced - * deleter option. In default, no deletion takes place, and one needs to make - * sure that the external memory is deallocated only after the tensor finishes - * using it. If a Deleter object is passed in, when this tensor is reallocated - * or freed, the deleter function is going to be called. 
- */ - template - void - ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { - ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); - } - - template - void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { - ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); - } - - void ShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity = 0, - MemoryDeleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now ShareExternalPointer is only supported for contiguos Tensor."); - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - ShareExternalPointer( - at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); - } - - void ShareExternalPointer( - at::DataPtr&& data_ptr, - const TypeMeta& data_type, - size_t capacity) { - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - if (!capacity) { - capacity = numel_ * data_type.itemsize(); - } - if (storage_.unique()) { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "To share data with a raw pointer, you need to set shape first."); - storage_.UniqueStorageShareExternalPointer( - std::move(data_ptr), data_type, capacity); - data_type_ = data_type; - storage_offset_ = 0; - } else { - int64_t numel = capacity / data_type.itemsize(); - // Create a new Storage - storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); - data_type_ = data_type; - storage_offset_ = 0; - } - } - - /** - * Returns a const raw void* pointer of the underlying storage. mutable_data() - * or raw_mutable_data() must have been called prior to this function call. - */ - inline const void* raw_data() const { - CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); - return static_cast(static_cast(storage_.data()) + storage_offset_ * storage_.itemsize()); - } - - /** - * Returns a typed pointer of the underlying storage. mutable_data() or - * raw_mutable_data() must have been called prior to this function call, and - * the data type must be of the correct type. If you want to get a void* - * pointer instead, use raw_data(). - */ - template - inline const T* data() const { - CAFFE_ENFORCE_WITH_CALLER( - storage_.data() || numel_ == 0, - "The tensor is of non-zero shape, but its data is not allocated yet. " - "Caffe2 uses a lazy allocation, so you will need to call " - "mutable_data() or raw_mutable_data() to actually allocate memory."); - CAFFE_ENFORCE_WITH_CALLER( - IsType(), - "Tensor type mismatch, caller expects elements to be ", - TypeMeta::TypeName(), - ", while tensor contains ", - data_type_.name(), - ". "); - return static_cast(storage_.data()) + storage_offset_; - } - - /** - * Returns a mutable raw pointer of the underlying storage. Since we will need - * to know the type of the data for allocation, a TypeMeta object is passed in - * to specify the necessary information. This is conceptually equivalent of - * calling mutable_data() where the TypeMeta parameter meta is derived from - * the type T. This function differs from mutable_data() in the sense that - * the type T can be specified during runtime via the TypeMeta object. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. 
- */ - inline void* raw_mutable_data(const TypeMeta& meta) { - // For 0-size tensors it's fine to return any pointer (including nullptr) - if (data_type_ == meta && (storage_.data() || numel_ == 0)) { - return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); - } else { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "Tensor is not initialized. You probably need to call Resize() " - "before calling mutable_data()"); - bool had_special_dtor = data_type_.dtor() != nullptr; - storage_offset_ = 0; - if (storage_.unique()) { - storage_.set_dtype(meta); - } else { - if (data_type_ != meta) { - storage_ = at::Storage(storage_.device_type(), meta); - } - } - data_type_ = meta; - - // We can reuse the existing buffer if the current data does not have - // a special destructor and the new data doesn't have a special - // constructor. - if (numel_ == 0 || - (meta.ctor() == nullptr && !had_special_dtor && - storage_.numel() >= numel_)) { - AT_ASSERT(storage_offset_ == 0); // because we just reallocated - return storage_.data(); - } - const at::Allocator* allocator = storage_.allocator(); - // TODO: Get rid of StaticContext - CAFFE_ENFORCE( - allocator == nullptr, - "Allocator is not used within Caffe2 functions, please use StaticContext instead."); - if (meta.ctor()) { - // For types that need placement new, we will call it, as well as - // making sure that when the data is freed, it calls the right - // destruction procedure. - auto size = numel_; - auto dtor = data_type_.dtor(); - void* ptr; - at::DeleterFnPtr deleter; - auto ptr_and_deleter = GetStaticContext()->New( - numel_ * storage_.itemsize()); // Removing this can get rid of - // InefficientStdFunctionContext - ptr = ptr_and_deleter.first; - deleter = ptr_and_deleter.second; - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr, - [size, dtor, deleter](void* local_ptr) -> void { - dtor(local_ptr, size); - deleter(local_ptr); - }, - at::Device(storage_.device_type()))); - data_type_.ctor()(storage_.data(), numel_); - } else { - // For fundamental type, new and delete is easier. - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr_and_deleter.first, - ptr_and_deleter.second, - at::Device(storage_.device_type()))); - } - storage_.set_numel(numel_); - AT_ASSERT(storage_offset_ == 0); // because we just reallocated - return storage_.data(); - } - } - - /** - * Returns a mutable raw pointer of the underlying storage. This can only be - * used when you know for sure that the underlying storage of the tensor is - * already created via an earlier raw_mutable_data(meta) call or a - * mutable_data() call. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data() { - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "Calling raw_mutable_data() without meta, but the current meta is " - "of unknown type."); - return raw_mutable_data(data_type_); - } - - /** - * Returns a typed pointer of the underlying storage. - * - * For fundamental types, we reuse possible existing storage if there - * is sufficient capacity. 
- */ - template - inline T* mutable_data() { - if ((numel_ == 0 || storage_.data()) && IsType()) { - return static_cast(storage_.data()) + storage_offset_; - } - // Check it here statically - otherwise TypeMeta would throw the runtime - // error in attempt to invoke TypeMeta::ctor() - static_assert( - std::is_default_constructible::value, - "Tensor can't hold non-default-constructible types"); - return static_cast(raw_mutable_data(TypeMeta::Make())); - } - - /** - * Returns the number of dimensions of the data. - */ - inline int ndim() const { - return dims_.size(); - } - /** - * Returns the size (i.e. the number of items) of the tensor. - */ - inline int64_t size() const { - return numel_; - } - /** - * Return the number of bytes each item takes in the tensor. - */ - inline size_t itemsize() const { - return storage_.itemsize(); - } - /** - * Returns the total number of bytes of the storage. - * - * This is equivalent to calling size() * itemsize(). - */ - inline size_t nbytes() const { - return numel_ * itemsize(); - ; - } - - /** - * Returns the dimensions of the tensor as a vector. - */ - inline const std::vector& dims() const { - return dims_; - } - - inline int64_t size_from_dim(int k) const { - return size_from_dim_(k, dims_); - } - - inline int64_t size_to_dim(int k) const { - return size_to_dim_(k, dims_); - } - - inline int64_t size_between_dim(int k, int l) const { - return size_between_dim_(k, l, dims_); - } - - /** - * Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param axis_index the axis index. - * If 0 <= index < ndim(), return index. - * If -ndim <= index <= -1, return (ndim() - (-index)), - * e.g., the last axis index (ndim() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int canonical_axis_index(int axis_index) const { - return canonical_axis_index_(axis_index, ndim()); - } - - inline int64_t stride(int64_t dim) const { -#ifndef NDEBUG - // TODO: dim wrapping? - CAFFE_ENFORCE_LT_WITH_CALLER(dim, strides_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER( - dim, 0, "Cannot have negative dimension index"); -#endif - return strides_[dim]; - } - - // TODO: Change to ArrayRef later - inline at::DimVector strides() { - return strides_; - } - - inline bool is_contiguous() const { - return is_contiguous_; - } - - /** - * Checks if the tensor content is of the given data type. - */ - template - inline bool IsType() const { - return storage_.IsType(); - } - /** - * Returns the TypeMeta object associated with the current data type. - */ - inline const TypeMeta& meta() const { - return data_type_; - } - - inline const TypeMeta& dtype() const { - return data_type_; - } - - /** - * Returns the i-th dimension of the tensor in int. - * - * This function returns an int value instead of int64_t, which depending on - * the typedef could be int64. If you want int64 dim values, make sure you - * call dim() instead. - */ - inline int dim32(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); - return static_cast(dims_[i]); - } - - /** - * Returns the i-th dimension of the tensor. 
Note that the passed in index - * must be between 0 (inclusive) and the number of dimensions, otherwise - * this function will produce a fatal message. - */ - inline int64_t dim(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - return dims_[i]; - } - - void ExtractDeviceOption(DeviceOption* device) const { - auto* context = GetStaticContext(); - CHECK(context); - context->ExtractDeviceOption(device, raw_data()); - } - - const at::Storage& storage() { - return storage_; - } - - const at::Storage& storage() const { - return storage_; - } - - int64_t storage_offset() const { - return storage_offset_; - } - - protected: - // TODO: change to DimVector - std::vector dims_; // sizes_ - at::DimVector strides_; - int64_t numel_ = -1; // numel_ - bool is_contiguous_ = true; - // we decide to keep reserved_ and it will - // live in Tensor after the split - // The logic is that if Extend() or ReserveSpace() were ever called, - // then subsequent Resize()s will not free up Storage. - bool reserved_ = false; - at::Storage storage_; - int64_t storage_offset_ = 0; - TypeMeta data_type_; - - private: - template < - typename T, - typename = typename std::enable_if::value>::type> - bool SetDims(const std::vector& src) { - auto old_numel = numel_; - dims_.resize(src.size()); - int64_t new_numel = 1; - for (size_t i = 0; i < src.size(); ++i) { - new_numel *= src[i]; - dims_[i] = src[i]; - } - update_strides(); - numel_ = new_numel; - return numel_ != old_numel; - } - - bool SetDims() { - auto old_numel = numel_; - dims_.resize(0); - update_strides(); - numel_ = 1; - return numel_ != old_numel; - } - - // TODO(jiayq): maybe rewrite the following functions with initializer list. - // NVCC does not play well with initializer lists last time, but worth - // another shot. 
- bool SetDims(const int64_t d0) { - auto old_numel = numel_; - dims_.resize(1); - dims_[0] = d0; - update_strides(); - numel_ = d0; - return numel_ != old_numel; - } - - bool SetDims(const int64_t d0, const int64_t d1) { - auto old_numel = numel_; - dims_.resize(2); - dims_[0] = d0; - dims_[1] = d1; - update_strides(); - numel_ = d0 * d1; - return numel_ != old_numel; - } - - bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) { - auto old_numel = numel_; - dims_.resize(3); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - update_strides(); - numel_ = d0 * d1 * d2; - return numel_ != old_numel; - } - - bool - SetDims(const int64_t d0, const int64_t d1, const int64_t d2, const int64_t d3) { - auto old_numel = numel_; - dims_.resize(4); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - dims_[3] = d3; - update_strides(); - numel_ = d0 * d1 * d2 * d3; - return numel_ != old_numel; - } - - inline void update_strides() { - strides_.resize(dims_.size()); - if (ndim() > 0) { - int last_idx = ndim() - 1; - strides_[last_idx] = 1; - for (auto i = last_idx - 1; i >= 0; --i) { - strides_[i] = strides_[i + 1] * std::max(dims_[i + 1], 1); - } - } - is_contiguous_ = true; - } -}; - + using at::ToVectorint64_t; + using at::size_from_dim_; + using at::size_to_dim_; + using at::size_between_dim_; + using at::canonical_axis_index_; + using at::TensorImpl; } diff --git a/caffe2/core/transform.cc b/caffe2/core/transform.cc index 5b3f80fbe3fc0a..549322abccc7da 100644 --- a/caffe2/core/transform.cc +++ b/caffe2/core/transform.cc @@ -10,7 +10,7 @@ namespace caffe2 { using transform::Graph; -CAFFE_DEFINE_REGISTRY(TransformRegistry, Transform); +C10_DEFINE_REGISTRY(TransformRegistry, Transform); std::vector> Transform::PatternMatch(const Graph& graph) { // checks if the node at index i is matched already or not diff --git a/caffe2/core/transform.h b/caffe2/core/transform.h index c6aaf119513847..723e14789d627c 100644 --- a/caffe2/core/transform.h +++ b/caffe2/core/transform.h @@ -150,9 +150,9 @@ class CAFFE2_API Transform { // Creates a Transform based on a key, which should be defined in registry. CAFFE2_API unique_ptr CreateTransform(string key); -CAFFE_DECLARE_REGISTRY(TransformRegistry, Transform); +C10_DECLARE_REGISTRY(TransformRegistry, Transform); #define REGISTER_TRANSFORM(name, ...) \ - CAFFE_REGISTER_CLASS(TransformRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(TransformRegistry, name, __VA_ARGS__) // Create a Transform object from registry, // and immediately apply it to a Netdef. 
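The registry changes in this diff (transform.h/.cc above, and the IDEEP/MKL/GL operator registries further below) replace the deleted caffe2/core/registry.h macros with their c10 counterparts: C10_DECLARE_REGISTRY, C10_DEFINE_REGISTRY, C10_REGISTER_CLASS, and C10_REGISTER_CREATOR. As a reference for the rename, here is a minimal sketch modeled on the deleted registry_test.cc; it assumes the C10 macros keep the same string-keyed, unique_ptr-returning shape as the CAFFE_* macros they replace, and the Foo/Bar names are illustrative only, not part of this change.

```cpp
#include <memory>

#include "c10/util/Registry.h"  // replacement for caffe2/core/registry.h

namespace caffe2 {

class Foo {
 public:
  explicit Foo(int x) : x_(x) {}
  virtual ~Foo() = default;
  int x_;
};

// Declare + define a registry keyed by std::string that creates
// std::unique_ptr<Foo> from an int argument (mirrors the old
// CAFFE_DECLARE_REGISTRY / CAFFE_DEFINE_REGISTRY pair).
C10_DECLARE_REGISTRY(FooRegistry, Foo, int);
C10_DEFINE_REGISTRY(FooRegistry, Foo, int);

class Bar : public Foo {
 public:
  explicit Bar(int x) : Foo(x) {}
};

// Registration happens at static initialization time, exactly as before.
C10_REGISTER_CLASS(FooRegistry, Bar, Bar);

} // namespace caffe2

int main() {
  // Creator lookup by key; unknown keys yield nullptr rather than throwing.
  std::unique_ptr<caffe2::Foo> bar = caffe2::FooRegistry()->Create("Bar", 1);
  std::unique_ptr<caffe2::Foo> missing =
      caffe2::FooRegistry()->Create("DoesNotExist", 1);
  return (bar != nullptr && missing == nullptr) ? 0 : 1;
}
```

Create() returning nullptr for unregistered keys is the behavior the deleted ReturnNullOnNonExistingCreator test relied on, and it is unchanged by the move to c10.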
diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h index 2ad486c328f56d..324766359de607 100644 --- a/caffe2/core/workspace.h +++ b/caffe2/core/workspace.h @@ -11,8 +11,8 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" -#include "caffe2/core/registry.h" #include "caffe2/core/net.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/signal_handler.h" diff --git a/caffe2/ideep/operators/momentum_sgd_op.cc b/caffe2/ideep/operators/momentum_sgd_op.cc new file mode 100644 index 00000000000000..320780c12ffe1d --- /dev/null +++ b/caffe2/ideep/operators/momentum_sgd_op.cc @@ -0,0 +1,125 @@ +#include + +namespace caffe2 { + +void momentum_sgd_update( + const int N, + const float* g, + const float* m, + float* ng, + float* nm, + const float* lr, + const float momentum, + const bool nesterov, + float* param) { + const float LR = lr[0]; +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (auto i = 0; i < N; ++i) { + if (!nesterov) { + const float adjusted_gradient = LR * g[i] + momentum * m[i]; + nm[i] = adjusted_gradient; + ng[i] = adjusted_gradient; + } else { + const float mi = m[i]; + const float mi_new = momentum * mi + LR * g[i]; + nm[i] = mi_new; + ng[i] = (1 + momentum) * mi_new - momentum * mi; + } + + if (param) { + param[i] -= ng[i]; + } + } +} + +class IDEEPMomentumSGDOp final : public IDEEPOperator { + public: + USE_IDEEP_DEF_ALIASES(); + USE_IDEEP_OPERATOR_FUNCTIONS(); + + IDEEPMomentumSGDOp(const OperatorDef& operator_def, Workspace* ws) + : IDEEPOperator(operator_def, ws), + momentum_(OperatorBase::GetSingleArgument("momentum", 0.0)), + nesterov_(OperatorBase::GetSingleArgument("nesterov", 0)) {} + + bool RunOnDevice() override { + CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems()); + if (Input(GRAD) != *Output(OUTPUT_GRAD)) { + Output(OUTPUT_GRAD)->reinit(Input(GRAD).get_descriptor()); + } + if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) { + Output(OUTPUT_MOMENTUM)->reinit(Input(MOMENTUM).get_descriptor()); + } + + // TODO: Use itensor after 0-dim is supported. Now use CPU tensor. + const auto& lr = OperatorBase::Input(LR, CPU); + CAFFE_ENFORCE(lr.size() == 1); + + momentum_sgd_update( + Input(GRAD).get_nelems(), + static_cast(Input(GRAD).get_data_handle()), + static_cast(Input(MOMENTUM).get_data_handle()), + static_cast(Output(OUTPUT_GRAD)->get_data_handle()), + static_cast(Output(OUTPUT_MOMENTUM)->get_data_handle()), + lr.template data(), + momentum_, + nesterov_, + nullptr); + return true; + } + + protected: + float momentum_{0.9}; + bool nesterov_; + INPUT_TAGS(GRAD, MOMENTUM, LR); + OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM); +}; + +class IDEEPMomentumSGDUpdateOp final : public IDEEPOperator { + public: + USE_IDEEP_DEF_ALIASES(); + USE_IDEEP_OPERATOR_FUNCTIONS(); + IDEEPMomentumSGDUpdateOp(const OperatorDef& operator_def, Workspace* ws) + : IDEEPOperator(operator_def, ws), + momentum_(OperatorBase::GetSingleArgument("momentum", 0.0)), + nesterov_(OperatorBase::GetSingleArgument("nesterov", 0)) {} + + bool RunOnDevice() override { + CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems()); + if (Input(GRAD) != *Output(OUTPUT_GRAD)) { + Output(OUTPUT_GRAD)->reinit(Input(GRAD).get_descriptor()); + } + if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) { + Output(OUTPUT_MOMENTUM)->reinit(Input(MOMENTUM).get_descriptor()); + } + + // TODO: Use itensor after 0-dim is supported. Now use CPU tensor. 
+ const auto& lr = OperatorBase::Input(LR, CPU); + CAFFE_ENFORCE(lr.size() == 1); + + momentum_sgd_update( + Input(GRAD).get_nelems(), + static_cast(Input(GRAD).get_data_handle()), + static_cast(Input(MOMENTUM).get_data_handle()), + static_cast(Output(OUTPUT_GRAD)->get_data_handle()), + static_cast(Output(OUTPUT_MOMENTUM)->get_data_handle()), + lr.template data(), + momentum_, + nesterov_, + static_cast(Output(OUTPUT_PARAM)->get_data_handle())); + return true; + } + + protected: + float momentum_{0.9}; + bool nesterov_; + INPUT_TAGS(GRAD, MOMENTUM, LR, PARAM); + OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM, OUTPUT_PARAM); +}; + +REGISTER_IDEEP_OPERATOR(MomentumSGD, IDEEPMomentumSGDOp); +REGISTER_IDEEP_OPERATOR(MomentumSGDUpdate, IDEEPMomentumSGDUpdateOp); + +} // namespace caffe2 diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 3226a08c4af9cf..0292dbd5d5a637 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -36,7 +36,7 @@ namespace caffe2 { * IDEEPFallbackOp>); */ template > -class IDEEPFallbackOp final : public IDEEPOperator { +class C10_EXPORT IDEEPFallbackOp final : public IDEEPOperator { public: USE_IDEEP_DEF_ALIASES(); USE_IDEEP_OPERATOR_FUNCTIONS(); diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index f50a4f34c66789..087078c507d164 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -20,6 +20,8 @@ class IDEEPContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_IDEEP); } + explicit IDEEPContext(const at::Device& device) + : IDEEPContext(DeviceToOption(device)) {} ~IDEEPContext() noexcept override {} @@ -178,15 +180,6 @@ class IDEEPStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return IDEEP; } diff --git a/caffe2/ideep/utils/ideep_operator.h b/caffe2/ideep/utils/ideep_operator.h index 5cccbb509725c2..f9b6a831061388 100644 --- a/caffe2/ideep/utils/ideep_operator.h +++ b/caffe2/ideep/utils/ideep_operator.h @@ -6,21 +6,21 @@ namespace caffe2 { -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( IDEEPOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_IDEEP_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(IDEEPOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(IDEEPOperatorRegistry, key, __VA_ARGS__) #define REGISTER_IDEEP_OPERATOR(name, ...) \ - CAFFE_REGISTER_CLASS(IDEEPOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(IDEEPOperatorRegistry, name, __VA_ARGS__) #define REGISTER_IDEEP_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(IDEEPOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(IDEEPOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_IDEEP_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(IDEEPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(IDEEPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) // IDEEPOperator is the base scaffolding of the operators that uses IDEEP. It // provides a few operators that are useful to IDEEP specific implementations. 
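The IDEEPMomentumSGDOp and IDEEPMomentumSGDUpdateOp operators added above both delegate to the plain momentum_sgd_update() helper defined at the top of momentum_sgd_op.cc. As a standalone restatement of the update rule that helper implements (classic momentum when nesterov is false, Nesterov momentum otherwise), here is a single-parameter sketch; the concrete numbers are illustrative only, and this is not the operator code itself.

```cpp
#include <cassert>
#include <cmath>

int main() {
  // Hypothetical inputs for a single parameter (illustrative values only).
  const float lr = 0.1f;        // learning rate, lr[0] in the operator
  const float momentum = 0.9f;  // "momentum" argument
  const float g = 2.0f;         // incoming gradient g[i]
  const float m = 0.5f;         // previous momentum buffer m[i]
  const float w = 1.0f;         // parameter param[i]

  // nesterov == false: nm[i] = ng[i] = lr * g[i] + momentum * m[i]
  const float m_std = lr * g + momentum * m;  // 0.65
  const float w_std = w - m_std;              // 0.35 after param[i] -= ng[i]

  // nesterov == true:
  //   nm[i] = momentum * m[i] + lr * g[i]
  //   ng[i] = (1 + momentum) * nm[i] - momentum * m[i]
  const float m_nag = momentum * m + lr * g;                     // 0.65
  const float g_nag = (1.0f + momentum) * m_nag - momentum * m;  // 0.785
  const float w_nag = w - g_nag;                                 // 0.215

  assert(std::fabs(m_std - 0.65f) < 1e-5f);
  assert(std::fabs(w_std - 0.35f) < 1e-5f);
  assert(std::fabs(g_nag - 0.785f) < 1e-5f);
  assert(std::fabs(w_nag - 0.215f) < 1e-5f);
  return 0;
}
```

The only behavioral difference between the two operators is that MomentumSGDUpdate also applies the adjusted gradient to the PARAM input in place (the param[i] -= ng[i] branch), whereas MomentumSGD passes nullptr for param and leaves the parameter untouched.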
@@ -51,7 +51,10 @@ class IDEEPOperator : public OperatorBase { // FinishDeviceComputation, // it is always just a re-route to RunOnDevice(). try { - return RunOnDevice(); + StartAllObservers(); + bool result = RunOnDevice(); + StopAllObservers(); + return result; } catch (EnforceNotMet& err) { err.AppendMessage(getErrorMsg()); throw; diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc index 020e22fa6143ed..a0b80f8a8e401c 100644 --- a/caffe2/ideep/utils/ideep_register.cc +++ b/caffe2/ideep/utils/ideep_register.cc @@ -4,11 +4,14 @@ #include #include "ideep_context.h" +namespace at { +REGISTER_CONTEXT(DeviceType::IDEEP, caffe2::IDEEPContext); +} // namespace at namespace caffe2 { CAFFE_KNOWN_TYPE(ideep::tensor); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( IDEEPOperatorRegistry, OperatorBase, const OperatorDef&, @@ -27,7 +30,7 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU); REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU); REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU); -BaseStaticContext* GetIDEEPStaticContext() { +C10_EXPORT BaseStaticContext* GetIDEEPStaticContext() { static IDEEPStaticContext context; return &context; } diff --git a/caffe2/mkl/mkl_operator.cc b/caffe2/mkl/mkl_operator.cc index bf5b460d0920be..8fba56da8474d6 100644 --- a/caffe2/mkl/mkl_operator.cc +++ b/caffe2/mkl/mkl_operator.cc @@ -9,7 +9,7 @@ CAFFE2_DEFINE_bool( namespace caffe2 { -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( MKLOperatorRegistry, OperatorBase, const OperatorDef&, diff --git a/caffe2/mkl/utils/mkl_context.cc b/caffe2/mkl/utils/mkl_context.cc index 6e9075df43475f..8c66bc111282ac 100644 --- a/caffe2/mkl/utils/mkl_context.cc +++ b/caffe2/mkl/utils/mkl_context.cc @@ -3,6 +3,10 @@ #include "mkl_context.h" #include "caffe2/core/event_cpu.h" +namespace at { + +REGISTER_CONTEXT(DeviceType::MKLDNN, caffe2::MKLContext); +} // namespace at namespace caffe2 { // MKL events are the same as CPU events diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 0a7b5808a446be..8364026d91c651 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -29,6 +29,8 @@ class MKLContext : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_MKLDNN); } + explicit MKLContext(const at::Device& device) + : MKLContext(DeviceToOption(device)) {} ~MKLContext() override {} @@ -155,15 +157,6 @@ class MKLStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return MKLDNN; } diff --git a/caffe2/mkl/utils/mkl_memory.h b/caffe2/mkl/utils/mkl_memory.h index 736d8ede8cf53d..ffa1899197f2ba 100644 --- a/caffe2/mkl/utils/mkl_memory.h +++ b/caffe2/mkl/utils/mkl_memory.h @@ -148,7 +148,7 @@ class LayoutWrapper { * Most of the MKLMemory functions are not thread safe. */ template -class MKLMemory { +class C10_EXPORT MKLMemory { public: // Initializes an empty MKLMemory. 
MKLMemory() {} @@ -460,7 +460,7 @@ class MKLMemory { return dims_; } - inline const int ndim() const { return dims_.size(); } + inline int ndim() const { return dims_.size(); } inline int dim32(const int i) const { CAFFE_ENFORCE_LT(dims_.at(i), std::numeric_limits::max()); diff --git a/caffe2/mkl/utils/mkl_operator.h b/caffe2/mkl/utils/mkl_operator.h index 2236e9267af542..0f028fbfaa8c01 100644 --- a/caffe2/mkl/utils/mkl_operator.h +++ b/caffe2/mkl/utils/mkl_operator.h @@ -10,20 +10,20 @@ CAFFE2_DECLARE_bool(caffe2_mkl_memonger_in_use); namespace caffe2 { -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( MKLOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_MKL_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(MKLOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(MKLOperatorRegistry, key, __VA_ARGS__) #define REGISTER_MKL_OPERATOR(name, ...) \ - CAFFE_REGISTER_CLASS(MKLOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(MKLOperatorRegistry, name, __VA_ARGS__) #define REGISTER_MKL_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(MKLOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(MKLOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_MKL_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(MKLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(MKLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) namespace mkl { // MKLOperator is the base scaffolding of the operators that uses MKLDNN. It diff --git a/caffe2/mobile/contrib/arm-compute/core/net_gl.h b/caffe2/mobile/contrib/arm-compute/core/net_gl.h index 1dc93dedc3fff3..48a47ff87f3351 100644 --- a/caffe2/mobile/contrib/arm-compute/core/net_gl.h +++ b/caffe2/mobile/contrib/arm-compute/core/net_gl.h @@ -3,10 +3,10 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/mobile/contrib/arm-compute/core/operator.cc b/caffe2/mobile/contrib/arm-compute/core/operator.cc index bd4337aa85e7cf..cddd0b0129c6a0 100644 --- a/caffe2/mobile/contrib/arm-compute/core/operator.cc +++ b/caffe2/mobile/contrib/arm-compute/core/operator.cc @@ -2,8 +2,11 @@ namespace caffe2 { -CAFFE_DEFINE_REGISTRY(GLOperatorRegistry, OperatorBase, const OperatorDef &, - Workspace *); +C10_DEFINE_REGISTRY( + GLOperatorRegistry, + OperatorBase, + const OperatorDef&, + Workspace*); CAFFE_REGISTER_DEVICE_TYPE(DeviceType::OPENGL, GLOperatorRegistry); } // namespace caffe2 diff --git a/caffe2/mobile/contrib/arm-compute/core/operator.h b/caffe2/mobile/contrib/arm-compute/core/operator.h index 037173054f7715..4df78c7734b849 100644 --- a/caffe2/mobile/contrib/arm-compute/core/operator.h +++ b/caffe2/mobile/contrib/arm-compute/core/operator.h @@ -1,26 +1,29 @@ #ifndef CAFFE2_OPENGL_OPERATOR_H_ #define CAFFE2_OPENGL_OPERATOR_H_ +#include "c10/util/Registry.h" #include "caffe2/core/operator.h" -#include "caffe2/core/registry.h" namespace caffe2 { -CAFFE_DECLARE_REGISTRY(GLOperatorRegistry, OperatorBase, const OperatorDef &, - Workspace *); -#define REGISTER_GL_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(GLOperatorRegistry, key, __VA_ARGS__) -#define REGISTER_GL_OPERATOR(name, ...) 
\ - extern void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ - static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_GL##name() { \ - CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ - } \ - CAFFE_REGISTER_CLASS(GLOperatorRegistry, name, __VA_ARGS__) -#define REGISTER_GL_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(GLOperatorRegistry, str_name, __VA_ARGS__) +C10_DECLARE_REGISTRY( + GLOperatorRegistry, + OperatorBase, + const OperatorDef&, + Workspace*); +#define REGISTER_GL_OPERATOR_CREATOR(key, ...) \ + C10_REGISTER_CREATOR(GLOperatorRegistry, key, __VA_ARGS__) +#define REGISTER_GL_OPERATOR(name, ...) \ + extern void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_GL##name() { \ + CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + } \ + C10_REGISTER_CLASS(GLOperatorRegistry, name, __VA_ARGS__) +#define REGISTER_GL_OPERATOR_STR(str_name, ...) \ + C10_REGISTER_TYPED_CLASS(GLOperatorRegistry, str_name, __VA_ARGS__) -#define REGISTER_GL_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(GLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) +#define REGISTER_GL_OPERATOR_WITH_ENGINE(name, engine, ...) \ + C10_REGISTER_CLASS(GLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) } // namespace caffe2 diff --git a/caffe2/onnx/torch_ops/CMakeLists.txt b/caffe2/onnx/torch_ops/CMakeLists.txt new file mode 100644 index 00000000000000..99443af4cc9bc6 --- /dev/null +++ b/caffe2/onnx/torch_ops/CMakeLists.txt @@ -0,0 +1,5 @@ +# ---[ Extra onnx files. +file(GLOB ONNX_SRCS *.cc) + +# ---[ Send the lists to the parent scope. +set(ONNX_SRCS ${ONNX_SRCS} PARENT_SCOPE) diff --git a/caffe2/onnx/torch_ops/constants.h b/caffe2/onnx/torch_ops/constants.h new file mode 100644 index 00000000000000..ebd2a2464d9b33 --- /dev/null +++ b/caffe2/onnx/torch_ops/constants.h @@ -0,0 +1,7 @@ +namespace ONNX_NAMESPACE { + +const int AI_ONNX_PYTORCH_DOMAIN_MIN_OPSET = 1; +const int AI_ONNX_PYTORCH_DOMAIN_MAX_OPSET = 1; +constexpr const char* AI_ONNX_PYTORCH_DOMAIN = "ai.onnx.pytorch"; + +} // namespace ONNX_NAMESPACE diff --git a/caffe2/onnx/torch_ops/defs.cc b/caffe2/onnx/torch_ops/defs.cc new file mode 100644 index 00000000000000..8d03120af03557 --- /dev/null +++ b/caffe2/onnx/torch_ops/defs.cc @@ -0,0 +1,24 @@ +// Copyright (c) Facebook Inc. and Microsoft Corporation. +// Licensed under the MIT license. + +#include "./schema.h" + +namespace ONNX_NAMESPACE { + +static const char* dummy_test_only_ver1_doc = R"DOC( +A dummy op for verifying the build setup works, don't use me. 
+)DOC"; + +ONNX_PYTORCH_OPERATOR_SET_SCHEMA( + DUMMY_TEST_ONLY, + 1, + OpSchema() + .SetDoc(dummy_test_only_ver1_doc) + .Input(0, "input", "Input tensor", "T") + .Output(0, "output", "Output tensor", "T") + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.")); + +} // namespace ONNX_NAMESPACE diff --git a/caffe2/onnx/torch_ops/operator_sets.h b/caffe2/onnx/torch_ops/operator_sets.h new file mode 100644 index 00000000000000..760a6b7fa2a7b6 --- /dev/null +++ b/caffe2/onnx/torch_ops/operator_sets.h @@ -0,0 +1,22 @@ +#pragma once + +#include "onnx/defs/schema.h" + +namespace ONNX_NAMESPACE { + +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(PyTorch, 1, DUMMY_TEST_ONLY); + +// Iterate over schema from ai.onnx.pytorch domain opset 1 +class OpSet_PyTorch_ver1 { + public: + static void ForEachSchema(std::function fn) { + fn(GetOpSchema()); + } +}; + +inline void RegisterPyTorchOperatorSetSchema() { + RegisterOpSetSchema(); +} + +} // namespace ONNX_NAMESPACE diff --git a/caffe2/onnx/torch_ops/schema.cc b/caffe2/onnx/torch_ops/schema.cc new file mode 100644 index 00000000000000..de933c2c23ab2e --- /dev/null +++ b/caffe2/onnx/torch_ops/schema.cc @@ -0,0 +1,17 @@ +#include "./schema.h" +#include "./operator_sets.h" + +namespace { +using namespace ONNX_NAMESPACE; +class PyTorchSchemasRegisterer { + public: + PyTorchSchemasRegisterer() { + OpSchemaRegistry::DomainToVersionRange::Instance().AddDomainToVersion( + AI_ONNX_PYTORCH_DOMAIN, + AI_ONNX_PYTORCH_DOMAIN_MIN_OPSET, + AI_ONNX_PYTORCH_DOMAIN_MAX_OPSET); + RegisterPyTorchOperatorSetSchema(); + } +}; +static PyTorchSchemasRegisterer registerer{}; +} // namespace diff --git a/caffe2/onnx/torch_ops/schema.h b/caffe2/onnx/torch_ops/schema.h new file mode 100644 index 00000000000000..3454e366a1eeba --- /dev/null +++ b/caffe2/onnx/torch_ops/schema.h @@ -0,0 +1,8 @@ +#pragma once + +#include "./constants.h" +#include "onnx/defs/schema.h" + +#define ONNX_PYTORCH_OPERATOR_SET_SCHEMA(name, ver, impl) \ + ONNX_OPERATOR_SET_SCHEMA_EX( \ + name, PyTorch, AI_ONNX_PYTORCH_DOMAIN, ver, false, impl) diff --git a/caffe2/operators/batch_gather_ops.h b/caffe2/operators/batch_gather_ops.h index 2b9e4d6d5e6ecc..07ee6187443a97 100644 --- a/caffe2/operators/batch_gather_ops.h +++ b/caffe2/operators/batch_gather_ops.h @@ -35,31 +35,52 @@ class BatchGatherOp final : public Operator { auto block_size = data.size_from_dim(2); auto block_bytesize = block_size * data.meta().itemsize(); auto N = indices.size(); - auto data_batch_bytesize = data.size_from_dim(1) * data.meta().itemsize(); - auto gathered_batch_bytesize = - N * data.size_from_dim(2) * data.meta().itemsize(); + auto data_batch_size = data.size_from_dim(1); + auto gathered_batch_size = N * data.size_from_dim(2); + auto data_batch_bytesize = data_batch_size * data.meta().itemsize(); + auto gathered_batch_bytesize = gathered_batch_size * data.meta().itemsize(); const TInd* idxs = indices.template data(); auto src_base = static_cast(data.raw_data()); auto out = static_cast(output->raw_mutable_data(data.meta())); - for (auto batch = 0; batch < data.dim(0); ++batch) { - for (auto i = 0; i < N; ++i) { - auto idx = idxs[i]; - CAFFE_ENFORCE( - 0 <= idx && idx < data.dim(1), - "INDICES element is out of DATA bounds, id=", - idx, - " data_dim=", - data.dim(1)); - auto src = - src_base + idx * block_bytesize + batch * data_batch_bytesize; - auto dst = out + i * block_bytesize + batch * gathered_batch_bytesize; - 
context_.CopyItemsSameDevice(data.meta(), block_size, src, dst); + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + CAFFE_ENFORCE( + 0 <= idx && idx < data.dim(1), + "INDICES element is out of DATA bounds, id=", + idx, + " data_dim=", + data.dim(1)); + } + + if (data.template IsType() && block_size == 1) { + auto src = data.template data(); + auto dst = output->template mutable_data(); + + for (auto batch = 0; batch < data.dim(0); ++batch) { + auto src_batch_base = src + batch * data_batch_size; + auto out_batch_base = dst + batch * gathered_batch_size; + + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + out_batch_base[i] = src_batch_base[idx]; + } + } + } else { + for (auto batch = 0; batch < data.dim(0); ++batch) { + auto src_batch_base = src_base + batch * data_batch_bytesize; + auto out_batch_base = out + batch * gathered_batch_bytesize; + + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + auto src = src_batch_base + idx * block_bytesize; + auto dst = out_batch_base + i * block_bytesize; + context_.CopyItemsSameDevice(data.meta(), block_size, src, dst); + } } } return true; } - INPUT_TAGS(DATA, INDICES); }; @@ -108,21 +129,32 @@ class BatchGatherGradientOp final : public Operator { auto gathered_batch_size = N * data.size_from_dim(2); const TInd* idxs = indices.template data(); + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + CAFFE_ENFORCE( + 0 <= idx && idx < data.dim(1), + "INDICES element is out of DATA bounds, id=", + idx, + " data_dim=", + data.dim(1)); + } + for (auto batch = 0; batch < grad.dim(0); ++batch) { + auto src_batch_base = grad_data + batch * gathered_batch_size; + auto out_batch_base = out_data + batch * data_batch_size; + for (auto i = 0; i < N; ++i) { auto idx = idxs[i]; - CAFFE_ENFORCE( - 0 <= idx && idx < data.dim(1), - "INDICES element is out of DATA bounds, id=", - idx, - " data_dim=", - data.dim(1)); - math::Add( - block_size, - out_data + idx * block_size + batch * data_batch_size, - grad_data + i * block_size + batch * gathered_batch_size, - out_data + idx * block_size + batch * data_batch_size, - &context_); + if (block_size == 1) { + out_batch_base[idx * block_size] += src_batch_base[i * block_size]; + } else { + math::Add( + block_size, + out_batch_base + idx * block_size, + src_batch_base + i * block_size, + out_batch_base + idx * block_size, + &context_); + } } } return true; diff --git a/caffe2/operators/crf_viterbi_op.cc b/caffe2/operators/crf_viterbi_op.cc new file mode 100644 index 00000000000000..39a5391d735fcd --- /dev/null +++ b/caffe2/operators/crf_viterbi_op.cc @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include "caffe2/core/blob_serialization.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/utils/eigen_utils.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { +namespace { + +void RowwiseMaxAndArg( + const float* mat, + int32_t N, + int32_t D, + float* rowMax, + int32_t* argMax) { + auto eigenMat = ConstEigenMatrixMap(mat, D, N); + for (auto i = 0; i < D; i++) { + // eigenMat.row(i) is equivalent to column i in mat + rowMax[i] = eigenMat.row(i).maxCoeff(argMax + i); + } +} +void ColwiseMaxAndArg( + const float* mat, + int32_t N, + int32_t D, + float* colMax, + int32_t* argMax) { + auto eigenMat = ConstEigenMatrixMap(mat, D, N); + for (auto i = 0; i < N; i++) { + // eigenMat.col(i) is equivalent to row i in mat + colMax[i] = eigenMat.col(i).maxCoeff(argMax + i); + } +} + +class ViterbiPathOp : public Operator { + public: + ViterbiPathOp(const OperatorDef& 
operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + void GatherRow( + const TensorCPU& data, + int32_t rowIndex, + int32_t block_size, + int32_t block_bytesize, + TensorCPU* outRow) { + CAFFE_ENFORCE( + 0 <= rowIndex && rowIndex < data.dim(0), + "rowIndex is out of DATA bounds"); + auto out = static_cast(outRow->raw_mutable_data(data.meta())); + auto src_base = static_cast(data.raw_data()); + auto src = src_base + rowIndex * block_bytesize; + context_.CopyItemsSameDevice(data.meta(), block_size, src, out); + } + + void + AddColToMat(const TensorCPU& mat, const TensorCPU& col, TensorCPU* result) { + float* resultData = result->template mutable_data(); + const float* colData = col.template data(); + // Initialize the columns of the result to be = the input col + for (auto i = 0; i < result->dim32(1); i++) { + for (auto j = 0; j < result->dim32(0); j++) { + resultData[i * result->dim32(0) + j] = colData[i]; + } + } + // Element-wise add of the result and the input matrix + math::Add( + mat.size(), + resultData, + mat.template data(), + resultData, + &context_); + } + + bool RunOnDevice() override { + auto& predictions = Input(0); + auto& transitions = Input(1); + auto* viterbiPath = Output(0); + + CAFFE_ENFORCE( + predictions.ndim() == 2 && transitions.ndim() == 2, + "Predictions and transitions hould 2D matrices"); + + CAFFE_ENFORCE( + predictions.dim(1) == transitions.dim(0), + "Predictions and transitions dimensions not matching"); + + auto seqLen = predictions.dim32(0); + + viterbiPath->Resize(seqLen); + auto block_size = predictions.size() / predictions.dim(0); + auto block_bytesize = + predictions.size_from_dim(1) * predictions.meta().itemsize(); + Tensor backpointers(CPU); + backpointers.ResizeLike(predictions); + + Tensor trellis(std::vector{block_size}, CPU); + Tensor dpMat(CPU); + dpMat.ResizeLike(transitions); + Tensor dpMax(std::vector{block_size}, CPU); + GatherRow(predictions, 0, block_size, block_bytesize, &trellis); + for (auto i = 1; i < seqLen; i++) { + AddColToMat(transitions, trellis, &dpMat); + RowwiseMaxAndArg( + dpMat.template data(), + dpMat.dim(0), + dpMat.dim(1), + dpMax.template mutable_data(), + backpointers.template mutable_data() + (i * block_size)); + + GatherRow(predictions, i, block_size, block_bytesize, &trellis); + math::Add( + trellis.size(), + trellis.template data(), + dpMax.template data(), + trellis.template mutable_data(), + &context_); + } + + Tensor tMax(std::vector{1}, CPU); + Tensor tArgMax(std::vector{1}, CPU); + ColwiseMaxAndArg( + trellis.template data(), + 1, + trellis.size(), + tMax.template mutable_data(), + tArgMax.template mutable_data()); + + std::vector viterbiVec; + viterbiVec.push_back(tArgMax.template data()[0]); + Tensor bpEntry(std::vector{block_size}, CPU); + block_bytesize = + backpointers.size_from_dim(1) * backpointers.meta().itemsize(); + for (auto i = seqLen - 1; i > 0; i--) { + GatherRow(backpointers, i, block_size, block_bytesize, &bpEntry); + viterbiVec.push_back(bpEntry.template data()[viterbiVec.back()]); + } + std::reverse_copy( + viterbiVec.begin(), + viterbiVec.end(), + viterbiPath->template mutable_data()); + return true; + } +}; +class SwapBestPathOp : public Operator { + public: + SwapBestPathOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + bool RunOnDevice() override { + auto& data = Input(0); + auto& newBestIdicies = Input(1); + auto* updatedData = Output(0); + + CAFFE_ENFORCE( + data.ndim() == 2 && newBestIdicies.ndim() == 1, + "predictions should be a 2D matrix 
and bestPath should be a 1D vector");
+
+ CAFFE_ENFORCE(
+ data.dim(0) == newBestIdicies.dim(0),
+ "predictions and bestPath dimensions not matching");
+
+ updatedData->ResizeLike(data);
+ float* outData = updatedData->template mutable_data();
+ context_.CopyItemsSameDevice(
+ data.meta(), data.size(), data.template data(), outData);
+
+ Tensor bestScores(CPU);
+ bestScores.ResizeLike(newBestIdicies);
+ Tensor oldBestIndices(CPU);
+ oldBestIndices.ResizeLike(newBestIdicies);
+
+ ColwiseMaxAndArg(
+ data.template data(),
+ data.dim(0),
+ data.dim(1),
+ bestScores.template mutable_data(),
+ oldBestIndices.template mutable_data());
+
+ auto block_size = data.size() / data.dim(0);
+
+ const int32_t* oldBestIdx = oldBestIndices.template data();
+ const int32_t* newIdx = newBestIdicies.template data();
+
+ for (auto i = 0; i < data.dim32(0); i++) {
+ std::swap(
+ outData[i * block_size + newIdx[i]],
+ outData[i * block_size + oldBestIdx[i]]);
+ }
+ return true;
+ }
+};
+REGISTER_CPU_OPERATOR(ViterbiPath, ViterbiPathOp);
+OPERATOR_SCHEMA(ViterbiPath)
+ .NumInputs(2)
+ .NumOutputs(1)
+ .SetDoc(R"DOC(
+Given a predictions matrix and a transitions matrix, get the path with the best
+score
+)DOC")
+ .Input(0, "predictions", "N*D predictions matrix")
+ .Input(1, "transitions", "D*D transitions matrix")
+ .Output(0, "viterbi_path", "N*1 vector holds the best path indices");
+NO_GRADIENT(ViterbiPath);
+REGISTER_CPU_OPERATOR(SwapBestPath, SwapBestPathOp);
+OPERATOR_SCHEMA(SwapBestPath)
+ .NumInputs(2)
+ .NumOutputs(1)
+ .SetDoc(R"DOC(
+Given a sequence of indices and a matrix, enforce that these indices have the
+best columnwise scores
+)DOC")
+ .Input(0, "predictions", "N*D predictions matrix")
+ .Input(1, "bestPath", "N*1 vector holds the best path indices")
+ .Output(0, "new_predictions", "N*D updated predictions matrix");
+NO_GRADIENT(SwapBestPath);
+} // namespace
+} // namespace caffe2
diff --git a/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc b/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
index b6966fe89c76fc..f70149110378fc 100644
--- a/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
+++ b/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
@@ -1,5 +1,5 @@
#include "caffe2/operators/fused_rowwise_8bit_conversion_ops.h"
-#include "caffe2/core/registry.h"
+#include "c10/util/Registry.h"
namespace caffe2 { REGISTER_CPU_OPERATOR(
diff --git a/caffe2/operators/fused_rowwise_random_quantization_ops.cc b/caffe2/operators/fused_rowwise_random_quantization_ops.cc
index ca5d8f25d3a9f2..9dec789393d993 100644
--- a/caffe2/operators/fused_rowwise_random_quantization_ops.cc
+++ b/caffe2/operators/fused_rowwise_random_quantization_ops.cc
@@ -1,5 +1,5 @@
#include "caffe2/operators/fused_rowwise_random_quantization_ops.h"
-#include "caffe2/core/registry.h"
+#include "c10/util/Registry.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
diff --git a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc
index 6dc47d7781d131..513ac64e795c41 100644
--- a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc
+++ b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc
@@ -1,5 +1,5 @@
#include "caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h"
-#include "caffe2/core/registry.h"
+#include "c10/util/Registry.h"
namespace caffe2 {
diff --git a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc
index bfa1a666e6ed9d..5ecfceef5dc612 100644
---
a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc +++ b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc @@ -1,5 +1,5 @@ #include "caffe2/operators/lengths_reducer_rowwise_8bit_ops.h" -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" namespace caffe2 { diff --git a/caffe2/operators/load_save_op.cc b/caffe2/operators/load_save_op.cc index ffef2f8b39fb5b..50dcf5259a84eb 100644 --- a/caffe2/operators/load_save_op.cc +++ b/caffe2/operators/load_save_op.cc @@ -5,6 +5,7 @@ namespace caffe2 { template <> void LoadOp::SetCurrentDevice(BlobProto* proto) { if (proto->has_tensor()) { + proto->mutable_tensor()->clear_device_detail(); proto->mutable_tensor()->mutable_device_detail()->set_device_type( PROTO_CPU); } diff --git a/caffe2/operators/load_save_op_gpu.cc b/caffe2/operators/load_save_op_gpu.cc index cd70e9c2b5df2f..eaa90b3dcdbc13 100644 --- a/caffe2/operators/load_save_op_gpu.cc +++ b/caffe2/operators/load_save_op_gpu.cc @@ -6,6 +6,7 @@ namespace caffe2 { template <> void LoadOp::SetCurrentDevice(BlobProto* proto) { if (proto->has_tensor()) { + proto->mutable_tensor()->clear_device_detail(); auto* device_detail = proto->mutable_tensor()->mutable_device_detail(); device_detail->set_device_type(PROTO_CUDA); device_detail->set_cuda_gpu_id(CaffeCudaGetDevice()); diff --git a/caffe2/operators/segment_reduction_op.cc b/caffe2/operators/segment_reduction_op.cc index 4029a52d0c3587..482d0599fc0e2b 100644 --- a/caffe2/operators/segment_reduction_op.cc +++ b/caffe2/operators/segment_reduction_op.cc @@ -51,6 +51,7 @@ OPERATOR_SCHEMA(SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient) REGISTER_CPU_OPERATOR( SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient, AbstractLengthsWithMainInputGradientOp< + float, float, int, CPUContext, diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h index 9e7ab6d604016c..4449613787e881 100644 --- a/caffe2/operators/segment_reduction_op.h +++ b/caffe2/operators/segment_reduction_op.h @@ -1616,6 +1616,7 @@ class AbstractLengthsGradientOp : public Operator { // Version of gradient that requires the main input and thus needs to receive // length, indices and other stuff template < + typename Tembedding, typename T, typename TLengths, class Context, @@ -1689,8 +1690,7 @@ class AbstractLengthsWithMainInputGradientOp : public Operator { int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1); T* dataGrads = dataGradsOutput->template mutable_data(); - const T* data = dataInput.template data(); - + const Tembedding* data = dataInput.template data(); int64_t dataIndex = 0; for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { ReducerGradient reducer( @@ -1945,6 +1945,7 @@ segments, i.e. len(*LENGTHS*). using BackwardOp = AbstractLengthsGradientOp; using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp< + T, T, SIndex, Context, @@ -2048,6 +2049,7 @@ i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor. 
ReducerGradient, false /*GradientNeedIndices*/>; using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp< + T, T, SIndex, Context,
diff --git a/caffe2/operators/stats_put_ops.cc b/caffe2/operators/stats_put_ops.cc
new file mode 100644
index 00000000000000..40c6b8cc60d085
--- /dev/null
+++ b/caffe2/operators/stats_put_ops.cc
@@ -0,0 +1,92 @@
+#include "caffe2/operators/stats_put_ops.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/stats.h"
+#include "caffe2/core/tensor.h"
+
+namespace caffe2 {
+#define REGISTER_TEMPLATED_STAT_PUT_OP(OP_NAME, STAT_NAME, STAT_MACRO) \
+ struct STAT_NAME { \
+ CAFFE_STAT_CTOR(STAT_NAME); \
+ STAT_MACRO(stat_value); \
+ }; \
+ REGISTER_CPU_OPERATOR(OP_NAME, TemplatePutOp);
+
+REGISTER_TEMPLATED_STAT_PUT_OP(
+ AveragePut,
+ AveragePutStat,
+ CAFFE_AVG_EXPORTED_STAT)
+
+OPERATOR_SCHEMA(AveragePut)
+ .NumInputs(1)
+ .NumOutputs(0)
+ .Arg(
+ "name",
+ "(*str*): name of the stat. If not present, then uses name of input blob")
+ .Arg(
+ "magnitude_expand",
+ "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+ .SetDoc(R"DOC(
+ Consumes a value and pushes it to the global stat registry as an average.
+
+ Github Links:
+ - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/stats_put_ops.cc
+
+ )DOC")
+ .Input(
+ 0,
+ "value",
+ "(*Tensor``*): A scalar tensor, representing any numeric value");
+
+REGISTER_TEMPLATED_STAT_PUT_OP(
+ IncrementPut,
+ IncrementPutStat,
+ CAFFE_EXPORTED_STAT)
+
+OPERATOR_SCHEMA(IncrementPut)
+ .NumInputs(1)
+ .NumOutputs(0)
+ .Arg(
+ "name",
+ "(*str*): name of the stat. If not present, then uses name of input blob")
+ .Arg(
+ "magnitude_expand",
+ "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+ .SetDoc(R"DOC(
+ Consumes a value and pushes it to the global stat registry as a sum.
+
+ Github Links:
+ - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/stats_put_ops.cc
+
+ )DOC")
+ .Input(
+ 0,
+ "value",
+ "(*Tensor``*): A scalar tensor, representing any numeric value");
+
+REGISTER_TEMPLATED_STAT_PUT_OP(
+ StdDevPut,
+ StdDevPutStat,
+ CAFFE_STDDEV_EXPORTED_STAT)
+
+OPERATOR_SCHEMA(StdDevPut)
+ .NumInputs(1)
+ .NumOutputs(0)
+ .Arg(
+ "name",
+ "(*str*): name of the stat. If not present, then uses name of input blob")
+ .Arg(
+ "magnitude_expand",
+ "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+ .SetDoc(R"DOC(
+ Consumes a value and pushes it to the global stat registry as a standard deviation.
+ + Github Links: + - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/stats_put_ops.cc + + )DOC") + .Input( + 0, + "value", + "(*Tensor``*): A scalar tensor, representing any numeric value"); + +} // namespace caffe2 diff --git a/caffe2/operators/stats_put_ops.h b/caffe2/operators/stats_put_ops.h new file mode 100644 index 00000000000000..659df219809d34 --- /dev/null +++ b/caffe2/operators/stats_put_ops.h @@ -0,0 +1,53 @@ +#include +#include "caffe2/core/operator.h" +#include "caffe2/core/stats.h" +#include "caffe2/core/tensor.h" +#include "caffe2/core/types.h" + +namespace caffe2 { + +template +struct TemplatePutOp : public Operator { + TemplatePutOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + given_name_(GetSingleArgument( + "stat_name", + operator_def.input().Get(0))), + magnitude_expand_(GetSingleArgument("magnitude_expand", 1)), + stat_(given_name_) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + auto input = *Input(0).template data(); + + CAFFE_ENFORCE( + static_cast(input + 1) < + std::numeric_limits::max() / magnitude_expand_, + "Input value is too large for the given magnitude expansion!"); + + int64_t int_value = input * magnitude_expand_; + + CAFFE_EVENT(stat_, stat_value, int_value); + + return true; + } + + private: + const std::string given_name_; + const long magnitude_expand_; + T stat_; +}; +} // namespace caffe2 diff --git a/caffe2/opt/annotations.cc b/caffe2/opt/annotations.cc index 937fb789cce125..271ce3dcc4c61b 100644 --- a/caffe2/opt/annotations.cc +++ b/caffe2/opt/annotations.cc @@ -27,6 +27,27 @@ caffe2::OperatorDef* Caffe2Annotation::getMutableOperatorDef() { } // Distributed annotations +void Caffe2Annotation::setDeviceOption(const caffe2::DeviceOption& devOpt) { + *OpDef.mutable_device_option() = devOpt; +} + +bool Caffe2Annotation::hasDeviceOption() const { + return OpDef.has_device_option(); +} + +const caffe2::DeviceOption& Caffe2Annotation::getDeviceOption() const { + CAFFE_ENFORCE( + hasDeviceOption(), + "DeviceOption was never set. Use Caffe2Annotation::setDeviceOption."); + return OpDef.device_option(); +} +caffe2::DeviceOption* Caffe2Annotation::getMutableDeviceOption() { + CAFFE_ENFORCE( + hasDeviceOption(), + "DeviceOption was never set. 
Use Caffe2Annotation::setDeviceOption."); + return OpDef.mutable_device_option(); +} + void Caffe2Annotation::setDevice(std::string device) { Device = device; } diff --git a/caffe2/opt/annotations.h b/caffe2/opt/annotations.h index e143c5e960c542..9bc1f1e3137648 100644 --- a/caffe2/opt/annotations.h +++ b/caffe2/opt/annotations.h @@ -19,6 +19,11 @@ class CAFFE2_API Caffe2Annotation : public nom::repr::Annotation { const caffe2::OperatorDef& getOperatorDef() const; caffe2::OperatorDef* getMutableOperatorDef(); + void setDeviceOption(const caffe2::DeviceOption& opDef); + bool hasDeviceOption() const; + const caffe2::DeviceOption& getDeviceOption() const; + caffe2::DeviceOption* getMutableDeviceOption(); + // Distributed annotations void setDevice(std::string device); const std::string getDevice() const; diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index f9956060b75cfd..46fd8349b05832 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -56,7 +56,7 @@ int getGroup(std::map& argMap) { namespace caffe2 { -CAFFE_DEFINE_REGISTRY(ConverterRegistry, Converter); +C10_DEFINE_REGISTRY(ConverterRegistry, Converter); std::map Converter::getArgumentsFromOperator( caffe2::OperatorDef op) { @@ -519,4 +519,48 @@ caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m, const caffe2::NetDef& old return predictNet; } +void pushOpToFront(caffe2::OperatorDef& op, caffe2::NetDef* net) { + *net->add_op() = op; + google::protobuf::RepeatedPtrField* op_list( + net->mutable_op()); + // Reverse iterate, swapping new element in front each time + for (int i(net->op_size() - 1); i > 0; --i) { + op_list->SwapElements(i, i - 1); + } +} + +void injectDataEdgeIndicators(caffe2::NetDef* net) { + for (const auto& input : net->external_input()) { + caffe2::OperatorDef op; + op.set_type("Declare"); + op.add_output(input); + pushOpToFront(op, net); + } + for (const auto& output : net->external_output()) { + caffe2::OperatorDef op; + op.set_type("Export"); + op.add_input(output); + *net->add_op() = op; + } + net->clear_external_input(); + net->clear_external_output(); +} + +void removeDataEdgeIndicators(caffe2::NetDef* net) { + google::protobuf::RepeatedPtrField* op_list( + net->mutable_op()); + for (auto i = 0; i < net->op_size(); ++i) { + auto op = net->op(i); + if (op.type() == "Declare") { + net->add_external_input(op.output(0)); + } else if (op.type() == "Export") { + net->add_external_output(op.input(0)); + } else { + continue; + } + // Note that this compensates for modifying the list inplace + op_list->DeleteSubrange(i--, 1); + } +} + } // namespace caffe2 diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index c106fc66057916..9666739d14f016 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -13,6 +13,9 @@ namespace caffe2 { +CAFFE2_API void injectDataEdgeIndicators(caffe2::NetDef* net); +CAFFE2_API void removeDataEdgeIndicators(caffe2::NetDef* net); + CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict = false); CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); @@ -44,9 +47,9 @@ class CAFFE2_API Converter { virtual ~Converter() {} }; -CAFFE_DECLARE_REGISTRY(ConverterRegistry, Converter); +C10_DECLARE_REGISTRY(ConverterRegistry, Converter); #define REGISTER_CONVERTER(name, cls) \ - CAFFE_REGISTER_CLASS(ConverterRegistry, name, cls) + C10_REGISTER_CLASS(ConverterRegistry, name, cls) #define TRIVIAL_CONVERTER(opName) \ class opName##Converter : public Converter { \ diff --git a/caffe2/opt/converter_nomigraph_test.cc 
b/caffe2/opt/converter_nomigraph_test.cc index 995c9a5961c800..e9da69a42dbe3c 100644 --- a/caffe2/opt/converter_nomigraph_test.cc +++ b/caffe2/opt/converter_nomigraph_test.cc @@ -98,3 +98,37 @@ TEST(Converter, ExternalOutputs) { EXPECT_EQ(new_netdef.external_output(i), net.external_output(i)); } } + +TEST(Converter, InjectDataEdgeIndicators) { + auto net = fakeNet(); + caffe2::injectDataEdgeIndicators(&net); + + EXPECT_EQ(net.op_size(), 3 + 1 + 2); // Inserted 1 Declare and 2 Export + + auto declare_count = 0; + auto export_count = 0; + for (const auto& op : net.op()) { + declare_count += op.type() == "Declare"; + export_count += op.type() == "Export"; + } + EXPECT_EQ(declare_count, 1); + EXPECT_EQ(export_count, 2); + + // Remove them from the network + EXPECT_EQ(net.external_input_size(), 0); + EXPECT_EQ(net.external_output_size(), 0); + + // Ensure nomnigraph can handle this change + auto nn = caffe2::convertToNNModule(net); + auto new_net = caffe2::convertToCaffe2Proto(nn); + + caffe2::removeDataEdgeIndicators(&new_net); + + for (const auto& op : new_net.op()) { + EXPECT_NE(op.type(), "Declare"); + EXPECT_NE(op.type(), "Export"); + } + + EXPECT_EQ(new_net.external_input_size(), 1); + EXPECT_EQ(new_net.external_output_size(), 2); +} diff --git a/caffe2/opt/passes.cc b/caffe2/opt/passes.cc index e9f05a9df01c79..74250d1bbb3b12 100644 --- a/caffe2/opt/passes.cc +++ b/caffe2/opt/passes.cc @@ -2,7 +2,11 @@ namespace caffe2 { -CAFFE_DEFINE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, NNModule*, Workspace*); -CAFFE_DEFINE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); +C10_DEFINE_REGISTRY( + WorkspaceOptimizationPassRegistry, + WorkspaceOptimizationPass, + NNModule*, + Workspace*); +C10_DEFINE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); } // namespace caffe2 diff --git a/caffe2/opt/passes.h b/caffe2/opt/passes.h index 056dbcf8779b3b..fc15dcad13fe7b 100644 --- a/caffe2/opt/passes.h +++ b/caffe2/opt/passes.h @@ -40,9 +40,13 @@ class CAFFE2_API WorkspaceOptimizationPass : public OptimizationPass { Workspace* ws_; }; -CAFFE_DECLARE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, NNModule*, Workspace*); +C10_DECLARE_REGISTRY( + WorkspaceOptimizationPassRegistry, + WorkspaceOptimizationPass, + NNModule*, + Workspace*); #define REGISTER_WS_OPT_PASS(clsname) \ - CAFFE_REGISTER_CLASS(WorkspaceOptimizationPassRegistry, clsname, clsname) + C10_REGISTER_CLASS(WorkspaceOptimizationPassRegistry, clsname, clsname) #define REGISTER_WS_OPT_PASS_FROM_FUNC(passname, funcname) \ class passname : public WorkspaceOptimizationPass { \ public: \ @@ -53,9 +57,9 @@ CAFFE_DECLARE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationP }; \ REGISTER_WS_OPT_PASS(passname); -CAFFE_DECLARE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); +C10_DECLARE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); #define REGISTER_OPT_PASS(clsname) \ - CAFFE_REGISTER_CLASS(OptimizationPassRegistry, clsname, clsname) + C10_REGISTER_CLASS(OptimizationPassRegistry, clsname, clsname) #define REGISTER_OPT_PASS_FROM_FUNC(passname, funcname) \ class passname : public OptimizationPass { \ public: \ diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h index 0a08c8db241e98..ded59d52b21f47 100644 --- a/caffe2/proto/caffe2_pb.h +++ b/caffe2/proto/caffe2_pb.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include @@ -47,6 +47,10 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto 
p) { } } +inline CAFFE2_API DeviceType ProtoToType(int p) { + return ProtoToType(static_cast(p)); +} + inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { switch (t) { case DeviceType::CPU: @@ -77,4 +81,59 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { } } +inline CAFFE2_API caffe2::DeviceOption DeviceToOption( + const at::Device& device) { + caffe2::DeviceOption option; + auto type = device.type(); + option.set_device_type(TypeToProto(type)); + + switch (type) { + case DeviceType::CPU: + if (device.index() != -1) { + option.set_numa_node_id(device.index()); + } + break; + case DeviceType::CUDA: + option.set_cuda_gpu_id(device.index()); + break; + case DeviceType::HIP: + option.set_hip_gpu_id(device.index()); + break; + case DeviceType::OPENGL: + case DeviceType::OPENCL: + case DeviceType::MKLDNN: + case DeviceType::IDEEP: + case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES: + case DeviceType::ONLY_FOR_TEST: + break; + default: + AT_ERROR( + "Unknown device:", + static_cast(type), + ". If you have recently updated the caffe2.proto file to add a new " + "device type, did you forget to update the ProtoToType() and TypeToProto" + "function to reflect such recent changes?"); + } + return option; +} + +inline CAFFE2_API at::Device OptionToDevice(const caffe2::DeviceOption option) { + auto type = option.device_type(); + int32_t id = -1; + switch (type) { + case caffe2::PROTO_CPU: + if (option.has_numa_node_id()) { + id = option.numa_node_id(); + } + break; + case caffe2::PROTO_CUDA: + id = option.cuda_gpu_id(); + break; + case caffe2::PROTO_HIP: + id = option.hip_gpu_id(); + break; + } + return at::Device(ProtoToType(type), id); +} + } // namespace caffe2 diff --git a/caffe2/python/crf_predict.py b/caffe2/python/crf_predict.py new file mode 100644 index 00000000000000..dd1c8720bfb153 --- /dev/null +++ b/caffe2/python/crf_predict.py @@ -0,0 +1,33 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import numpy as np +from caffe2.python.crf import CRFWithLoss + + +def crf_update_predictions(model, crf_with_loss, classes): + return apply_crf( + model.param_init_net, + model.net, + crf_with_loss.transitions, + classes, + crf_with_loss.num_classes, + ) + + +def apply_crf(init_net, net, transitions, predictions, num_classes): + padded_classes = CRFWithLoss.pad_predictions( + predictions, init_net, net, num_classes + ) + bestPath = net.ViterbiPath([padded_classes, transitions]) + new_padded_classes = net.SwapBestPath([padded_classes, bestPath]) + # Revert the effect of pad_predictions by removing the last two rows and + # the last two columns + new_classes = net.RemovePadding( + [new_padded_classes], padding_width=1, end_padding_width=1 + ) + slice_starts = np.array([0, 0]).astype(np.int32) + slice_ends = np.array([-1, -3]).astype(np.int32) + slice_starts = net.GivenTensorIntFill([], shape=[2], values=slice_starts) + slice_ends = net.GivenTensorIntFill([], shape=[2], values=slice_ends) + new_classes = net.Slice([new_classes, slice_starts, slice_ends]) + return new_classes diff --git a/caffe2/python/crf_viterbi_test.py b/caffe2/python/crf_viterbi_test.py new file mode 100644 index 00000000000000..a4502d27e3e990 --- /dev/null +++ b/caffe2/python/crf_viterbi_test.py @@ -0,0 +1,45 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from caffe2.python import workspace, crf + +from caffe2.python.cnn import CNNModelHelper +from 
caffe2.python.crf_predict import crf_update_predictions +from caffe2.python.test_util import TestCase +import hypothesis.strategies as st +from hypothesis import given +import numpy as np + + +class TestCrfDecode(TestCase): + + @given(num_tags=st.integers(2, 4), num_words=st.integers(2, 15)) + def test_crf_viterbi(self, num_tags, num_words): + model = CNNModelHelper(name='external') + predictions = np.random.randn(num_words, num_tags).astype(np.float32) + transitions = np.random.uniform( + low=-1, high=1, size=(num_tags + 2, num_tags + 2) + ).astype(np.float32) + predictions_blob, transitions_blob = ( + model.net.AddExternalInputs('predictions', 'crf_transitions') + ) + workspace.FeedBlob(str(transitions_blob), transitions) + workspace.FeedBlob(str(predictions_blob), predictions) + crf_layer = crf.CRFWithLoss(model, num_tags, transitions_blob) + + updated_predictions = crf_update_predictions( + model, crf_layer, predictions_blob + ) + ref_predictions = crf_layer.update_predictions(predictions_blob) + + workspace.RunNetOnce(model.param_init_net) + workspace.RunNetOnce(model.net) + + updated_predictions = workspace.FetchBlob(str(updated_predictions)) + ref_predictions = workspace.FetchBlob(str(ref_predictions)) + np.testing.assert_allclose( + updated_predictions, + ref_predictions, + atol=1e-4, rtol=1e-4, err_msg='Mismatch in CRF predictions' + ) diff --git a/caffe2/python/ideep/moment_sgd_op_test.py b/caffe2/python/ideep/moment_sgd_op_test.py new file mode 100644 index 00000000000000..90b49a8600d76c --- /dev/null +++ b/caffe2/python/ideep/moment_sgd_op_test.py @@ -0,0 +1,61 @@ +from __future__ import unicode_literals +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import hypothesis.strategies as st +import unittest +import caffe2.python.hypothesis_test_util as hu +from caffe2.python import core, workspace +from hypothesis import given +import caffe2.python.ideep_test_util as mu + + +@unittest.skipIf(not workspace.C.use_ideep, "No IDEEP support.") +class TestMomentumSGDUpdateOps(hu.HypothesisTestCase): + @given(n=st.integers(4, 8), nesterov=st.booleans(), + **mu.gcs) + def test_MomentumSGDUpdate(self, n, nesterov, gc, dc): + param = np.random.rand(n).astype(np.float32) + grad = np.random.rand(n).astype(np.float32) + lr = np.random.rand(1).astype(np.float32) + param_momentum = np.random.rand(n).astype(np.float32) + momentum = 0.9 + op = core.CreateOperator( + "MomentumSGDUpdate", + ["grad", "param_momentum", "lr", "param"], + ["grad", "param_momentum", "param"], + momentum=momentum, + nesterov=int(nesterov), + ) + # Iter lives on the CPU + input_device_options = {'lr': hu.cpu_do} + + self.assertDeviceChecks( + dc, + op, + [grad, param_momentum, lr, param], + [0], + input_device_options=input_device_options, + threshold=0.001) + + op_noparam = core.CreateOperator( + "MomentumSGD", + ["grad", "param_momentum", "lr"], + ["grad", "param_momentum"], + momentum=momentum, + nesterov=int(nesterov), + ) + + self.assertDeviceChecks( + dc, + op_noparam, + [grad, param_momentum, lr], + [0], + input_device_options=input_device_options, + threshold=0.001) + + +if __name__ == "__main__": + unittest.main() diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index d5ab8e58cec0f2..5d05c1e5b23b91 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -39,10 +39,11 @@ def _inputs(draw): rows_num = 
draw(st.integers(1, 100)) index_num = draw(st.integers(1, 10)) batch_size = draw(st.integers(2, 10)) + block_size = draw(st.integers(1, 2)) return ( draw(hnp.arrays( np.float32, - (batch_size, rows_num, 2), + (batch_size, rows_num, block_size), elements=st.floats(-10.0, 10.0), )), draw(hnp.arrays( diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index 07f378beb18ff0..2d53027a0a053d 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import errno import hypothesis.strategies as st -from hypothesis import given +from hypothesis import given, assume import numpy as np import os import shutil @@ -42,6 +42,8 @@ def load_save(self, src_device_type, src_gpu_id, np.int16, np.int32, np.int64, np.uint8, np.uint16] arrays = [np.random.permutation(6).reshape(2, 3).astype(T) for T in dtypes] + assume(src_device_type == caffe2_pb2.CUDA or src_gpu_id == 0) + assume(dst_device_type == caffe2_pb2.CUDA or dst_gpu_id == 0) src_device_option = core.DeviceOption( src_device_type, src_gpu_id) dst_device_option = core.DeviceOption( diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py new file mode 100644 index 00000000000000..d3757c3b396e50 --- /dev/null +++ b/caffe2/python/operator_test/stats_put_ops_test.py @@ -0,0 +1,102 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core, workspace +from caffe2.python.test_util import TestCase +import numpy as np + + +class TestPutOps(TestCase): + + def test_avg_put_ops(self): + put_value = 15.1111 + magnitude_expand = 10000 + stat_name = "a1".encode('ascii') + sum_postfix = "/stat_value/sum".encode("ascii") + count_postfix = "/stat_value/count".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "AveragePut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + sum_postfix, stat_dict) + self.assertIn(stat_name + count_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + sum_postfix], + put_value * magnitude_expand) + self.assertEquals(stat_dict[stat_name + count_postfix], 1) + + def test_increment_put_ops(self): + put_value = 15.1111 + magnitude_expand = 10000 + stat_name = "i1".encode('ascii') + member_postfix = "/stat_value".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "IncrementPut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + member_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + member_postfix], + put_value * magnitude_expand) + + def test_stddev_put_ops(self): + put_value = 15.1111 + magnitude_expand = 10000 + stat_name = "s1".encode('ascii') + sum_postfix = "/stat_value/sum".encode("ascii") + 
count_postfix = "/stat_value/count".encode("ascii") + sumoffset_postfix = "/stat_value/sumoffset".encode("ascii") + sumsqoffset_postfix = "/stat_value/sumsqoffset".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "StdDevPut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + sum_postfix, stat_dict) + self.assertIn(stat_name + count_postfix, stat_dict) + self.assertIn(stat_name + sumoffset_postfix, stat_dict) + self.assertIn(stat_name + sumsqoffset_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + sum_postfix], + put_value * magnitude_expand) + self.assertEquals(stat_dict[stat_name + count_postfix], 1) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 482d16a0dfa6a6..0c5b18b0b6ab11 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -22,6 +22,8 @@ AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"]) _optimizer_instance_count = defaultdict(int) +FP16_ENGINES = ["SIMD_Q_FP16", "SIMD_Q_STOC_FP16", "SIMD_Q_STOC_MKL_FP16"] + logger = logging.getLogger(__name__) @@ -584,7 +586,7 @@ def _run(self, net, param_init_net, param_info): value=0.0 ) else: - if self.engine == "SIMD_Q_FP16" or self.engine == "SIMD_Q_STOC_FP16": + if self.engine in FP16_ENGINES: shapes, types = workspace.InferShapesAndTypes([param_init_net]) assert str(param) in shapes, shapes shape = shapes[str(param)] diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 9a1d715bfdf225..7062ead045df1c 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -53,12 +53,12 @@ static std::string gCurrentWorkspaceName; BlobFetcherBase::~BlobFetcherBase() {} BlobFeederBase::~BlobFeederBase() {} -CAFFE_DEFINE_TYPED_REGISTRY( +C10_DEFINE_TYPED_REGISTRY( BlobFetcherRegistry, TypeIdentifier, BlobFetcherBase, std::unique_ptr); -CAFFE_DEFINE_TYPED_REGISTRY( +C10_DEFINE_TYPED_REGISTRY( BlobFeederRegistry, caffe2::DeviceType, BlobFeederBase, diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 4f81569e429369..dcb416b07a8fea 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -60,24 +60,24 @@ class BlobFeederBase { Feed(const DeviceOption& option, PyArrayObject* array, Blob* blob) = 0; }; -C10_EXPORT CAFFE_DECLARE_TYPED_REGISTRY( +C10_DECLARE_TYPED_REGISTRY( BlobFetcherRegistry, TypeIdentifier, BlobFetcherBase, std::unique_ptr); #define REGISTER_BLOB_FETCHER(id, ...) \ - CAFFE_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__) inline unique_ptr CreateFetcher(TypeIdentifier id) { return BlobFetcherRegistry()->Create(id); } -CAFFE_DECLARE_TYPED_REGISTRY( +C10_DECLARE_TYPED_REGISTRY( BlobFeederRegistry, DeviceType, BlobFeederBase, std::unique_ptr); #define REGISTER_BLOB_FEEDER(device_type, ...) 
\ - CAFFE_REGISTER_TYPED_CLASS(BlobFeederRegistry, device_type, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(BlobFeederRegistry, device_type, __VA_ARGS__) inline unique_ptr CreateFeeder(int device_type) { return BlobFeederRegistry()->Create( caffe2::ProtoToType(static_cast(device_type))); @@ -148,7 +148,7 @@ class TensorFetcher : public BlobFetcherBase { } if (result.copied) { - auto context = tensor.GetStaticContext()->CreateContext(); + auto context = CreateContext(tensor.GetDeviceType()); context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr); context->FinishDeviceComputation(); } diff --git a/caffe2/python/pybind_state_registry.cc b/caffe2/python/pybind_state_registry.cc index 9dfb87731ff4de..77fabf34256480 100644 --- a/caffe2/python/pybind_state_registry.cc +++ b/caffe2/python/pybind_state_registry.cc @@ -5,7 +5,7 @@ namespace python { namespace py = pybind11; -CAFFE_DEFINE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&); +C10_DEFINE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&); } // namespace python } // namespace caffe2 diff --git a/caffe2/python/pybind_state_registry.h b/caffe2/python/pybind_state_registry.h index a107e7db8ea0ad..18bb0a3dbaa01d 100644 --- a/caffe2/python/pybind_state_registry.h +++ b/caffe2/python/pybind_state_registry.h @@ -1,7 +1,7 @@ #pragma once #include -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" namespace caffe2 { namespace python { @@ -14,19 +14,16 @@ struct PybindAddition { virtual ~PybindAddition(){}; }; -CAFFE_DECLARE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&); +C10_DECLARE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&); -#define REGISTER_PYBIND_ADDITION(funcname) \ - namespace { \ - struct funcname##Impl : public PybindAddition { \ - funcname##Impl(py::module& m) { \ - funcname(m); \ - } \ - }; \ - CAFFE_REGISTER_CLASS( \ - PybindAdditionRegistry, \ - funcname##Impl, \ - funcname##Impl); \ +#define REGISTER_PYBIND_ADDITION(funcname) \ + namespace { \ + struct funcname##Impl : public PybindAddition { \ + funcname##Impl(py::module& m) { \ + funcname(m); \ + } \ + }; \ + C10_REGISTER_CLASS(PybindAdditionRegistry, funcname##Impl, funcname##Impl); \ } } // namespace python diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py index dc1f7370132230..fd4b3ab030428d 100644 --- a/caffe2/python/test_util.py +++ b/caffe2/python/test_util.py @@ -30,6 +30,10 @@ def randBlobsFloat32(names, *dims, **kwargs): randBlobFloat32(name, *dims, **kwargs) +def numOps(net): + return len(net.Proto().op) + + def str_compare(a, b, encoding="utf8"): if isinstance(a, bytes): a = a.decode(encoding) diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 26f5450605a1c1..502c844404c567 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -30,134 +30,86 @@ class TestTransformations(tu.TestCase): - def test_transformer_AddNNPACK(self): + def _base_test_net(self): net = core.Net("net") net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - net.Relu(["Y"], ["Y2"]) + return net + + def _add_nnpack(self, net): transformer.AddNNPACK(net) assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - def test_transformer_FuseNNPACKConvRelu(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - net.Relu(["Y"], ["Y2"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") + 
def _fuse_nnpack_convrelu(self, net, expected_result_num_ops, + expected_activation_arg=True): + self._add_nnpack(net) transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 1 + self.assertEquals(tu.numOps(net), expected_result_num_ops) has_activation_arg = False for arg in net.Proto().op[0].arg: if tu.str_compare(arg.name, "activation"): assert tu.str_compare(arg.s, "Relu") has_activation_arg = True - assert has_activation_arg + if expected_activation_arg: + assert has_activation_arg + else: + assert not has_activation_arg + + def test_transformer_AddNNPACK(self): + net = self._base_test_net() + net.Relu(["Y"], ["Y2"]) + self._add_nnpack(net) + + def test_transformer_FuseNNPACKConvRelu(self): + net = self._base_test_net() + net.Relu(["Y"], ["Y2"]) + self._fuse_nnpack_convrelu(net, 1) def test_noFuseNNPACKConvRelu(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y2"]) net.Relu(["Y"], ["Y3"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 3 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation") and tu.str_compare(arg.s, "Relu"): - has_activation_arg = True - assert not has_activation_arg + self._fuse_nnpack_convrelu(net, 3, expected_activation_arg=False) def test_transformer_FuseNNPACKConvReluNoInplace(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["X"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 1 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 1) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] def test_transformer_FuseNNPACKConvReluInplaceRelu(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 1 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 1) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] def test_transformer_FuseNNPACKConvReluPingPongNaming(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["X"]) net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 2 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert 
has_activation_arg + self._fuse_nnpack_convrelu(net, 2) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] def test_transformer_FuseNNPACKConvReluFollowedByMultipleInputOp(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y2"]) net.Conv(["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y2"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 2 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 2) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] def test_transformer_FuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y"]) net.Conv(["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y2"], ["Y2"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 2 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 2) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] def test_transformer_SinkMaxPool(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.MaxPool(["Y"], ["Y1"], kernel=3) net.Relu(["Y1"], ["Y1"]) transformer.SinkMaxPool(net) @@ -205,7 +157,7 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference @@ -256,7 +208,7 @@ def test_transformer_FuseConvBNNoConvBias(self, size, input_channels, seed, orde transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference @@ -307,7 +259,7 @@ def test_transformer_FuseConvBNNoConvBiasDuplicatedName(self, size, input_channe transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() print("pre") @@ -365,7 +317,7 @@ def test_transformer_FuseConv3DBN( transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference diff --git a/cmake/Dependencies.cmake 
b/cmake/Dependencies.cmake index 82aff7b8cc87d5..d9ddfcdfc4f9ee 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -797,7 +797,10 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) if (CAFFE2_LINK_LOCAL_PROTOBUF) set(ONNX_PROTO_POST_BUILD_SCRIPT ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake) endif() + # Add op schemas in "ai.onnx.pytorch" domain + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../caffe2/onnx/torch_ops") add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/onnx) + include_directories(${ONNX_INCLUDE_DIRS}) add_definitions(-DONNX_NAMESPACE=${ONNX_NAMESPACE}) # In mobile build we care about code size, and so we need drop diff --git a/docker/caffe2/jenkins/common/install_clang.sh b/docker/caffe2/jenkins/common/install_clang.sh index 694606ec0b91f3..fbf5515bae36d5 100755 --- a/docker/caffe2/jenkins/common/install_clang.sh +++ b/docker/caffe2/jenkins/common/install_clang.sh @@ -4,6 +4,13 @@ set -ex [ -n "$CLANG_VERSION" ] +if [[ "$CLANG_VERSION" == "7" ]]; then + apt-get update + apt-get install -y --no-install-recommends software-properties-common wget + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - + apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-7 main" +fi + apt-get update apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docs/cpp/source/Doxyfile b/docs/cpp/source/Doxyfile index 2ab4947453eaca..b600edee6c2498 100644 --- a/docs/cpp/source/Doxyfile +++ b/docs/cpp/source/Doxyfile @@ -66,6 +66,8 @@ CREATE_SUBDIRS = NO FULL_PATH_NAMES = YES # Nested folders will be ignored without this. RECURSIVE = YES +# Blacklist certain file patterns from the INPUT section. +EXCLUDE = ../../../torch/csrc/api/include/torch/nn/pimpl-inl.h ################################################################################ # Output formats for Doxygen to create. # ################################################################################ @@ -102,7 +104,7 @@ EXTRACT_ALL = YES EXTRACT_PACKAGE = YES EXTRACT_STATIC = YES CASE_SENSE_NAMES = NO -EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* +EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* std::* ################################################################################ # Docstring control / customization. # ################################################################################ diff --git a/docs/cpp/source/building.rst b/docs/cpp/source/building.rst deleted file mode 100644 index 24ab7a5e69ba3f..00000000000000 --- a/docs/cpp/source/building.rst +++ /dev/null @@ -1,2 +0,0 @@ -Building -======== diff --git a/docs/cpp/source/contributing.rst b/docs/cpp/source/contributing.rst index 5a1988f1db7c62..14ae9224d734ea 100644 --- a/docs/cpp/source/contributing.rst +++ b/docs/cpp/source/contributing.rst @@ -1,2 +1,8 @@ -Contributing -============ +Contributing to PyTorch +======================= + +If you would like to contribute to the PyTorch C++ API, refer to the +`CONTRIBUTING.md +`_ document in +the PyTorch repository. It contains instructions on how to develop PyTorch from source +and submit a proposal for your patch or feature. We will be happy to review it! 
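Not part of the patch: the `DeviceToOption()` and `OptionToDevice()` helpers added to `caffe2/proto/caffe2_pb.h` above are not exercised anywhere in this change, so here is a minimal, hedged sketch of how they are expected to round-trip between `at::Device` and `caffe2::DeviceOption`, assuming the patch is applied and the usual caffe2/ATen headers are available. The `main()` wrapper and the chosen device index are illustrative only.

```cpp
// Usage sketch (assumption: the caffe2_pb.h changes above are applied).
#include <iostream>
#include "caffe2/proto/caffe2_pb.h"

int main() {
  // Describe "CUDA device 1" the ATen way...
  at::Device device(at::DeviceType::CUDA, /*index=*/1);

  // ...convert it to a caffe2 DeviceOption proto...
  caffe2::DeviceOption option = caffe2::DeviceToOption(device);
  std::cout << option.device_type() << " "    // equals caffe2::PROTO_CUDA
            << option.cuda_gpu_id() << "\n";  // 1

  // ...and round-trip it back to an at::Device.
  at::Device roundtrip = caffe2::OptionToDevice(option);
  std::cout << (roundtrip.type() == at::DeviceType::CUDA) << " "
            << static_cast<int>(roundtrip.index()) << "\n";  // 1 1
  return 0;
}
```

This is the same conversion the new `at::Device`-taking context constructors in this patch rely on, e.g. `MKLContext(const at::Device& device) : MKLContext(DeviceToOption(device)) {}`.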
diff --git a/docs/cpp/source/examples.rst b/docs/cpp/source/examples.rst deleted file mode 100644 index bac945d559fec7..00000000000000 --- a/docs/cpp/source/examples.rst +++ /dev/null @@ -1,2 +0,0 @@ -Examples -======== diff --git a/docs/cpp/source/frontend.rst b/docs/cpp/source/frontend.rst new file mode 100644 index 00000000000000..0a9a9943c6cbcd --- /dev/null +++ b/docs/cpp/source/frontend.rst @@ -0,0 +1,146 @@ +The PyTorch C++ Frontend +======================== + +The PyTorch C++ frontend is a C++11 library for CPU and GPU +tensor computation, with automatic differentation and high level building +blocks for state of the art machine learning applications. + +Description +----------- + +The PyTorch C++ frontend can be thought of as a C++ version of the +PyTorch Python frontend, providing automatic differentiation and various higher +level abstractions for machine learning and neural networks. Specifically, +it consists of the following components: + ++----------------------+------------------------------------------------------------------------+ +| Component | Description | ++======================+========================================================================+ +| ``torch::Tensor`` | Automatically differentiable, efficient CPU and GPU enabled tensors | ++----------------------+------------------------------------------------------------------------+ +| ``torch::nn`` | A collection of composable modules for neural network modeling | ++----------------------+------------------------------------------------------------------------+ +| ``torch::optim`` | Optimization algorithms like SGD, Adam or RMSprop to train your models | ++----------------------+------------------------------------------------------------------------+ +| ``torch::data`` | Datasets, data pipelines and multi-threaded, asynchronous data loader | ++----------------------+------------------------------------------------------------------------+ +| ``torch::serialize`` | A serialization API for storing and loading model checkpoints | ++----------------------+------------------------------------------------------------------------+ +| ``torch::python`` | Glue to bind your C++ models into Python | ++----------------------+------------------------------------------------------------------------+ +| ``torch::jit`` | Pure C++ access to the TorchScript JIT compiler | ++----------------------+------------------------------------------------------------------------+ + +End-to-end example +------------------ + +Here is a simple, end-to-end example of defining and training a simple +neural network on the MNIST dataset: + +.. code-block:: cpp + + #include + + // Define a new Module. + struct Net : torch::nn::Module { + Net() { + // Construct and register two Linear submodules. + fc1 = register_module("fc1", torch::nn::Linear(8, 64)); + fc2 = register_module("fc2", torch::nn::Linear(64, 1)); + } + + // Implement the Net's algorithm. + torch::Tensor forward(torch::Tensor x) { + // Use one of many tensor manipulation functions. + x = torch::relu(fc1->forward(x)); + x = torch::dropout(x, /*p=*/0.5); + x = torch::sigmoid(fc2->forward(x)); + return x; + } + + // Use one of many "standard library" modules. + torch::nn::Linear fc1{nullptr}, fc2{nullptr}; + }; + + // Create a new Net. + Net net; + + // Create a multi-threaded data loader for the MNIST dataset. + auto data_loader = + torch::data::data_loader(torch::data::datasets::MNIST("./data")); + + // Instantiate an SGD optimization algorithm to update our Net's parameters. 
+ torch::optim::SGD optimizer(net.parameters(), /*lr=*/0.1); + + for (size_t epoch = 1; epoch <= 10; ++epoch) { + size_t batch_index = 0; + // Iterate the data loader to yield batches from the dataset. + for (auto batch : data_loader) { + // Reset gradients. + optimizer.zero_grad(); + // Execute the model on the input data. + auto prediction = model.forward(batch.data); + // Compute a loss value to judge the prediction of our model. + auto loss = torch::binary_cross_entropy(prediction, batch.label); + // Compute gradients of the loss w.r.t. the parameters of our model. + loss.backward(); + // Update the parameters based on the calculated gradients. + optimizer.step(); + + if (batch_index++ % 10 == 0) { + std::cout << "Epoch: " << epoch << " | Batch: " << batch_index + << " | Loss: " << loss << std::endl; + // Serialize your model periodically as a checkpoint. + torch::save(net, "net.pt"); + } + } + +To see more complete examples of using the PyTorch C++ frontend, see `the example repository +`_. + +Philosophy +---------- + +PyTorch's C++ frontend was designed with the idea that the Python frontend is +great, and should be used when possible; but in some settings, performance and +portability requirements make the use of the Python interpreter infeasible. For +example, Python is a poor choice for low latency, high performance or +multithreaded environments, such as video games or production servers. The +goal of the C++ frontend is to address these use cases, while not sacrificing +the user experience of the Python frontend. + +As such, the C++ frontend has been written with a few philosophical goals in mind: + +* **Closely model the Python frontend in its design**, naming, conventions and + functionality. While there may be occasional differences between the two + frontends (e.g., where we have dropped deprecated features or fixed "warts" + in the Python frontend), we guarantee that the effort in porting a Python model + to C++ should lie exclusively in **translating language features**, + not modifying functionality or behavior. + +* **Prioritize flexibility and user-friendliness over micro-optimization.** + In C++, you can often get optimal code, but at the cost of an extremely + unfriendly user experience. Flexibility and dynamism is at the heart of + PyTorch, and the C++ frontend seeks to preserve this experience, in some + cases sacrificing performance (or "hiding" performance knobs) to keep APIs + simple and explicable. We want researchers who don't write C++ for a living + to be able to use our APIs. + +A word of warning: Python is not necessarily slower than +C++! The Python frontend calls into C++ for almost anything computationally expensive +(especially any kind of numeric operation), and these operations will take up +the bulk of time spent in a program. If you would prefer to write Python, +and can afford to write Python, we recommend using the Python interface to +PyTorch. However, if you would prefer to write C++, or need to write C++ +(because of multithreading, latency or deployment requirements), the +C++ frontend to PyTorch provides an API that is approximately as convenient, +flexible, friendly and intuitive as its Python counterpart. The two frontends +serve different use cases, work hand in hand, and neither is meant to +unconditionally replace the other. 
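The end-to-end snippet above was written against a pre-release API: the ``torch::data::data_loader(...)`` call predates the data-loading API that eventually shipped, and the loop refers to a ``model`` variable although the object is named ``net``. As a cross-check, here is a minimal, self-contained sketch of the same define-and-train flow against the stable LibTorch C++ frontend, using synthetic tensors in place of the MNIST data loader so it compiles on its own; the shapes, learning rate and epoch count are illustrative only.

.. code-block:: cpp

    #include <torch/torch.h>
    #include <iostream>

    // Same two-layer network as in the example above.
    struct Net : torch::nn::Module {
      Net() {
        fc1 = register_module("fc1", torch::nn::Linear(8, 64));
        fc2 = register_module("fc2", torch::nn::Linear(64, 1));
      }

      torch::Tensor forward(torch::Tensor x) {
        x = torch::relu(fc1->forward(x));
        // The released torch::dropout takes an explicit `train` flag.
        x = torch::dropout(x, /*p=*/0.5, /*train=*/is_training());
        return torch::sigmoid(fc2->forward(x));
      }

      torch::nn::Linear fc1{nullptr}, fc2{nullptr};
    };

    int main() {
      Net net;
      torch::optim::SGD optimizer(net.parameters(), /*lr=*/0.1);

      // Synthetic stand-in for a real dataset / data loader.
      auto inputs = torch::randn({32, 8});
      auto targets = torch::rand({32, 1});

      for (int epoch = 1; epoch <= 10; ++epoch) {
        optimizer.zero_grad();
        auto prediction = net.forward(inputs);
        auto loss = torch::binary_cross_entropy(prediction, targets);
        loss.backward();
        optimizer.step();
        std::cout << "Epoch: " << epoch
                  << " | Loss: " << loss.item<float>() << std::endl;
      }
    }

The structure mirrors the Python frontend: modules register their parameters, the optimizer holds references to them, ``backward()`` populates the gradients, and ``optimizer.step()`` consumes them.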
+ +Installation +------------ + +Instructions on how to install the C++ frontend library distribution, including +an example for how to build a minimal application depending on LibTorch, may be +found by following `this `_ link. diff --git a/docs/cpp/source/index.rst b/docs/cpp/source/index.rst index 2743c3ea650b4e..5fef739c975518 100644 --- a/docs/cpp/source/index.rst +++ b/docs/cpp/source/index.rst @@ -1,36 +1,168 @@ PyTorch C++ API =============== -The PyTorch C++ API is a research and production ready C++ interface to PyTorch, -a library for tensors and dynamic neural networks with strong GPU acceleration. +These pages provide documentation for the public portions of the PyTorch C++ +API. This API can roughly be divided into five parts: -Description +- **ATen**: The foundational tensor and mathematical operation library on which all else is built; +- **Autograd**: Augments ATen with automatic differentiation; +- **C++ Frontend**: High level constructs for training and evaluation of machine learning models; +- **TorchScript**: An interface to the TorchScript JIT compiler and interpreter; +- **C++ Extensions**: A means of extending the Python API with custom C++ and CUDA routines. + +Together, these building blocks form a research and +production ready C++ library for tensor computation and dynamic neural +networks with strong emphasis on GPU acceleration as well as fast CPU +performance. It is currently in use at Facebook in research and +production; we look forward to welcoming more users of the PyTorch C++ API. + +.. warning:: + + At the moment, the C++ API should be considered "beta" stability; we may + make major breaking changes to the backend in order to improve the API, + or in service of providing the Python interface to PyTorch, which is our + most stable and best supported interface. + +ATen +---- + +ATen is fundamentally a tensor library, on top of which almost all other Python +and C++ interfaces in PyTorch are built. It provides a core ``Tensor`` class, +on which many hundreds of operations are defined. Most of these operations have +both CPU and GPU implementations, to which the ``Tensor`` class will +dynamically dispatch based on its type. A small example of using ATen could +look as follows: + +.. code-block:: cpp + + #include + + at::Tensor a = at::ones({2, 2}, at::kInt); + at::Tensor b = at::randn({2, 2}); + auto c = a + b.to(at::kInt); + +This ``Tensor`` class and all other symbols in ATen are found in the `at::` +namespace, documented +`here `_. + +Autograd +-------- + +What we term *autograd* are the portions of PyTorch's C++ API that augment the +ATen ``Tensor`` class with capabilities concerning automatic differentiation. +The autograd system records operations on tensors to form an *autograd graph*. +Calling ``backwards()`` on a leaf variable in this graph performs reverse mode +differentiation through the network of functions and tensors spanning the +autograd graph, ultimately yieldings gradients. The following example provides +a taste of this interface: + +.. code-block:: cpp + + #include + #include + + at::Tensor a = torch::ones({2, 2}, at::requires_grad()); + at::Tensor b = torch::randn({2, 2}); + auto c = a + b; + c.backward(); // a.grad() will now hold the gradient of c w.r.t. a. + +The ``at::Tensor`` class in ATen is not differentiable by default. To add the +differentiability of tensors the autograd API provides, you must use tensor +factory functions from the `torch::` namespace instead of the `at` namespace. 
+For example, while a tensor created with `at::ones` will not be differentiable, +a tensor created with `torch::ones` will be. + +C++ Frontend +------------ + +The PyTorch C++ frontend provides a high level, pure C++ modeling interface for +neural network and general machine learning research and production use cases, +largely following the Python API in design and provided functionality. The C++ +frontend includes the following: + +- An interface for defining machine learning models through a hierarchical module system (like ``torch.nn.Module``); +- A "standard library" of pre-existing modules for the most common modeling purposes (e.g. convolutions, RNNs, batch normalization etc.); +- An optimization API, including implementations of popular optimizers such as SGD, Adam, RMSprop and others; +- A means of representing datasets and data pipelines, including functionality to load data in parallel over many CPU cores; +- A serialization format for storing and loading checkpoints of a training session (like ``torch.utils.data.DataLoader``); +- Automatic parallelization of models onto multiple GPUs (like ``torch.nn.parallel.DataParallel``); +- Support code to easily bind C++ models into Python using pybind11; +- Entry points to the TorchScript JIT compiler; +- Helpful utilities to facilitate interfacing with the ATen and Autograd APIs. + +See `this `_ document for a more +detailed description of the C++ frontend. Relevant sections of the `torch::` +namespace related to the C++ Frontend include `torch::nn +`_, +`torch::optim +`_, +`torch::data +`_, +`torch::serialize +`_, +`torch::jit +`_ +and `torch::python +`_. +Examples of the C++ frontend can be found in `this repository +`_ which is being +expanded on a continuous and active basis. + +.. note:: + + Unless you have a particular reason to constrain yourself exclusively to ATen + or the Autograd API, the C++ frontend is the recommended entry point to the + PyTorch C++ ecosystem. While it is still in beta as we collect user feedback + (from you!), it provides both more functionality and better stability + guarantees than the ATen and Autograd APIs. + +TorchScript ----------- -The PyTorch C++ API provides all the major building blocks to research and iterate on -state of the art machine learning models with a user friendly modern C++ interface, -as well as providing an excellent platform for deploying machine learning applications -in bare bones, high performance environments. +TorchScript a representation of a PyTorch model that can be understood, +compiled and serialized by the TorchScript compiler. Fundamentally, TorchScript +is a programming language in its own right. It is a subset of Python using +the PyTorch API. The C++ interface to TorchScript encompasses three primary pieces of +functionality: -1. Design Philosophy -2. Description of components -3. One small example +- A mechanism for loading and executing serialized TorchScript models defined in Python; +- An API for defining custom operators that extend the TorchScript standard library of operations; +- Just-in-time compilation of TorchScript programs from C++. -License -------- +The first mechanism may be of great interest to you if you would like to define +your models in Python as much as possible, but subsequently export them to C++ +for production environments and no-Python inference. You can find out more +about this by following `this +`_ link. 
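To make the first mechanism concrete, here is a minimal sketch of loading and running a serialized TorchScript model from C++. It assumes a recent LibTorch in which ``torch::jit::load`` returns a ``torch::jit::Module`` by value; the file name ``model.pt`` and the input shape are placeholders.

.. code-block:: cpp

    #include <torch/script.h>
    #include <iostream>
    #include <vector>

    int main() {
      // Load a module previously exported from Python, e.g. with
      // torch.jit.trace(model, example).save("model.pt").
      torch::jit::Module module = torch::jit::load("model.pt");

      // Pack the arguments and invoke the module's forward method.
      std::vector<torch::jit::IValue> inputs;
      inputs.push_back(torch::ones({1, 3, 224, 224}));
      at::Tensor output = module.forward(inputs).toTensor();

      std::cout << output.sizes() << std::endl;
    }

No Python interpreter is involved at any point, which is what makes this path suitable for no-Python production inference.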
The second +API concerns itself with scenarios in which you would like to extend +TorchScript with custom operators, which can similarly be serialized and +invoked from C++ during inference. Lastly, the `torch::jit::compile +`_ +function may be used to access the TorchScript compiler directly from C++. +C++ Extensions +-------------- +*C++ Extensions* offer a simple yet powerful way of accessing all of the above +interfaces for the purpose of extending regular Python use-cases of PyTorch. +C++ extensions are most commonly used to implement custom operators in C++ or +CUDA to accelerate research in vanilla PyTorch setups. The C++ extension API +does not add any new functionality to the PyTorch C++ API. Instead, it +provides integration with Python setuptools as well as JIT compilation +mechanisms that allow access to ATen, the autograd and other C++ APIs from +Python. To learn more about the C++ extension API, see +`this `_ tutorial. Contents -======== +-------- .. toctree:: :maxdepth: 2 - api/library_root - examples - building + frontend + installing contributing + api/library_root Indices and tables diff --git a/docs/cpp/source/installing.rst b/docs/cpp/source/installing.rst new file mode 100644 index 00000000000000..24906dbb53391a --- /dev/null +++ b/docs/cpp/source/installing.rst @@ -0,0 +1,131 @@ +Installing C++ Distributions of PyTorch +======================================= + +We provide binary distributions of all headers, libraries and CMake +configuration files required to depend on PyTorch. We call this distribution +*LibTorch*, and you can download ZIP archives containing the latest LibTorch +distribution on `our website `_. Below +is a small example of writing a minimal application that depends on LibTorch +and uses the `at::Tensor` class which comes with the PyTorch C++ API. + +Minimal Example +--------------- + +The first step is to download the LibTorch ZIP archive via the link above. For +example: + +.. code-block:: sh + + wget https://download.pytorch.org/libtorch/nightly/cpu/libtorch-shared-with-deps-latest.zip + unzip libtorch-shared-with-deps-latest.zip + + +Next, we can write a minimal CMake build configuration to develop a small +application that depends on LibTorch. CMake is not a hard requirement for using +LibTorch, but it is the recommended and blessed build system and will be well +supported into the future. A most basic `CMakeLists.txt` file could look like +this: + +.. code-block:: cmake + + cmake_minimum_required(VERSION 3.0 FATAL_ERROR) + project(example-app) + + find_package(Torch REQUIRED) + + add_executable(example-app example-app.cpp) + target_link_libraries(example-app "${TORCH_LIBRARIES}") + set_property(TARGET example-app PROPERTY CXX_STANDARD 11) + +The implementation of our example will simply create a new `at::Tensor` and +print it: + +.. code-block:: cpp + + #include + #include + + int main() { + at::Tensor tensor = torch::rand({2, 3}); + std::cout << tensor << std::endl; + } + +While there are more fine-grained headers you can include to access only parts +of the PyTorch C++ API, including `torch/torch.h` is the most sure-proof way of +including most of its functionality. + +The last step is to build the application. For this, assume our example +directory is laid out like this: + +.. code-block:: sh + + example-app/ + CMakeLists.txt + example-app.cpp + +We can now run the following commands to build the application from within the +``example-app/`` folder: + +.. 
code-block:: sh + + mkdir build + cd build + cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. + make + +where ``/path/to/libtorch`` should be the full path to the unzipped LibTorch +distribution. If all goes well, it will look something like this: + +.. code-block:: sh + + root@4b5a67132e81:/example-app# mkdir build + root@4b5a67132e81:/example-app# cd build + root@4b5a67132e81:/example-app/build# cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. + -- The C compiler identification is GNU 5.4.0 + -- The CXX compiler identification is GNU 5.4.0 + -- Check for working C compiler: /usr/bin/cc + -- Check for working C compiler: /usr/bin/cc -- works + -- Detecting C compiler ABI info + -- Detecting C compiler ABI info - done + -- Detecting C compile features + -- Detecting C compile features - done + -- Check for working CXX compiler: /usr/bin/c++ + -- Check for working CXX compiler: /usr/bin/c++ -- works + -- Detecting CXX compiler ABI info + -- Detecting CXX compiler ABI info - done + -- Detecting CXX compile features + -- Detecting CXX compile features - done + -- Looking for pthread.h + -- Looking for pthread.h - found + -- Looking for pthread_create + -- Looking for pthread_create - not found + -- Looking for pthread_create in pthreads + -- Looking for pthread_create in pthreads - not found + -- Looking for pthread_create in pthread + -- Looking for pthread_create in pthread - found + -- Found Threads: TRUE + -- Configuring done + -- Generating done + -- Build files have been written to: /example-app/build + root@4b5a67132e81:/example-app/build# make + Scanning dependencies of target example-app + [ 50%] Building CXX object CMakeFiles/example-app.dir/example-app.cpp.o + [100%] Linking CXX executable example-app + [100%] Built target example-app + +Executing the resulting ``example-app`` binary found in the ``build`` folder +should now merrily print the tensor (exact output subject to randomness): + +.. code-block:: sh + + root@4b5a67132e81:/example-app/build# ./example-app model.pt + 0.2063 0.6593 0.0866 + 0.0796 0.5841 0.1569 + [ Variable[CPUFloatType]{2,3} ] + +Support +------- + +If you run into any troubles with this installation and minimal usage guide, +please use our `forum `_ or `GitHub issues +`_ to get in touch. diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 68420d837bf801..a0d3abfa5501ad 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -1173,6 +1173,11 @@ Distance functions .. autofunction:: cosine_similarity +:hidden:`pdist` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pdist + Loss functions -------------- diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index 63047bb11fddff..212f68e694d7f9 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -74,9 +74,10 @@ You can force synchronous computation by setting environment variable operation is actually executed, so the stack trace does not show where it was requested.) -As an exception, several functions such as :meth:`~torch.Tensor.copy_` admit -an explicit :attr:`async` argument, which lets the caller bypass synchronization -when it is unnecessary. Another exception is CUDA streams, explained below. +As an exception, several functions such as :meth:`~torch.Tensor.to` and +:meth:`~torch.Tensor.copy_` admit an explicit :attr:`non_blocking` argument, +which lets the caller bypass synchronization when it is unnecessary. +Another exception is CUDA streams, explained below. 
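The same ``non_blocking`` flag is exposed in the C++ API through ``Tensor::copy_`` and ``Tensor::to``. Below is a minimal C++ sketch, assuming a recent LibTorch built with CUDA; the tensor shapes and the final ``torch::cuda::synchronize()`` call are only illustrative.

.. code-block:: cpp

    #include <torch/torch.h>

    int main() {
      if (!torch::cuda::is_available()) {
        return 0;
      }

      // Page-locked (pinned) host memory is what allows the copy below to
      // overlap with host-side work instead of blocking on it.
      torch::Tensor host = torch::randn({1024, 1024}).pin_memory();
      torch::Tensor gpu = torch::empty({1024, 1024}, torch::device(torch::kCUDA));

      // The copy is queued on the current CUDA stream and returns immediately.
      gpu.copy_(host, /*non_blocking=*/true);

      // ... unrelated host work could run here while the transfer is in flight ...

      // Synchronize before depending on the transferred data from the host.
      torch::cuda::synchronize();
    }

Without pinned host memory the call still succeeds, but the runtime may fall back to a copy that synchronizes with the host.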
CUDA streams ^^^^^^^^^^^^ diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst index a329bb049baac3..71dcaa7511fa26 100644 --- a/docs/source/sparse.rst +++ b/docs/source/sparse.rst @@ -110,6 +110,7 @@ An empty sparse tensor can be constructed by specifying its size: .. method:: mm .. method:: mul .. method:: mul_ + .. method:: narrow_copy .. method:: resizeAs_ .. method:: size .. method:: spadd diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 31585d4a969770..1d55fa8b937738 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -223,6 +223,7 @@ Reduction Ops Comparison Ops ~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: allclose +.. autofunction:: argsort .. autofunction:: eq .. autofunction:: equal .. autofunction:: ge @@ -256,6 +257,7 @@ Spectral Ops Other Operations ~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: bincount +.. autofunction:: broadcast_tensors .. autofunction:: cross .. autofunction:: diag .. autofunction:: diagflat diff --git a/setup.py b/setup.py index 94455ed1cf7be7..a8cdac91e92369 100644 --- a/setup.py +++ b/setup.py @@ -1202,6 +1202,8 @@ def make_relative_rpath(path): 'lib/include/caffe2/utils/*.h', 'lib/include/c10/*.h', 'lib/include/c10/macros/*.h', + 'lib/include/c10/util/*.h', + 'lib/include/caffe2/core/*.h', 'lib/include/torch/*.h', 'lib/include/torch/csrc/*.h', 'lib/include/torch/csrc/api/include/torch/*.h', diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt index f692bdfae123b9..059b004a84840e 100644 --- a/test/custom_operator/CMakeLists.txt +++ b/test/custom_operator/CMakeLists.txt @@ -5,7 +5,6 @@ project(custom_ops) find_package(Torch REQUIRED) add_library(custom_ops SHARED op.cpp) -target_compile_features(custom_ops PUBLIC cxx_range_for) target_link_libraries(custom_ops ${TORCH_LIBRARIES}) add_executable(test_custom_ops test_custom_ops.cpp) diff --git a/test/expect/TestBatched.test_for.expect b/test/expect/TestBatched.test_for.expect index bcbcffaee486a3..8932957402c94e 100644 --- a/test/expect/TestBatched.test_for.expect +++ b/test/expect/TestBatched.test_for.expect @@ -6,17 +6,17 @@ graph(%x.1_data : Dynamic %y_dims : Dynamic) { %6 : int = prim::Constant[value=10]() %7 : int = prim::Constant[value=1]() - %x : Dynamic, %21 : Dynamic, %22 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims) + %x : Dynamic, %9 : Dynamic, %10 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims) block0(%loop_num : int, %5_data : Dynamic, %5_mask : Dynamic, %5_dims : Dynamic) { - %13 : int = prim::Constant[value=1]() - %14 : Long() = prim::NumToTensor(%13) - %alpha : float = prim::TensorToNum(%14) + %15 : int = prim::Constant[value=1]() + %16 : Long() = prim::NumToTensor(%15) + %alpha : float = prim::TensorToNum(%16) %data.1 : Dynamic = aten::add(%5_data, %y_data, %alpha) %mask : Dynamic = aten::mul(%5_mask, %y_mask) %dims : Dynamic = aten::__or__(%5_dims, %y_dims) - %19 : int = prim::Constant[value=1]() + %21 : int = prim::Constant[value=1]() %data : Dynamic = aten::where(%mask, %data.1, %5_data) - -> (%19, %data, %mask, %dims) + -> (%21, %data, %mask, %dims) } - return (%x, %21, %22); + return (%x, %9, %10); } diff --git a/test/expect/TestBatched.test_while.expect b/test/expect/TestBatched.test_while.expect index 66e3cdb6a2dfa8..7aba7a89ace320 100644 --- a/test/expect/TestBatched.test_while.expect +++ b/test/expect/TestBatched.test_while.expect @@ -14,34 +14,34 @@ graph(%a.1_data : Dynamic %13 : Dynamic = aten::sum(%12) %14 : Dynamic = aten::gt(%13, %11) %15 : int = prim::TensorToNum(%14) - %63 : 
Dynamic, %64 : Dynamic, %65 : Dynamic, %a : Dynamic, %61 : Dynamic, %62 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) + %16 : Dynamic, %17 : Dynamic, %18 : Dynamic, %a : Dynamic, %20 : Dynamic, %21 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { - %24 : int = prim::Constant[value=1]() - %25 : Long() = prim::NumToTensor(%24) - %alpha : float = prim::TensorToNum(%25) + %29 : int = prim::Constant[value=1]() + %30 : Long() = prim::NumToTensor(%29) + %alpha : float = prim::TensorToNum(%30) %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) %mask : Dynamic = aten::mul(%6_mask, %b_mask) %dims : Dynamic = aten::__or__(%6_dims, %b_dims) - %30 : Dynamic = aten::gt(%data.1, %b_data) - %31 : Dynamic = aten::mul(%mask, %b_mask) - %32 : Dynamic = aten::__or__(%dims, %b_dims) - %33 : int = prim::TensorToNum(%30) - %34 : int = prim::Constant[value=1]() - %35 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) - %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %35) - %37 : int = aten::dim(%cond_mask.1) - %38 : int = aten::eq(%37, %34) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%38) + %35 : Dynamic = aten::gt(%data.1, %b_data) + %36 : Dynamic = aten::mul(%mask, %b_mask) + %37 : Dynamic = aten::__or__(%dims, %b_dims) + %38 : int = prim::TensorToNum(%35) + %39 : int = prim::Constant[value=1]() + %40 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) + %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %40) + %42 : int = aten::dim(%cond_mask.1) + %43 : int = aten::eq(%42, %39) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%43) block0() { - %42 : int = aten::dim(%data.1) - %43 : int = aten::sub(%42, %34) - %44 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%43, %44, %cond_mask.1) - block0(%_ : int, %47 : Dynamic) { - %48 : int = aten::dim(%47) - %data.2 : Dynamic = aten::unsqueeze(%47, %48) - %50 : int = prim::Constant[value=1]() - -> (%50, %data.2) + %47 : int = aten::dim(%data.1) + %48 : int = aten::sub(%47, %39) + %49 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%48, %49, %cond_mask.1) + block0(%_ : int, %52 : Dynamic) { + %53 : int = aten::dim(%52) + %data.2 : Dynamic = aten::unsqueeze(%52, %53) + %55 : int = prim::Constant[value=1]() + -> (%55, %data.2) } %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) @@ -53,12 +53,12 @@ graph(%a.1_data : Dynamic %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) %res_dims : Dynamic = aten::__or__(%dims, %6_dims) - %56 : int = prim::Constant[value=0]() - %57 : Dynamic = aten::mul(%30, %31) - %58 : Dynamic = aten::sum(%57) - %59 : Dynamic = aten::gt(%58, %56) - %60 : int = prim::TensorToNum(%59) - -> (%60, %30, %31, %32, %res_data, %res_mask, %res_dims) + %61 : int = prim::Constant[value=0]() + %62 : Dynamic = aten::mul(%35, %36) + %63 : Dynamic = aten::sum(%62) + %64 : Dynamic = aten::gt(%63, %61) + %65 : int = prim::TensorToNum(%64) + -> (%65, %35, %36, %37, %res_data, %res_mask, %res_dims) } - return (%a, %61, %62); + return (%a, %20, %21); } diff --git a/test/expect/TestJit.test_constant_prop_print.expect b/test/expect/TestJit.test_constant_prop_print.expect index 5bc86daf4765c7..6f72acc4c8483e 100644 --- 
a/test/expect/TestJit.test_constant_prop_print.expect +++ b/test/expect/TestJit.test_constant_prop_print.expect @@ -2,6 +2,7 @@ graph(%input_tensor : Dynamic) { %1 : int = prim::Constant[value=6]() = prim::Print(%1) %2 : int = prim::Constant[value=8]() - %3 : Dynamic = aten::add(%2, %input_tensor) - return (%3); + %3 : int = prim::Constant[value=1]() + %4 : Dynamic = aten::add(%input_tensor, %2, %3) + return (%4); } diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect index 029f9ac05a0783..71cf099a54a663 100644 --- a/test/expect/TestJit.test_constant_prop_simple.expect +++ b/test/expect/TestJit.test_constant_prop_simple.expect @@ -1,5 +1,6 @@ graph(%input_tensor : Dynamic) { %1 : int = prim::Constant[value=8]() - %2 : Dynamic = aten::add(%1, %input_tensor) - return (%2); + %2 : int = prim::Constant[value=1]() + %3 : Dynamic = aten::add(%input_tensor, %1, %2) + return (%3); } diff --git a/test/expect/TestScript.test_call_script_fn_from_script_fn.expect b/test/expect/TestScript.test_call_script_fn_from_script_fn.expect index bed05b89580c6c..b2159144f798fc 100644 --- a/test/expect/TestScript.test_call_script_fn_from_script_fn.expect +++ b/test/expect/TestScript.test_call_script_fn_from_script_fn.expect @@ -1,7 +1,7 @@ graph(%x : Dynamic) { - %2 : int = prim::Constant[value=1]() - %1 : Dynamic = aten::neg(%x) + %1 : int = prim::Constant[value=1]() + %2 : Dynamic = aten::neg(%x) %3 : int = prim::Constant[value=1]() - %4 : Dynamic = aten::add(%1, %2, %3) + %4 : Dynamic = aten::add(%2, %1, %3) return (%4); } diff --git a/test/expect/TestScript.test_call_script_mod_from_script_fn.expect b/test/expect/TestScript.test_call_script_mod_from_script_fn.expect index b7492626b5d8fe..3478376f829c85 100644 --- a/test/expect/TestScript.test_call_script_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_script_mod_from_script_fn.expect @@ -1,14 +1,14 @@ graph(%x : Dynamic) { - %9 : int = prim::Constant[value=1]() - %1 : int = prim::Constant[value=3]() - %2 : int = prim::Constant[value=4]() - %3 : int[] = prim::ListConstruct(%2, %1) - %4 : int = prim::Constant[value=6]() - %5 : int = prim::Constant[value=0]() - %6 : int[] = prim::Constant[value=[0, -1]]() - %7 : Dynamic = aten::zeros(%3, %4, %5, %6) - %8 : Dynamic = aten::mm(%x, %7) + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=3]() + %3 : int = prim::Constant[value=4]() + %4 : int[] = prim::ListConstruct(%3, %2) + %5 : int = prim::Constant[value=6]() + %6 : int = prim::Constant[value=0]() + %7 : int[] = prim::Constant[value=[0, -1]]() + %8 : Dynamic = aten::zeros(%4, %5, %6, %7) + %9 : Dynamic = aten::mm(%x, %8) %10 : int = prim::Constant[value=1]() - %11 : Dynamic = aten::add(%8, %9, %10) + %11 : Dynamic = aten::add(%9, %1, %10) return (%11); } diff --git a/test/expect/TestScript.test_call_script_mod_from_script_module.expect b/test/expect/TestScript.test_call_script_mod_from_script_module.expect index 5cae9dcdf96e9d..0365ff600b0a20 100644 --- a/test/expect/TestScript.test_call_script_mod_from_script_module.expect +++ b/test/expect/TestScript.test_call_script_mod_from_script_module.expect @@ -1,7 +1,7 @@ graph(%x : Dynamic %1 : Dynamic - %3 : Dynamic) { - %2 : Dynamic = aten::mm(%x, %1) - %4 : Dynamic = aten::mm(%2, %3) + %2 : Dynamic) { + %3 : Dynamic = aten::mm(%x, %1) + %4 : Dynamic = aten::mm(%3, %2) return (%4); } diff --git a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect 
b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect index cbdbc744b5e85d..3674a3fbc07d2b 100644 --- a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect @@ -56,8 +56,8 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %23 : Float(*, *) = aten::neg(%3) %24 : int = prim::Constant[value=1]() %25 : Float(*, *) = aten::add(%23, %24, %24) - %26 : Float(*, *) = aten::mul(%19, %3) - %27 : Float(*, *) = aten::mul(%26, %25) + %26 : Float(*, *) = aten::mul(%25, %3) + %27 : Float(*, *) = aten::mul(%26, %19) %28 : Float(*, *) = aten::mul(%2, %2) %29 : Float(*, *) = aten::neg(%28) %30 : int = prim::Constant[value=1]() @@ -66,13 +66,13 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %33 : Float(*, *) = aten::neg(%1) %34 : int = prim::Constant[value=1]() %35 : Float(*, *) = aten::add(%33, %34, %34) - %36 : Float(*, *) = aten::mul(%22, %1) - %37 : Float(*, *) = aten::mul(%36, %35) + %36 : Float(*, *) = aten::mul(%35, %1) + %37 : Float(*, *) = aten::mul(%36, %22) %38 : Float(*, *) = aten::neg(%0) %39 : int = prim::Constant[value=1]() %40 : Float(*, *) = aten::add(%38, %39, %39) - %41 : Float(*, *) = aten::mul(%20, %0) - %42 : Float(*, *) = aten::mul(%41, %40) + %41 : Float(*, *) = aten::mul(%40, %0) + %42 : Float(*, *) = aten::mul(%41, %20) %43 : Float(*, *) = prim::FusedConcat[dim=1](%42, %37, %32, %27) return (%43, %18); } diff --git a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect index b0dc85644751d8..fb14a35296623a 100644 --- a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect @@ -62,8 +62,8 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %20 : Float(*, *) = aten::neg(%3) %21 : int = prim::Constant[value=1]() %22 : Float(*, *) = aten::add(%20, %21, %21) - %23 : Float(*, *) = aten::mul(%8, %3) - %24 : Float(*, *) = aten::mul(%23, %22) + %23 : Float(*, *) = aten::mul(%22, %3) + %24 : Float(*, *) = aten::mul(%23, %8) %25 : Float(*, *) = aten::mul(%2, %2) %26 : Float(*, *) = aten::neg(%25) %27 : int = prim::Constant[value=1]() @@ -72,13 +72,13 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %30 : Float(*, *) = aten::neg(%1) %31 : int = prim::Constant[value=1]() %32 : Float(*, *) = aten::add(%30, %31, %31) - %33 : Float(*, *) = aten::mul(%19, %1) - %34 : Float(*, *) = aten::mul(%33, %32) + %33 : Float(*, *) = aten::mul(%32, %1) + %34 : Float(*, *) = aten::mul(%33, %19) %35 : Float(*, *) = aten::neg(%0) %36 : int = prim::Constant[value=1]() %37 : Float(*, *) = aten::add(%35, %36, %36) - %38 : Float(*, *) = aten::mul(%17, %0) - %39 : Float(*, *) = aten::mul(%38, %37) + %38 : Float(*, *) = aten::mul(%37, %0) + %39 : Float(*, *) = aten::mul(%38, %17) %40 : Float(*, *) = prim::FusedConcat[dim=1](%39, %34, %29, %24) return (%40); } diff --git a/test/expect/TestScript.test_scalar_fusion.expect b/test/expect/TestScript.test_scalar_fusion.expect index e2fd92a0f5739c..565855f262d16c 100644 --- a/test/expect/TestScript.test_scalar_fusion.expect +++ b/test/expect/TestScript.test_scalar_fusion.expect @@ -6,7 +6,7 @@ graph(%x : Float() with prim::FusionGroup_0 = graph(%0 : Float() %1 : Float()) { %2 : int = prim::Constant[value=2]() - %3 : Float() = aten::mul(%2, %1) + %3 : Float() = aten::mul(%1, %2) %4 : int = prim::Constant[value=1]() %5 : Float() = aten::add(%3, %0, %4) return (%5); diff --git a/test/onnx/expect/TestOperators.test_full.expect 
b/test/onnx/expect/TestOperators.test_full.expect new file mode 100644 index 00000000000000..db975329ffa7b5 --- /dev/null +++ b/test/onnx/expect/TestOperators.test_full.expect @@ -0,0 +1,148 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.4" +graph { + node { + output: "1" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: INT64 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "0" + output: "2" + op_type: "Shape" + } + node { + input: "2" + input: "1" + output: "3" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + output: "4" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: INT64 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "0" + output: "5" + op_type: "Shape" + } + node { + input: "5" + input: "4" + output: "6" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "3" + output: "7" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 0 + type: INTS + } + } + node { + input: "6" + output: "8" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 0 + type: INTS + } + } + node { + input: "7" + input: "8" + output: "9" + op_type: "Concat" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "9" + output: "10" + op_type: "ConstantFill" + attribute { + name: "dtype" + i: 1 + type: INT + } + attribute { + name: "input_as_shape" + i: 1 + type: INT + } + attribute { + name: "value" + f: 2 + type: FLOAT + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "10" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } +} +opset_import { + version: 9 +} diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index 63f1f3cc563951..f11abcc7e6ef3b 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -3,8 +3,8 @@ producer_name: "pytorch" producer_version: "0.4" graph { node { - input: "1" input: "0" + input: "1" output: "2" op_type: "Less" } diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index fb36f3449f2664..b50002eacbaf3b 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -3,8 +3,8 @@ producer_name: "pytorch" producer_version: "0.4" graph { node { - input: "1" input: "0" + input: "1" output: "2" op_type: "Greater" } diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 2dfdd409a15ce7..1e2d0ffb294219 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -287,6 +287,10 @@ def test_hardtanh(self): x = Variable(torch.randn(3, 4), requires_grad=True) self.assertONNX(lambda x: torch.nn.Hardtanh(-0.5, 0.5)(x), x) + def test_full(self): + x = torch.randn(3, 4, requires_grad=True) + self.assertONNX(lambda x: torch.full(x.shape, 2), x) + def test_max(self): x = Variable(torch.randn(3, 4), requires_grad=True) y = Variable(torch.randn(3, 4), requires_grad=True) diff --git a/test/test_autograd.py b/test/test_autograd.py index f9ccfb6c958e99..0642e87399c676 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1406,6 +1406,7 @@ def test_unused_output(self): 
expected_grad[:2] = grad_output self.assertEqual(x.grad.data, expected_grad) + @skipIfRocm def test_ctc_loss(self): batch_size = 64 num_labels = 101 diff --git a/test/test_cuda.py b/test/test_cuda.py index cdf8d46ce236cf..2c647b08cbd601 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -268,11 +268,11 @@ def tmp(t): ('div', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, "skipIfRocm:ByteTensor,CharTensor,FloatTensor,HalfTensor,ShortTensor"), ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1', types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2', types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3', types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types, False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types), + ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1'), + ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2'), + ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3'), + ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types), # HalfTensor gives bad result at pow-2 with data sampled from torch.randn ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half, False, "skipIfRocm:HalfTensor,FloatTensor"), diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 020486c1fbda35..3d9af20c859658 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -371,6 +371,7 @@ def test_segfault(self): finally: p.terminate() + @skipIfRocm def test_timeout(self): p = ErrorTrackingProcess(target=_test_timeout) p.start() diff --git a/test/test_distributions.py b/test/test_distributions.py index 5fbc2003be27e4..2c489d858c1238 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -542,6 +542,12 @@ def is_all_nan(tensor): 'scale': torch.tensor([1., -1.], requires_grad=True), }, ]), + Example(MultivariateNormal, [ + { + 'loc': torch.tensor([1., 1.], requires_grad=True), + 'covariance_matrix': torch.tensor([[1.0, 0.0], [0.0, -2.0]], requires_grad=True), + }, + ]), Example(Normal, [ { 'loc': torch.tensor([1., 1.], requires_grad=True), @@ -2372,12 +2378,20 @@ def test_valid_parameter_broadcasting(self): (1, 2)), (StudentT(df=torch.tensor([1.]), scale=torch.tensor([[1.]])), (1, 1)), + (StudentT(df=1., loc=torch.zeros(5, 1), scale=torch.ones(3)), + (5, 3)), ] for dist, expected_size in valid_examples: - dist_sample_size = dist.sample().size() - self.assertEqual(dist_sample_size, expected_size, - 'actual size: {} != expected size: {}'.format(dist_sample_size, expected_size)) + actual_size = dist.sample().size() + self.assertEqual(actual_size, expected_size, + '{} actual size: {} != expected size: {}'.format(dist, actual_size, expected_size)) + + sample_shape = torch.Size((2,)) + expected_size = sample_shape + expected_size + actual_size = dist.sample(sample_shape).size() + self.assertEqual(actual_size, expected_size, + '{} actual size: {} != expected size: {}'.format(dist, actual_size, expected_size)) def test_invalid_parameter_broadcasting(self): # invalid broadcasting cases; should throw error diff --git a/test/test_jit.py b/test/test_jit.py index a448362b470bbf..24d8076d31365f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -240,14 
+240,10 @@ def getExportImportCopy(self, m): imported = torch.jit.load(f.name) finally: os.unlink(f.name) - f = tempfile.NamedTemporaryFile(delete=False) - try: - f.close() - imported.save(f.name) - imported = torch.jit.load(f.name) - finally: - os.unlink(f.name) - return imported + buffer = io.BytesIO() + torch.jit.save(imported, buffer) + buffer.seek(0) + return torch.jit.load(buffer) def assertGraphContains(self, graph, kind): self.assertTrue(any(n.kind() == kind for n in graph.nodes())) @@ -2230,7 +2226,7 @@ def single_if(a, b): script_if = torch.jit.script(single_if) graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_if_else_with_scalar(self): def single_if(a, b): @@ -2250,7 +2246,7 @@ def single_if(a, b): script_if = torch.jit.script(single_if) graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_if_noelse(self): def single_if(a, b): @@ -2268,7 +2264,7 @@ def single_if(a, b): script_if = torch.jit.script(single_if) graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_if_noelse_with_scalar(self): def single_if(a, b): @@ -2286,7 +2282,7 @@ def single_if(a, b): script_if = torch.jit.script(single_if) graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_while(self): def single_while(a, b): @@ -2305,7 +2301,7 @@ def single_while(a, b): script_while = torch.jit.script(single_while) graph = torch.to_batch_graph(script_while.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_for(self): def single_for(x, y): @@ -2323,7 +2319,7 @@ def single_for(x, y): script_for = torch.jit.script(single_for) graph = torch.to_batch_graph(script_for.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_lstm(self): def LSTM(x_all, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): @@ -2645,18 +2641,23 @@ def stuff3(x): return torch.ones(x), x self.checkScript(stuff3, ([3, 2],)) - def test_nested_list_error(self): - with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): - @torch.jit.script - def foo(x): - # type: (Tuple[List[List[int]]]) -> int - return 4 + def test_nested_list(self): + def foo(z): + # type: (Tuple[int, List[List[int]]]) -> int + x, y = z + return y[0][1] + self.checkScript(foo, ((1, [[1, 2], [3, 4]]),)) + + def test_nested_list_construct(self): + def foo(): + return [[4]] + [[4, 5]] + self.checkScript(foo, ()) - def test_nested_list_construct_error(self): - with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): + def test_generic_list_errors(self): + with self.assertRaisesRegex(RuntimeError, "previously matched to type"): @torch.jit.script def foo(x): - return [[4]] + return [[x]] + [[1]] def test_script_cu(self): cu = torch.jit.CompilationUnit(''' @@ -2723,18 +2724,23 @@ def func(a, b): @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") @skipIfRocm def test_clamp_fusion(self): - def func(a, b): + def func2(a, b): return torch.clamp(a + b, min=0, max=2) + def funcInf(a, b): + return torch.clamp(a + b, min=0, max=float('inf')) + a = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True) b = torch.randn(4, 4, dtype=torch.float, device='cuda') - s = self.checkScript(func, (a, b)) - self.assertAllFused(s.graph_for(a, b)) + funcs = (func2, funcInf) + for 
f in funcs: + s = self.checkScript(f, (a, b)) + self.assertAllFused(s.graph_for(a, b)) - c = s(a, b) - c.sum().backward() - self.assertAllFused(backward_graph(s)) + c = s(a, b) + c.sum().backward() + self.assertAllFused(backward_graph(s)) def test_mul(self): def func(a, b): @@ -3448,6 +3454,80 @@ def test_over_slice(): return a[3:10] == [3, 4] self.checkScript(test_backward_slice, ()) + def test_mutable_list(self): + def test_append(): + a = [0, 1] + a.append(2) + a.append(3) + return a == [0, 1, 2, 3] + self.checkScript(test_append, ()) + + def test_append_2(): + a = [0, 1] + a.append(2) + a = [1] + a.append(4) + return a == [1, 4] + self.checkScript(test_append_2, ()) + + def test_append_if(): + a = [1] + if True: + a.append(4) + return a == [1, 4] + self.checkScript(test_append_if, ()) + + def test_append_if_else(): + a = [1] + if False: + a.append(4) + else: + a.append(10) + return a == [1, 10] + self.checkScript(test_append_if_else, ()) + + def test_append_loop(): + a = _construct_empty_int_list() + for i in range(5): + a.append(i) + + return a == [0, 1, 2, 3, 4] + self.checkScript(test_append_loop, ()) + + def test_append_loop_if(): + a = _construct_empty_int_list() + for i in range(5): + if i > 3: + a.append(i) + else: + a.append(0) + + return a == [0, 0, 0, 0, 4] + self.checkScript(test_append_loop_if, ()) + + def test_nested_loop(): + a = _construct_empty_int_list() + for i in range(2): + for j in range(2): + a.append(i + j) + + return a == [0, 1, 1, 2] + self.checkScript(test_append_loop_if, ()) + + def test_mutable_list_function_inline(self): + @torch.jit.script + def bar(y): + # type: (List[int]) -> List[int] + y.append(4) + + @torch.jit.script + def foo(): + x = [1, 2, 3] + bar(x) + return x + + self.assertEqual(foo(), [1, 2, 3, 4]) + def test_func_call(self): script = ''' def add(a, b): @@ -4845,7 +4925,6 @@ def bar(): bar() def test_tuples(self): - @torch.jit.script def foo(i): a = (i + 4, i * 2) c = a @@ -4857,10 +4936,12 @@ def foo(i): while False: t0, t1 = c c = (t1, t0) - return t0 + x = (1,) + y = 1, + return t0, x, y v = torch.rand(10, 3) - self.assertEqual(v * 9, foo(v)) + self.checkScript(foo, (v,)) with self.assertRaisesRegex(RuntimeError, r"variable 'a' previously has type \(Tensor, Tensor\)"): @torch.jit.script @@ -6491,7 +6572,7 @@ def script_fn(x): # Note: the neg op from script_fn1 should be properly inlined into the # graph of script_fn - self.assertExpected(str(script_fn.graph)) + self.assertExpected(canonical(script_fn.graph)) def test_call_script_mod_from_script_fn(self): class ScriptMod(torch.jit.ScriptModule): @@ -6508,7 +6589,7 @@ def forward(self, x): def script_fn(x): return sm(x) + 1 - self.assertExpected(str(script_fn.graph)) + self.assertExpected(canonical(script_fn.graph)) def test_call_python_fn_from_script_module(self): def python_fn(x): @@ -6607,7 +6688,7 @@ def forward(self, x): return script_fn(torch.mm(x, self.param)) sm = ScriptMod() - self.assertExpected(str(sm.__getattr__('forward').graph)) + self.assertExpected(canonical(sm.__getattr__('forward').graph)) def test_call_script_mod_from_script_module(self): class ScriptMod1(torch.jit.ScriptModule): @@ -6633,7 +6714,7 @@ def forward(self, x): # Note: the parameters from both modules should appear in the flattened # input list to the graph. 
The mm op from ScriptMod1 should be properly # inlined - self.assertExpected(str(sm.graph)) + self.assertExpected(canonical(sm.graph)) def test_module_with_params_called_fails(self): with self.assertRaisesRegex(RuntimeError, "Attempted to inline a Module with parameters. Stateful " @@ -7168,6 +7249,7 @@ def test_dcgan_models(self): self._test_dcgan_models(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_dcgan_models_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_dcgan_models(self, device='cuda', check_export_import=False) @@ -7290,11 +7372,13 @@ def test_mnist(self): self._test_mnist(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_mnist(self, device='cuda', check_export_import=False) @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_training_leaks_no_memory_cuda(self): net = MnistNet().cuda() # MnistNet uses dropout, don't check its trace diff --git a/test/test_nn.py b/test/test_nn.py index 0d61d72f3ceb66..eee4e3a7c74755 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -4202,6 +4202,7 @@ def get_inputs(input_shape, hidden_shape, mode): test(input_shape, hidden_shape, mode) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_rnn_check_device(self): input_size = 3 hidden_size = 5 diff --git a/test/test_sparse.py b/test/test_sparse.py index 1304f42bda78fa..a91681d4767049 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -496,6 +496,76 @@ def test_shape(sparse_dims, nnz, with_size): test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) + @skipIfRocm + def test_Sparse_to_Sparse_copy_(self): + # This is for testing torch.copy_(SparseTensor, SparseTensor) + sparse_dims = 3 + nnz = 10 + sizes = [2, 3, 4, 5] # hybrid sparse + x1, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + x2, _, _ = self._gen_sparse(sparse_dims, nnz + 10, sizes) + + # test copy + x2_dense = x2.to_dense() + x1.copy_(x2) + self.assertEqual(x2_dense, x1.to_dense()) + + # test type conversion (when x1.copy_(x2), x1.dtype should stay the same) + x1 = x1.to(torch.float32) + x2 = x2.to(torch.float64) + x1_dtype = x1.dtype + x1.copy_(x2) + self.assertEqual(x1_dtype, x1.dtype) + + # test no broadcast + self.assertRaises(RuntimeError, lambda: x1.copy_(x2.narrow_copy(0, 0, 1))) + + # test raise error on copy_() between dense and sparse Tensors + self.assertRaises(RuntimeError, lambda: x1.copy_(torch.randn(5, 5))) + + # test autograd + x1, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + x2, _, _ = self._gen_sparse(sparse_dims, nnz + 10, sizes) + x2.requires_grad_(True) + x1.copy_(x2) + y = x1 * 2 + x2_clone = x2.clone() + y.backward(x2_clone) + expected_grad = x2_clone * 2 + self.assertEqual(expected_grad.to_dense(), x2.grad.to_dense()) + self.assertEqual(None, x1.grad) + + @unittest.skipIf(torch.cuda.device_count() < 2, "no multi-GPU") + @skipIfRocm + def test_Sparse_to_Sparse_copy_multi_gpu(self): + # This is for testing torch.copy_(SparseTensor, SparseTensor) across GPU devices + sparse_dims = 3 + nnz = 10 + sizes = [2, 3, 4, 5] # hybrid sparse + x1, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + x2, _, _ = self._gen_sparse(sparse_dims, nnz + 10, sizes) + x1 = x1.to('cuda:0') + + def test_cross_device(x1, x2): + x1_device = x1.device + x1.copy_(x2) + self.assertEqual(x2.to('cuda:0').to_dense(), x1.to_dense()) + 
self.assertEqual(x1_device, x1.device) + + test_cross_device(x1, x2.to('cuda:1')) # test across gpu devices + test_cross_device(x1, x2.to('cpu')) # test between cpu and gpu + + # test autograd + x2 = x2.to('cuda:1') + x2.requires_grad_(True) + x1.copy_(x2) + y = x1 * 2 + x2_clone = x2.clone().to('cuda:0') + y.backward(x2_clone) + expected_grad = x2_clone * 2 + self.assertEqual(expected_grad.to_dense(), x2.grad.to('cuda:0').to_dense()) + self.assertEqual(None, x1.grad) + @cuda_only def test_cuda_empty(self): def test_tensor(x): @@ -1023,6 +1093,34 @@ def test_shape(i_shapes, v_shapes, nnzs): test_shape([0, 3, 4], [3, 4, 5, 6], [0]) test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) + def _test_narrow(self, input, narrow_args): + expected = input.to_dense().narrow(*narrow_args) + self.assertEqual(expected, input.narrow_copy(*narrow_args).to_dense()) + + def _all_narrow_combs(self, shape): + for dim, dim_sz in enumerate(shape): + for start in range(dim_sz): + for length in range(dim_sz - start): + yield [dim, start, length] + + @skipIfRocm + def test_narrow(self): + shape = [3, 3, 4, 2] + input, _, _ = self._gen_sparse(4, 19, shape) + for narrow_args in self._all_narrow_combs(shape): + self._test_narrow(input, narrow_args) + + self.assertRaises(RuntimeError, lambda: input.narrow_copy(-1, 0, 3)) # dim < 0 + self.assertRaises(RuntimeError, lambda: input.narrow_copy(10, 0, 3)) # dim > input.dim() + self.assertRaises(RuntimeError, lambda: input.narrow_copy(0, shape[0] + 1, 3)) # start > size of dim + self.assertRaises(RuntimeError, lambda: input.narrow_copy(0, 2, shape[0])) # start+length > size of dim + + with_dense, _, _ = self._gen_sparse(2, 7, shape) + for narrow_args in self._all_narrow_combs(shape): + self._test_narrow(with_dense, narrow_args) + + self.assertRaises(RuntimeError, lambda: with_dense.narrow_copy(10, 0, 3)) # dim > sparseDim + denseDim + def _test_log1p_tensor(self, input, dense_tensor): expected_output = torch.tensor(dense_tensor).log1p_() self.assertEqual(expected_output, input.log1p().to_dense()) @@ -1410,6 +1508,7 @@ def test_tensor(indices, values, indices_equal, values_equal): test_tensor(indices, values, False, True) # An empty tensor's data_ptr is always equal to 0 @cpu_only # just run once, we test both cpu and cuda + @skipIfRocm def test_constructor_device_legacy(self): i = torch.tensor([[0, 1, 1], [2, 0, 2]]) v = torch.tensor([3., 4., 5.]) @@ -1556,6 +1655,7 @@ def test_resize(self): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 0], [2, 2, 0]) + @skipIfRocm def test_is_nonzero(self): self.assertTrue(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0],), 0., (1,)).is_nonzero()) diff --git a/test/test_torch.py b/test/test_torch.py index 84ef8a22e050b3..3026548b99043e 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3999,6 +3999,7 @@ def test_is_signed_cuda(self): self.assertEqual(torch.cuda.HalfTensor(10).is_signed(), True) @skipIfNoLapack + @skipIfRocm def test_gesv(self): a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), (-6.05, -3.30, 5.36, -4.44, 1.08), @@ -4130,6 +4131,7 @@ def test_gesv_batched_dims(self): self._test_gesv_batched_dims(self, lambda t: t) @skipIfNoLapack + @skipIfRocm def test_qr(self): # Since the QR decomposition is unique only up to the signs of the rows of @@ -4312,10 +4314,12 @@ def _test_trtrs(self, cast): self.assertEqual(res1, tb, 0) @skipIfNoLapack + @skipIfRocm def test_trtrs(self): self._test_trtrs(self, lambda t: t) @skipIfNoLapack + @skipIfRocm def 
test_gels(self): def _test_underdetermined(a, b, expectedNorm): m = a.size()[0] @@ -4431,6 +4435,7 @@ def check_norm(a, b, expected_norm, gels_result): self.assertEqual((torch.mm(a, tb) - b).norm(), expectedNorm, 1e-8) @skipIfNoLapack + @skipIfRocm def test_eig(self): a = torch.Tensor(((1.96, 0.00, 0.00, 0.00, 0.00), (-6.49, 3.80, 0.00, 0.00, 0.00), diff --git a/test/test_utils.py b/test/test_utils.py index 971e8a4f05f8e0..dff6102e4579e7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -25,16 +25,6 @@ from common import TestCase, run_tests, download_file -try: - import cffi - HAS_CFFI = True -except ImportError: - HAS_CFFI = False - - -if HAS_CFFI: - from torch.utils.ffi import create_extension - class SimplePlugin(Plugin): @@ -371,74 +361,9 @@ def test_model_gradient(self): class TestFFI(TestCase): - - def setUp(self): - self.tmpdir = tempfile.mkdtemp() - os.chdir(self.tmpdir) - sys.path.append(self.tmpdir) - - def tearDown(self): - shutil.rmtree(self.tmpdir) - - @unittest.skipIf(not HAS_CFFI, "ffi tests require cffi package") - @unittest.skipIf(IS_WINDOWS, "ffi doesn't currently work on Windows") - @unittest.skipIf(IS_PPC, "skip for ppc64le due to incompatible exception handling") - def test_cpu(self): - create_extension( - name='test_extensions.cpulib', - headers=[test_dir + '/ffi/src/cpu/lib.h'], - sources=[ - test_dir + '/ffi/src/cpu/lib1.c', - test_dir + '/ffi/src/cpu/lib2.c', - ], - verbose=False, - ).build() - from test_extensions import cpulib - tensor = torch.ones(2, 2).float() - - cpulib.good_func(tensor, 2, 1.5) - self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5) - - new_tensor = cpulib.new_tensor(4) - self.assertEqual(new_tensor, torch.ones(4, 4) * 4) - - f = cpulib.int_to_float(5) - self.assertIs(type(f), float) - - self.assertRaises(TypeError, - lambda: cpulib.good_func(tensor.double(), 2, 1.5)) - self.assertRaises(torch.FatalError, - lambda: cpulib.bad_func(tensor, 2, 1.5)) - - @unittest.skipIf(not HAS_CFFI or not HAS_CUDA, "ffi tests require cffi package") - @unittest.skipIf(IS_WINDOWS, "ffi doesn't currently work on Windows") - @skipIfRocm - def test_gpu(self): - from torch.utils.cpp_extension import CUDA_HOME - create_extension( - name='gpulib', - headers=[test_dir + '/ffi/src/cuda/cudalib.h'], - sources=[ - test_dir + '/ffi/src/cuda/cudalib.c', - ], - with_cuda=True, - verbose=False, - include_dirs=[os.path.join(CUDA_HOME, 'include')], - ).build() - import gpulib - tensor = torch.ones(2, 2).float() - - gpulib.good_func(tensor, 2, 1.5) - self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5) - - ctensor = tensor.cuda().fill_(1) - gpulib.cuda_func(ctensor, 2, 1.5) - self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5) - - self.assertRaises(TypeError, - lambda: gpulib.cuda_func(tensor, 2, 1.5)) - self.assertRaises(TypeError, - lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5)) + def test_deprecated(self): + with self.assertRaisesRegex(ImportError, "torch.utils.ffi is deprecated. 
Please use cpp extensions instead."): + from torch.utils.ffi import create_extension @unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') diff --git a/third_party/ideep b/third_party/ideep index 4bd9a6800bf7db..dedff8fb8193fe 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 4bd9a6800bf7db068187619e0582d34dec9651dc +Subproject commit dedff8fb8193fe3a1ea893d4bc852f8ea395b6b3 diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 81856c62ad07d9..7e26d84432182c 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -28,7 +28,8 @@ '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_.*', 'arange.*', 'range.*', '_gesv.*', '_getri.*', 'slice', 'randint(_out)?', '_local_scalar', '_local_scalar_dense', - 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear', 'to' + 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear', 'to', + 'copy_sparse_to_sparse_' ] # These function signatures are not exposed to Python. Note that this signature diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 24ac92dd63926f..64ad9fc5e6d185 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -416,7 +416,9 @@ Tensor & VariableType::s_copy_(Tensor & self, const Tensor & src, bool non_block grad_fn->src_device = src.get_device(); } } - baseType->s_copy_(self_, src_, non_blocking); + if (self.is_sparse() && src.is_sparse()) baseType->copy_sparse_to_sparse_(self_, src_, non_blocking); + else if (!self.is_sparse() && !src.is_sparse()) baseType->s_copy_(self_, src_, non_blocking); + else AT_ERROR("copy_() between dense and sparse Tensors is not implemented! Found self type = ", self.type(), " and src type = ", src.type()); increment_version(self); rebase_history(as_variable_ref( self ), std::move(grad_fn)); if(torch::jit::tracer::isTracing()) { diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index ce337e93c85463..27e580e8965edf 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -149,6 +149,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/ir.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/annotate_effects.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp @@ -174,6 +175,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/builtin_functions.cpp ${TORCH_SRC_DIR}/csrc/jit/script/lexer.cpp ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index f5c7d41c199e0b..832de8d76db4b0 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1453,6 +1453,17 @@ def callable(a, b) -> number [ 8, 9]]) """) +add_docstr_all('narrow_copy', + r""" +narrow_copy(dimension, start, length) -> Tensor + +Same as :meth:`Tensor.narrow` except returning a copy rather +than shared storage. This is primarily for sparse tensors, which +do not have a shared-storage narrow method. Calling ``narrow_copy`` +with ``dimension > self._sparseDims()`` will return a copy with the +relevant dense dimension narrowed, and ``self.shape`` updated accordingly. 
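+
+Example (illustrative values only)::
+
+    >>> i = torch.tensor([[0, 1, 1], [2, 0, 2]])
+    >>> v = torch.tensor([3., 4., 5.])
+    >>> s = torch.sparse_coo_tensor(i, v, (2, 3))
+    >>> s.narrow_copy(1, 0, 2).to_dense()
+    tensor([[0., 0.],
+            [4., 0.]])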
+""") + add_docstr_all('ndimension', r""" ndimension() -> int diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 7601ce3c268d4c..8f3c1ae6ebf73c 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -2938,7 +2938,7 @@ def parse_kwargs(desc): .. math:: \log(\Gamma_{p}(a)) = C + \displaystyle \sum_{i=1}^{p} \log\left(\Gamma\left(a - \frac{i - 1}{2}\right)\right) -where :math:`C = \log(\pi) \times \frac{p (p - 1)}{2}` and :math:`\Gamma(.)` is the Gamma function. +where :math:`C = \log(\pi) \times \frac{p (p - 1)}{2}` and :math:`\Gamma(\cdot)` is the Gamma function. If any of the elements are less than or equal to :math:`\frac{p - 1}{2}`, then an error is thrown. diff --git a/torch/csrc/api/src/serialize/input-archive.cpp b/torch/csrc/api/src/serialize/input-archive.cpp index bd6995d67d69e9..11e97bce08f564 100644 --- a/torch/csrc/api/src/serialize/input-archive.cpp +++ b/torch/csrc/api/src/serialize/input-archive.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 1847bb65b08f8a..a5edc29833633a 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include diff --git a/torch/csrc/generic/serialization.cpp b/torch/csrc/generic/serialization.cpp index 2299cce245a16b..1e4e7bf7b9e37f 100644 --- a/torch/csrc/generic/serialization.cpp +++ b/torch/csrc/generic/serialization.cpp @@ -2,8 +2,6 @@ #define TH_GENERIC_FILE "generic/serialization.cpp" #else -#define SYSCHECK(call) { ssize_t __result = call; if (__result < 0) throw std::system_error((int) __result, std::system_category()); } - template void THPStorage_(writeFileRaw)(THWStorage *self, io fd) { @@ -16,23 +14,10 @@ void THPStorage_(writeFileRaw)(THWStorage *self, io fd) data = (scalar_t*)cpu_data.get(); THCudaCheck(cudaMemcpy(data, THWStorage_(data)(LIBRARY_STATE self), size * sizeof(scalar_t), cudaMemcpyDeviceToHost)); #endif - ssize_t result = doWrite(fd, &size, sizeof(int64_t)); - if (result != sizeof(int64_t)) - throw std::system_error(result, std::system_category()); + doWrite(fd, &size, sizeof(int64_t)); // fast track for bytes and little endian if (sizeof(scalar_t) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) { - char *bytes = (char *) data; - int64_t remaining = sizeof(scalar_t) * size; - while (remaining > 0) { - // we write and read in 1GB blocks to avoid bugs on some OSes - ssize_t result = doWrite(fd, bytes, THMin(remaining, 1073741824)); - if (result < 0) - throw std::system_error(result, std::system_category()); - bytes += result; - remaining -= result; - } - if (remaining != 0) - throw std::system_error(result, std::system_category()); + doWrite(fd, data, sizeof(scalar_t) * size); } else { int64_t buffer_size = std::min(size, (int64_t)5000); std::unique_ptr le_buffer(new uint8_t[buffer_size * sizeof(scalar_t)]); @@ -54,7 +39,7 @@ void THPStorage_(writeFileRaw)(THWStorage *self, io fd) THPByteOrder::THP_LITTLE_ENDIAN, to_convert); } - SYSCHECK(doWrite(fd, le_buffer.get(), to_convert * sizeof(scalar_t))); + doWrite(fd, le_buffer.get(), to_convert * sizeof(scalar_t)); } } } @@ -67,11 +52,7 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) { scalar_t *data; int64_t size; - ssize_t result = doRead(file, &size, sizeof(int64_t)); - if (result == 0) - throw std::runtime_error("unexpected EOF. 
The file might be corrupted."); - if (result != sizeof(int64_t)) - throw std::system_error(result, std::system_category()); + doRead(file, &size, sizeof(int64_t)); THWStoragePtr storage; if (_storage == nullptr) { storage = THWStorage_(newWithSize)(LIBRARY_STATE size); @@ -91,20 +72,7 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) // fast track for bytes and little endian if (sizeof(scalar_t) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) { - char *bytes = (char *) data; - int64_t remaining = sizeof(scalar_t) * THWStorage_(size)(LIBRARY_STATE storage); - while (remaining > 0) { - // we write and read in 1GB blocks to avoid bugs on some OSes - ssize_t result = doRead(file, bytes, THMin(remaining, 1073741824)); - if (result == 0) // 0 means EOF, which is also an error - throw std::runtime_error("unexpected EOF. The file might be corrupted."); - if (result < 0) - throw std::system_error(result, std::system_category()); - bytes += result; - remaining -= result; - } - if (remaining != 0) - throw std::system_error(result, std::system_category()); + doRead(file, data, sizeof(scalar_t) * THWStorage_(size)(LIBRARY_STATE storage)); } else { int64_t buffer_size = std::min(size, (int64_t)5000); std::unique_ptr le_buffer(new uint8_t[buffer_size * sizeof(scalar_t)]); @@ -112,7 +80,7 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) for (int64_t i = 0; i < size; i += buffer_size) { size_t to_convert = std::min(size - i, buffer_size); - SYSCHECK(doRead(file, le_buffer.get(), sizeof(scalar_t) * to_convert)); + doRead(file, le_buffer.get(), sizeof(scalar_t) * to_convert); if (sizeof(scalar_t) == 2) { THP_decodeInt16Buffer((int16_t*)data + i, @@ -142,6 +110,4 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) template THWStorage* THPStorage_(readFileRaw)(int fd, THWStorage* storage); template THWStorage* THPStorage_(readFileRaw)(PyObject* fd, THWStorage* storage); -#undef SYSCHECK - #endif diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index 10b0cad6749128..2d1b9f7b147abb 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -61,7 +61,6 @@ static_assert(sizeof(ArgumentInfo) == sizeof(ArgumentInfo::plain_data_type), struct ArgumentSpec { ArgumentSpec(bool with_grad, at::ArrayRef inputs, size_t num_flat_inputs) { hash_code = num_flat_inputs; - args.resize(num_flat_inputs); size_t offset = 0; for (size_t i = 0; i < inputs.size(); ++i) { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 009bf68ae3f6da..a59c856eaba751 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -27,14 +27,10 @@ bool isDifferentiable(Node * n) { static OperatorSet differentiable_ops = { "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", - "aten::add(Scalar other, Tensor self) -> Tensor", "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", - "aten::sub(Scalar other, Tensor self) -> Tensor", "aten::mul(Tensor self, Tensor other) -> Tensor", "aten::mul(Tensor self, Scalar other) -> Tensor", - "aten::mul(Scalar other, Tensor self) -> Tensor", - "aten::div(Scalar other, Tensor self) -> Tensor", "aten::div(Tensor self, Tensor other) -> Tensor", "aten::div(Tensor self, Scalar other) -> Tensor", "aten::sigmoid(Tensor self) -> Tensor", @@ -132,9 +128,6 @@ static std::vector gradientForNode(Node* 
node, ArrayRef grad_val } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { return {grads.at(0), nullptr, nullptr}; - } else if (node->matches("aten::add(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, grads.at(0)}; - } else if (node->kind() == prim::AutogradAdd) { return {grads.at(0), grads.at(0)}; @@ -144,29 +137,23 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { return {grads.at(0), nullptr, nullptr}; - } else if (node->matches("aten::sub(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, -grads.at(0)}; - } else if (node->matches("aten::mul(Tensor self, Tensor other) -> Tensor")) { return {grads.at(0) * inputs.at(1), grads.at(0) * inputs.at(0)}; } else if (node->matches("aten::mul(Tensor self, Scalar other) -> Tensor")) { return {grads.at(0) * inputs.at(1), nullptr}; - } else if (node->matches("aten::mul(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, grads.at(0) * inputs.at(0)}; - } else if (node->matches("aten::div(Tensor self, Tensor other) -> Tensor")) { return {grads.at(0) / inputs.at(1), -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; } else if (node->matches("aten::div(Tensor self, Scalar other) -> Tensor")) { return {grads.at(0) / inputs.at(1), nullptr}; - } else if (node->matches("aten::div(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; - } else if (node->matches("aten::sigmoid(Tensor self) -> Tensor")) { - return {grads.at(0) * outputs.at(0) * (1 - outputs.at(0))}; + // TODO: The order of operations matter in this case. This + // works for ppc64le and x86_64. Need to look at why the + // order matters. 
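+    // (For reference: d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)), so both orderings
+    // compute grad_output * out * (1 - out); they differ only in floating-point association.)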
+ return {(1 - outputs.at(0)) * outputs.at(0) * grads.at(0)}; } else if (node->matches("aten::tanh(Tensor self) -> Tensor")) { return {grads.at(0) * (1 - outputs.at(0) * outputs.at(0))}; diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index f1844d2bac6651..1633ac0de4d6aa 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -42,6 +42,8 @@ Value* insertConstant( n->destroy(); n = g.create(prim::None); n->output()->setType(NoneType::get()); + } else if(val.isWorld()) { + n->output()->setType(WorldType::get()); } else { throw constant_not_supported_error("Unsupported value kind: " + val.tagKind()); } diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 437d0f6c779972..973780b7d7d62f 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -425,7 +426,7 @@ void GraphEncoder::EncodeTensor( class ModuleEncoder: public EncoderBase { public: ModuleEncoder(const script::Module &module, - const std::string &filename); + std::ostream& out); private: void EncodeModule(onnx::GraphProto *graph_proto, const script::Module &module); @@ -448,7 +449,7 @@ class ModuleEncoder: public EncoderBase { virtual void EncodeTensor(onnx::TensorProto *tensor_proto, const at::Tensor &tensor, - const at::optional external_ref) override; + const at::optional external_ref = {}) override; virtual void EncodeIntermediateValueInfo(onnx::GraphProto *graph_proto, const Value* n) override; @@ -462,7 +463,7 @@ class ModuleEncoder: public EncoderBase { const TypePtr& type, const std::string& name); - PyTorchFileWriter file_writer_; + PyTorchStreamWriter stream_writer_; // Used to deduplicate tensor storages std::unordered_map storage_dedup_map_; @@ -475,9 +476,9 @@ class ModuleEncoder: public EncoderBase { ModuleEncoder::ModuleEncoder( const script::Module &module, - const std::string &filename) + std::ostream& out) : EncoderBase(onnx_torch::OperatorExportTypes::RAW, false), - file_writer_(filename) { + stream_writer_(out) { model_proto_.set_doc_string("THIS PROTO IS NOT STANDARD ONNX"); EncodeModule(model_proto_.mutable_graph(), module); } @@ -564,6 +565,10 @@ void ModuleEncoder::EncodeTypeInfo( type_proto->set_denotation("GeneratorType"); } else if (kind == TypeKind::StringType) { type_proto->set_denotation("StringType"); + } else if (kind == TypeKind::VarType) { + type_proto->set_denotation("TypeVar:" + type->expect()->name()); + } else if (kind == TypeKind::WorldType) { + type_proto->set_denotation("WorldType"); } else { throw std::runtime_error("unexpected type kind"); } @@ -582,7 +587,7 @@ void ModuleEncoder::EncodeModule( EncodeParameters(graph_proto, module, ""); EncodeMethods(graph_proto, module, ""); auto str = model_proto_.SerializeAsString(); - file_writer_.writeRecord(str.data(), str.size()); + stream_writer_.writeRecord(str.data(), str.size()); } void ModuleEncoder::EncodeParameters( @@ -670,7 +675,7 @@ void ModuleEncoder::EncodeMethod( void ModuleEncoder::EncodeTensor( onnx::TensorProto *tensor_proto, const at::Tensor &tensor, - const at::optional external_ref = {}) { + const at::optional external_ref) { auto storage_ptr = tensor.storage().unsafeGetStorageImpl(); auto dedup_it = storage_dedup_map_.find(storage_ptr); if (dedup_it != storage_dedup_map_.end()) { @@ -689,7 +694,7 @@ void ModuleEncoder::EncodeTensor( .cpu(); } - auto record_number = file_writer_.writeRecord( + auto record_number = stream_writer_.writeRecord( 
static_cast(t.storage().data()), t.type().elementSizeInBytes() * t.storage().size()); tensor_proto->add_int64_data(record_number); storage_dedup_map_[storage_ptr] = record_number; @@ -915,8 +920,14 @@ std::tuple ExportGraph( graph_encoder.get_raw_data_export_map()); } +void ExportModule(const script::Module& module, std::ostream& out) { + ModuleEncoder(module, out); +} + void ExportModule(const script::Module& module, const std::string &filename) { - ModuleEncoder(module, filename); + std::ofstream out(filename, std::ios_base::binary); + + ExportModule(module, out); } }} diff --git a/torch/csrc/jit/export.h b/torch/csrc/jit/export.h index f7eee3dc77ac07..363de0b56ac169 100644 --- a/torch/csrc/jit/export.h +++ b/torch/csrc/jit/export.h @@ -4,6 +4,8 @@ #include "torch/csrc/jit/script/module.h" #include "torch/csrc/onnx/onnx.h" +#include + namespace torch { namespace jit { // This map is used to keep track of parameters that should be exported @@ -34,6 +36,10 @@ TORCH_API std::string PrettyPrintExportedGraph( = ::torch::onnx::OperatorExportTypes::ONNX, bool google_printer = false); +TORCH_API void ExportModule( + const script::Module& module, + std::ostream& out); + TORCH_API void ExportModule( const script::Module& module, const std::string& filename); diff --git a/torch/csrc/jit/function_schema.h b/torch/csrc/jit/function_schema.h index c7b53abf46c2a2..dcaaf766e18c0e 100644 --- a/torch/csrc/jit/function_schema.h +++ b/torch/csrc/jit/function_schema.h @@ -46,7 +46,10 @@ struct FunctionSchema { arguments(std::move(arguments)), returns(std::move(returns)), is_vararg(is_vararg), - is_varret(is_varret) {} + is_varret(is_varret), + is_mutable(isMutable()) { + validate(); + } FunctionSchema( Symbol name, std::vector arguments, @@ -58,7 +61,9 @@ struct FunctionSchema { std::move(std::move(arguments)), std::move(std::move(returns)), is_vararg, - is_varret) {} + is_varret) { + validate(); + } const std::string name; const std::vector arguments; @@ -69,6 +74,8 @@ struct FunctionSchema { // arguments are not checked by schema const bool is_vararg; const bool is_varret; + const bool is_mutable; + at::optional argumentIndexWithName(const std::string& name) const { for(size_t i = 0; i < arguments.size(); ++i) { if(name == arguments[i].name) @@ -76,6 +83,23 @@ struct FunctionSchema { } return at::nullopt; } + + private: + bool isMutable() const { + return std::any_of( + arguments.cbegin(), arguments.cend(), [](const Argument& arg) { + return arg.type == WorldType::get(); + }); + } + + void validate() const { + if (is_mutable) { + // Mutable schemas should have a world token as the first argument + // and return. + JIT_ASSERT(arguments.at(0).type == WorldType::get()); + JIT_ASSERT(returns.at(0).type == WorldType::get()); + } + } }; // for debugging, make sure we can describe the call site diff --git a/torch/csrc/jit/fusers/common/fused_kernel.cpp b/torch/csrc/jit/fusers/common/fused_kernel.cpp index 6095bb13748470..5718a656a7f520 100644 --- a/torch/csrc/jit/fusers/common/fused_kernel.cpp +++ b/torch/csrc/jit/fusers/common/fused_kernel.cpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -235,9 +236,22 @@ static std::string scalarValue(int64_t v) { return std::to_string(v); } +// Note: The NAN, NEG_INFINITY and POS_INFINITY strings map to device-specific +// implementations of these special values. These macros are found in the +// resource strings for each device. 
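+// (For example, the CPU resource string maps POS_INFINITY to INFINITY, while the CUDA
+// resource string maps it to __int_as_float(0x7f800000).)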
static std::string scalarValue(double v) { std::ostringstream out; - out << std::scientific << v << "f"; + if (std::isnan(v)) { + out << "NAN"; + } else if (std::isinf(v)) { + if (v < 0) { + out << "NEG_INFINITY"; + } else { + out << "POS_INFINITY"; + } + } else { + out << std::scientific << v << "f"; + } return out.str(); } diff --git a/torch/csrc/jit/fusers/cpu/resource_strings.h b/torch/csrc/jit/fusers/cpu/resource_strings.h index 60c1c0faaa4fea..59a92ccc19b740 100644 --- a/torch/csrc/jit/fusers/cpu/resource_strings.h +++ b/torch/csrc/jit/fusers/cpu/resource_strings.h @@ -11,6 +11,10 @@ Correct code for this case is generated, however, nvrtc does not know how to han so typedefs help it handle those cases*/ auto type_declarations_template = CodeTemplate(R"( + +#define POS_INFINITY INFINITY +#define NEG_INFINITY -INFINITY + typedef ${IndexType} IndexType; template struct TensorInfo { diff --git a/torch/csrc/jit/fusers/cuda/resource_strings.h b/torch/csrc/jit/fusers/cuda/resource_strings.h index 0063288721d727..6278a4f239636c 100644 --- a/torch/csrc/jit/fusers/cuda/resource_strings.h +++ b/torch/csrc/jit/fusers/cuda/resource_strings.h @@ -18,6 +18,10 @@ typedef long long int int64_t; ${HalfHeader} ${RandHeader} +#define NAN __int_as_float(0x7fffffff) +#define POS_INFINITY __int_as_float(0x7f800000) +#define NEG_INFINITY __int_as_float(0xff800000) + typedef ${IndexType} IndexType; template struct TensorInfo { diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index d071c464721559..20ee429b3696c8 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -7,6 +7,7 @@ #include "torch/csrc/jit/interpreter.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/tracer.h" +#include "torch/csrc/jit/passes/annotate_effects.h" #include "torch/csrc/jit/passes/batch_mm.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index b2fa6eba2f748a..4574addb3a4465 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -181,7 +182,7 @@ void DecoderBase::buildBlock(const onnx::GraphProto& graph_proto, Block* block, class ModuleDecoder : DecoderBase { public: ModuleDecoder(ModuleLookup module_lookup, - const std::string& filename); + std::istream& in); private: virtual std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto) override; @@ -205,7 +206,7 @@ class ModuleDecoder : DecoderBase { ModuleLookup module_lookup, const std::string fullname); - PyTorchFileReader file_reader_; + PyTorchStreamReader stream_reader_; std::unordered_map> storage_map_; std::unordered_map value_type_map_; }; @@ -260,8 +261,12 @@ TypePtr ModuleDecoder::buildType(const onnx::TypeProto& type_proto) { return NoneType::get(); } else if (kind == "GeneratorType") { return GeneratorType::get(); - }else if (kind == "StringType") { + } else if (kind == "WorldType") { + return WorldType::get(); + } else if (kind == "StringType") { return StringType::get(); + } else if (kind.find("TypeVar:") == 0) { + return VarType::create(kind.substr(strlen("TypeVar:"))); } else { throw std::runtime_error("unexpected string for type kind"); } @@ -315,7 +320,7 @@ at::Tensor ModuleDecoder::buildTensorCommon( if (storage_it == storage_map_.end()) { at::DataPtr storage_ptr; int64_t size; - std::tie(storage_ptr, size) = 
file_reader_.getRecordWithKey(record_number); + std::tie(storage_ptr, size) = stream_reader_.getRecordWithKey(record_number); auto storage = std::make_shared( at::CPU(type).typeMeta(), std::move(storage_ptr), @@ -349,10 +354,10 @@ std::pair, std::string> ModuleDecoder::parseFull ModuleDecoder::ModuleDecoder( ModuleLookup module_lookup, - const std::string &filename) : - file_reader_(filename) { + std::istream& in) : + stream_reader_(in) { auto model_proto = onnx::ModelProto(); - auto record = file_reader_.getLastRecord(); + auto record = stream_reader_.getLastRecord(); model_proto.ParsePartialFromArray(std::get<0>(record).get(), std::get<1>(record)); auto graph_proto = model_proto.graph(); @@ -391,13 +396,21 @@ ModuleDecoder::ModuleDecoder( } // namespace +void import_ir_module( + ModuleLookup module_lookup, + std::istream& in) { + ModuleDecoder(module_lookup, in); +} + void import_ir_module( ModuleLookup module_lookup, const std::string& filename) { - ModuleDecoder(module_lookup, filename); + std::ifstream in(filename, std::ios_base::binary); + + ModuleDecoder(module_lookup, in); } -std::shared_ptr load(const std::string& filename) { +std::shared_ptr load(std::istream& in) { auto module = std::make_shared(); auto module_lookup = [&](const std::vector& qualified_name) { @@ -410,7 +423,17 @@ std::shared_ptr load(const std::string& filename) { } return curr; }; - ModuleDecoder(module_lookup, filename); + + ModuleDecoder(module_lookup, in); + + return module; +} + +std::shared_ptr load(const std::string& filename) { + std::ifstream in(filename, std::ios_base::binary); + + auto module = load(in); + return module; } diff --git a/torch/csrc/jit/import.h b/torch/csrc/jit/import.h index 6ce901c4369961..a1e0b31fe2295a 100644 --- a/torch/csrc/jit/import.h +++ b/torch/csrc/jit/import.h @@ -3,6 +3,8 @@ #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/script/module.h" +#include + namespace torch { namespace jit { @@ -13,11 +15,18 @@ TORCH_API void import_ir_module( ModuleLookup module_lookup, const std::string& filename); +TORCH_API void import_ir_module( + ModuleLookup module_lookup, + std::istream& in); + /// Loads a serialized `script::Module` from the given `filename`. /// /// The file stored at the location given in `filename` must contain a /// serialized `script::Module`, exported either via `ScriptModule.save()` in /// Python or `torch::jit::ExportModule` in C++. 
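+///
+/// A minimal usage sketch for the stream overload (the file name below is only
+/// an illustration):
+///
+///   std::ifstream in("my_module.pt", std::ios_base::binary);
+///   std::shared_ptr<script::Module> module = torch::jit::load(in);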
+ +TORCH_API std::shared_ptr load(std::istream& in); + TORCH_API std::shared_ptr load(const std::string& filename); } // namespace jit diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 98a7b010419324..ac6f9ac4a15c1c 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -227,7 +227,6 @@ void initJITBindings(PyObject *module) { return createPyObjectForStack(std::move(stack)); }); - py::class_(m, "PyTorchFileWriter") .def(py::init()) .def("write_record", &PyTorchFileWriter::writeRecord) diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index e1d76dde56c59d..b4e6b7c1398f1b 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -59,6 +59,11 @@ namespace torch { namespace jit { _(prim, ConstantChunk) \ _(prim, NoneGenerator) \ _(aten, floordiv) \ + _(prim, MemoryFence) \ + _(prim, LoadWorld) \ + _(prim, StoreWorld) \ + _(prim, DummyWorld) \ + _(aten, append) \ _(aten, __not__) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 0d2e22307527b6..14e7fab54d9549 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -32,7 +32,7 @@ namespace torch { namespace jit { // to what the instructions will look like. // In particular we: // * (TODO) desugar Loop trip counts into c = 0, c += 1 instructions in the loop -// * flatten stages so that each stage starts with a load from the stack +// * flatten stages so that each stage starts with a load to registers // and ends with a store to the stack // *. computes move_flags (see Outputs), and inserts // * Drop nodes are inserted for any node that is unused to create a dummy use @@ -72,8 +72,6 @@ Value* createTripCountConjunctiveCondition( return new_cond; } -} // namespace - // this currently just _removes_ the trip count inputs and checks they are // unused. 
In the future they will be desugared into normal arithmetic to // provide a loop counter @@ -142,9 +140,9 @@ static std::vector> flattenStages(Graph & graph) { auto it = graph.nodes().begin(); for(size_t i = 0; i <= graph.stage(); i++) { stage_input_types.emplace_back(); - auto store = graph.create(prim::Store, 0)->insertBefore(*it); + auto load = graph.create(prim::Load, 0)->insertBefore(*it); while(input_pos < graph.inputs().size() && graph.inputs()[input_pos]->stage() == i) { - auto nv = store->addOutput(); + auto nv = load->addOutput(); auto old_node = graph.inputs()[input_pos]; nv->setType(old_node->type()); stage_input_types[i].push_back(old_node->type()); @@ -153,9 +151,9 @@ static std::vector> flattenStages(Graph & graph) { } while(it != graph.nodes().end() && it->stage() == i) ++it; - auto load = graph.create(prim::Load, 0)->insertBefore(*it); + auto store = graph.create(prim::Store, 0)->insertBefore(*it); while(output_pos < graph.outputs().size() && graph.outputs()[output_pos]->stage() == i) { - load->addInput(graph.outputs()[output_pos]); + store->addInput(graph.outputs()[output_pos]); output_pos++; } } @@ -307,6 +305,7 @@ std::unordered_map> findLastUses(Graph & g) { return FindLastUses(g).move_flags; } +} //namespace // pre-processing that happens once per graph struct PreprocessGraph { @@ -503,10 +502,10 @@ struct CodeImpl { insertInstruction(node); } break; } - // each stage ends with a load instruction + // each stage ends with a store instruction // we record where these instructions occur, and use them to // exit the interpreter - if(node->kind() == prim::Load) { + if(node->kind() == prim::Store) { stage_end.push_back(instructions.size()); } } @@ -694,7 +693,7 @@ struct InterpreterStateImpl { for(int i = inst.outputs.size - 1; i >= 0; i--) { int reg = get(inst.outputs,i); registers[reg] = pop(stack); - // std::cout << "pop reg[" << reg << "];\n" << registers[reg].pImpl << "\n"; + // std::cout << "pop reg[" << reg << "];\n" << registers[reg] << "\n"; } pc = new_pc; } catch(std::exception & e) { diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 82b14fa0b6839d..90451494bacbc7 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -248,18 +248,22 @@ void Node::lint() const { } // Node subclass invariants - // - Return uses is zero - // - Param inputs is zero - // - Select inputs is one - // - Python operator cconv is correct - IR_IF(this,Constant) JIT_ASSERT(inputs_.size() == 0); + IR_ELSEIF(LoadWorld) + JIT_ASSERT(inputs_.size() == 0); + JIT_ASSERT(outputs_.size() == 1); + IR_ELSEIF(StoreWorld) + JIT_ASSERT(inputs_.size() == 1); + JIT_ASSERT(outputs_.size() == 0); IR_ELSEIF(Return) + // Return uses is zero JIT_ASSERT(outputs().size() == 0); IR_ELSEIF(Param) + // Param inputs is zero JIT_ASSERT(inputs_.size() == 0); IR_ELSEIFM_CONST(PythonOp) + // Python operator cconv is correct size_t n_scalars = 0, n_tensors = 0; for (auto c : value->cconv) { if (c == 'c') { @@ -381,6 +385,7 @@ void Graph::lint() const { for (auto n : b->nodes()) { JIT_ASSERT(n->kind_ != prim::Param); JIT_ASSERT(n->kind_ != prim::Return); + JIT_ASSERT(n->kind_ != prim::DummyWorld); check_node(n); } @@ -447,6 +452,7 @@ void Block::cloneFrom(Block * src, std::function value_map) { local_map[input] = this->addInput()->copyMetadata(input)->setStage(input->stage()); graph->setStage(std::max(graph->stage(), input->stage())); } + for(auto node : src->nodes()) { auto new_node = this->appendNode(graph->createClone(node, env)); new_node->setStage(node->stage()); @@ -466,8 +472,9 @@ void 
Block::cloneFrom(Block * src, std::function value_map) { std::shared_ptr Graph::copy() { auto new_g = std::make_shared(); - auto env = [](Value *) -> Value* { - AT_ERROR("Graph::copy() encountered a use of a value not in scope. Run lint!"); + auto env = [](Value* v) -> Value* { + AT_ERROR( + "Graph::copy() encountered a use of a value not in scope. Run lint!"); }; new_g->block()->cloneFrom(this->block(), env); return new_g; diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 062d0422c2be07..0bb5c899c7321d 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -236,7 +236,7 @@ struct Value { void replaceFirstUseWith(Value * newValue); - // Replaces all uses of this node with 'newValue'. + // Replaces all uses of this value with 'newValue'. // // Given: %3 = f(%1, %2) // %4 = g(%3) @@ -320,6 +320,9 @@ struct Node : public Attributes { Block * owningBlock() { return owning_block_; } + const Block * owningBlock() const { + return owning_block_; + } size_t stage() const { return stage_; } @@ -442,33 +445,33 @@ struct Node : public Attributes { // Given: %3 = f(%1, %2) // Execute: %3.addInput(%4) // Result: %3 = f(%1, %2, %4) - Value* addInput(Value * node) { - JIT_ASSERT(graph_ == node->owningGraph()); + Value* addInput(Value * value) { + JIT_ASSERT(graph_ == value->owningGraph()); schema_ = nullptr; - node->uses_.emplace_back(this, inputs_.size()); - inputs_.push_back(node); - return node; + value->uses_.emplace_back(this, inputs_.size()); + inputs_.push_back(value); + return value; } - // Add 'node' as an input to 'this' at the specified position in the - // arguments. Returns the added node for ease of chaining. - Value* insertInput(size_t i, Value* node) { - JIT_ASSERT(graph_ == node->owningGraph()); + // Add 'value' as an input to 'this' at the specified position in the + // arguments. Returns the added value for ease of chaining. + Value* insertInput(size_t i, Value* value) { + JIT_ASSERT(graph_ == value->owningGraph()); schema_ = nullptr; // First we update the offsets for all existing inputs that will reside // after the one we're inserting. Concretely, these are the inputs at // indices [i, # input). Since we're inserting one input before all of - // these inputs, increment their use offsets for this Node by 1 + // these inputs, increment their use offsets for this value by 1 for (size_t use_itr = i; use_itr < inputs_.size(); ++use_itr) { // See Note [User node does not uniquely identify use] auto use = findUseForInput(use_itr); use->offset += 1; } // Insert the actual input at the specified index - inputs_.insert(inputs_.begin() + i, node); + inputs_.insert(inputs_.begin() + i, value); // Register the new use of the value we're inserted as an input. - node->uses_.emplace_back(this, i); - return node; + value->uses_.emplace_back(this, i); + return value; } // Replace the input of 'this' at position 'i' with @@ -549,7 +552,7 @@ struct Node : public Attributes { return {blocks_.data(), blocks_.size()}; } - // Insert unattached 'this' node after 'n' in the topological order. + // Insert unattached 'this' node before 'n' in the topological order. // Returns this (for chaining). 
// // Given: %3 = f(%1, %2) @@ -804,8 +807,8 @@ struct Block { void eraseInput(size_t i) { input_->eraseOutput(i); } - size_t registerOutput(Value * n) { - output_->addInput(n); + size_t registerOutput(Value * v) { + output_->addInput(v); return outputs().size() - 1; } size_t insertOutput(size_t i, Value* n) { @@ -1107,6 +1110,12 @@ friend struct Block; return jit::insertConstant(*this, std::move(val), loc); } + Value* insertDummyWorld() { + auto node = create(prim::DummyWorld, 1); + node->output()->setType(WorldType::get()); + return insertNode(node)->output(); + } + // schema-driven insert // this inserts a node into the graph with inputs determined from args and kwargs using Python // argument matching rules, and checks that the op matches a known schema @@ -1323,11 +1332,11 @@ inline void Node::cloneFrom(Node * s) { copyAttributes(*s); } -inline Block::Block(Graph * graph_, Node * node_) -: graph_(graph_) -, output_(initOutput(graph_->create(prim::Return, 0))) -, input_(graph_->create(prim::Param,0)) -, owning_node_(node_) { +inline Block::Block(Graph* graph_, Node* node_) + : graph_(graph_), + output_(initOutput(graph_->create(prim::Return, 0))), + input_(graph_->create(prim::Param, 0)), + owning_node_(node_) { graph_->all_blocks.emplace(this); output_->owning_block_ = this; input_->owning_block_ = this; diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index 75e5833535bcfc..d701b536c44a0c 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -58,12 +58,19 @@ struct SchemaParser { {"float", FloatType::get() }, {"int", IntType::get() }, {"bool", IntType::get() }, // TODO: add separate bool type + {"World", WorldType::get() }, }; auto tok = L.expect(TK_IDENT); auto text = tok.text(); auto it = type_map.find(text); - if(it == type_map.end()) + if(it == type_map.end()) { + if(text.size() > 0 && islower(text[0])) { + // lower case identifiers that are not otherwise valid types + // are treated as type variables + return VarType::create(text); + } throw ErrorReport(tok.range) << "unknown type specifier"; + } return it->second; } void parseArgumentType(std::vector& arguments) { @@ -358,9 +365,16 @@ bool Operator::matches(const Node* node) const { if(actuals.size() < formals.size()) return false; + + TypeEnv type_env; for(size_t i = 0; i < formals.size(); ++i) { - // mismatched input type - if (!actuals[i]->type()->isSubtypeOf(formals[i].type)) { + try { + TypePtr formal = matchTypeVariables(formals[i].type, actuals[i]->type(), type_env); + // mismatched input type + if (!actuals[i]->type()->isSubtypeOf(formal)) { + return false; + } + } catch(TypeMatchError& err) { return false; } } diff --git a/torch/csrc/jit/passes/annotate_effects.cpp b/torch/csrc/jit/passes/annotate_effects.cpp new file mode 100644 index 00000000000000..b8aaec83dddbac --- /dev/null +++ b/torch/csrc/jit/passes/annotate_effects.cpp @@ -0,0 +1,320 @@ +#include "torch/csrc/jit/passes/annotate_effects.h" + +#include +#include "torch/csrc/jit/passes/dead_code_elimination.h" + +namespace torch { +namespace jit { +namespace { + +/** + * AnnotateEffects + * + * This pass annotates effectful operations (such as ones that mutate existing + * values) to prevent subsequent passes from re-ordering ops in a way that + * changes the meaning of the program. + * + * It does this by threading a "world token" value through nodes that use + * mutable values. This models effects explicitly in the IR and forces all + * annotated nodes to be linearized during optimization. 
+ * + * For mutating operators: the world token is threaded directly through the node + * For purely functional operators: their node will be "fenced" by two + * `prim::MemoryFence` nodes that take world tokens as their input. + * + * Graphs have special EntryWorld and ExitWorld nodes that provide end-points + * for the world token. They are similar to graph inputs/outputs in that they + * are not in the node list and only accessible via special methods. + * + * When inlined, graphs will manifest the EntryWorld/ExitWorld nodes explicitly + * as StoreWorld/LoadWorld ops so that they can act as endpoints where the + * callee "world thread" can be joined to the caller world thread. + */ +class AnnotateEffectsImpl { + public: + void annotateEffects(Graph* g) { + if (!shouldAnnotate(g->block())) { + return; + } + + // Generate the first world token + Value* curToken = nullptr; + { + WithInsertPoint guard(*g->nodes().begin()); + auto loadWorld = g->insertNode(g->create(prim::LoadWorld, 1)); + curToken = loadWorld->output()->setType(WorldType::get()); + } + + auto lastToken = visitBlock(g->block(), curToken); + + auto storeWorld = g->insertNode(g->create(prim::StoreWorld, 0)); + storeWorld->addInput(lastToken); + } + + private: + Value* visitBlock(Block* block, Value* curToken) { + for (auto* node : block->nodes()) { + curToken = visitNode(node, curToken); + } + return curToken; + } + + // General node annotation. If a node uses a mutable variable (or mutates a + // previously constant variable), annotate it + // + // Returns the last world token emitted for subsequent annotations to use. + Value* visitNode(Node* node, Value* curToken) { + // Avoid annotating memory fences. This avoids an infinite loop as we add + // fences and continue to iterate through nodes. + if (node->kind() == prim::MemoryFence) { + // Return this memory fence's world token + return node->outputs().at(0); + } + + // Handle inlined functions. Inlined functions will expose their Entry and + // Exit tokens as regular nodes. These exposed nodes provide fixed points + // to thread the current world token through. 
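+  // Concretely: the callee's LoadWorld output is replaced by the caller's current token,
+  // and the callee's StoreWorld input becomes the token handed back to the caller, so the
+  // single world thread continues unbroken across the inlining boundary.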
+ if (node->kind() == prim::LoadWorld) { + auto inlinedEntryToken = node->output(); + inlinedEntryToken->replaceAllUsesWith(curToken); + return curToken; + } + + if (node->kind() == prim::StoreWorld) { + return node->input(); + } + + if (node->kind() == prim::If) { + JIT_ASSERT(node->blocks().size() == 2); + + auto trueBlock = node->blocks().at(0); + auto falseBlock = node->blocks().at(1); + + auto trueToken = visitBlock(trueBlock, curToken); + auto falseToken = visitBlock(falseBlock, curToken); + + // If any branch has a mutating op, this node has to output a world token + if (trueToken != curToken || falseToken != curToken) { + trueBlock->registerOutput(trueToken); + falseBlock->registerOutput(falseToken); + + return node->addOutput()->setType(WorldType::get()); + } + return curToken; + } + + if (node->kind() == prim::Loop) { + JIT_ASSERT(node->blocks().size() == 1); + auto block = node->blocks().at(0); + if (!shouldAnnotate(block)) { + // Bail out early if there's no mutable variables used inside + return curToken; + } + + // Register the world token as a loop carried dependency + auto beginLoopToken = block->addInput()->setType(WorldType::get()); + auto endLoopToken = visitBlock(block, beginLoopToken); + block->registerOutput(endLoopToken); + + JIT_ASSERT(endLoopToken != beginLoopToken); + + // Thread the world token through the loop node + node->addInput(curToken); + return node->addOutput()->setType(WorldType::get()); + } + + // For mutating ops, just thread the world token through the node. + if (isMutatingOp(node)) { + // Replace the "dummy" token generated by the compiler + node->replaceInput(0, curToken); + return node->outputs().at(0); + } + + JIT_ASSERT(node->blocks().size() == 0); + + // For pure ops that need to be annotated, fence them. + if (shouldAnnotate(node)) { + if (isFenced(node)) { + // If the node has already been fenced, just return the value from the + // end fence. This can happen when another graph is inlined. + return getTokenForFencedNode(node); + } + return addFenceForNode(node, curToken); + } + + return curToken; + } + + bool shouldAnnotate(const Node* node) { + // Check if this node uses a known mutable value + for (const auto* input : node->inputs()) { + if (!isMutableType(input)) { + // TODO(suo): Right now, we only support mutable lists. + // If we remove this check, it's not clear whether: + // + // append(int[] a, int b) + // + // mutates `a` or `b`. We'll need to extend the schema language to be + // able to express which argument is mutated. + continue; + } + // First check the cache + if (mutableValues_.count(input) != 0) { + return true; + } + + // Check whether any mutating op uses this input + for (const auto& use : input->uses()) { + if (isMutatingOp(use.user)) { + mutableValues_.insert(input); + return true; + } + } + } + + // Check that any sub-blocks need to be annotated + for (auto block : node->blocks()) { + if (shouldAnnotate(block)) { + return true; + } + } + + return false; + } + + bool shouldAnnotate(const Block* block) { + return std::any_of( + block->nodes().begin(), block->nodes().end(), [this](const Node* node) { + return shouldAnnotate(node); + }); + } + + bool isMutableType(const Value* value) { + return value->type()->kind() == TypeKind::ListType; + } + + bool isMutatingOp(const Node* node) { + return !node->inputs().empty() && + node->inputs()[0]->type() == WorldType::get(); + } + + // Returns true iff this node has already been fenced. This can happen if + // another graph was inlined into the current one. 
+ bool isFenced(const Node* node) { + // A node is fenced if all its inputs/outputs are used by memory fences. + const auto inputsFenced = std::all_of( + node->inputs().begin(), node->inputs().end(), [&](const Value* input) { + return std::any_of( + input->uses().cbegin(), + input->uses().cend(), + [&](const Use& use) { + return use.user->kind() == prim::MemoryFence; + }); + }); + if (!inputsFenced) { + return false; + } + + const auto outputsFenced = std::all_of( + node->outputs().begin(), + node->outputs().end(), + [&](const Value* input) { + return std::any_of( + input->uses().cbegin(), + input->uses().cend(), + [&](const Use& use) { + return use.user->kind() == prim::MemoryFence; + }); + }); + if (!outputsFenced) { + return false; + } + + return true; + } + + // Given a fenced node, return the world token outputted from its end fence + Value* getTokenForFencedNode(const Node* node) { + // Take advantage of the fact that the end fence consumes the node's + // outputs, i.e. it will be the only user. + const auto output = node->outputs().at(0); + JIT_ASSERT(output->uses().size() == 1); + const auto endFence = output->uses()[0].user; + const auto token = endFence->outputs().at(0); + JIT_ASSERT(token->type() == WorldType::get()); + return token; + } + + // Create a memory fence around a node, using the world token. + // + // Input: + // %size : Int = prim::len(%mut_list) + // + // Output: + // %t.1 : World, %list.2 : int[] = prim::MemoryFence(%curToken, %mut_list) + // %size : Int = prim::len(%mut_list) + // %t.2 : World, %size.2 : int = prim::MemoryFence(%t.1, %size) + // + // Returns the new world token (%t.2) for subsequent fences to use. + Value* addFenceForNode(Node* node, Value* curToken) { + // Add a start fence + auto startFence = + node->owningGraph()->create(prim::MemoryFence, /*outputs=*/0); + + // Add world tokens as the first input and output + startFence->addInput(curToken); + curToken = startFence->addOutput()->setType(WorldType::get()); + + // Fence off all node's inputs + for (const auto input : node->inputs()) { + startFence->addInput(input); + startFence->addOutput()->setType(input->type()); + } + + startFence->insertBefore(node); + + JIT_ASSERT(node->inputs().size() == startFence->outputs().size() - 1); + + // modify the node to take in the start fence's output values + for (size_t i = 0; i < node->inputs().size(); i++) { + node->replaceInput(i, startFence->outputs()[i + 1]); + } + + // Add an end fence + auto endFence = + node->owningGraph()->create(prim::MemoryFence, /*outputs=*/0); + + // Add world tokens as the first input and output + endFence->addInput(curToken); + curToken = endFence->addOutput()->setType(WorldType::get()); + + // Fence off all the node's outputs + for (auto output : node->outputs()) { + endFence->addInput(output); + auto fencedOutput = endFence->addOutput()->setType(output->type()); + output->replaceAllUsesWith(fencedOutput); + // replaceAllUsesWith() replaces the fence's INPUT value with the new + // output as well, so we need to manually add the "real" input back + endFence->replaceInputWith(fencedOutput, output); + } + + endFence->insertAfter(node); + + return curToken; + } + + // Memoize which values will be mutated at some point in the program + std::set mutableValues_; +}; +} // namespace + +void AnnotateEffects(std::shared_ptr& graph) { + AnnotateEffectsImpl impl; + impl.annotateEffects(graph.get()); + + // Prune the dummy world tokens + EliminateDeadCode(graph); +} + +} // namespace jit +} // namespace torch diff --git 
a/torch/csrc/jit/passes/annotate_effects.h b/torch/csrc/jit/passes/annotate_effects.h new file mode 100644 index 00000000000000..9c8e969d54ba41 --- /dev/null +++ b/torch/csrc/jit/passes/annotate_effects.h @@ -0,0 +1,11 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { +namespace jit { + +TORCH_API void AnnotateEffects(std::shared_ptr& graph); + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index b9d36d0e4b88e3..179f3751526c4e 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -17,6 +17,8 @@ std::unordered_set skip_list = { prim::Loop, //TODO: handle Loop prim::Print, prim::PythonOp, //may have side effects + prim::LoadWorld, + prim::StoreWorld, //all the rand functions from native_functions.yaml aten::rand, aten::rand_like, diff --git a/torch/csrc/jit/passes/dead_code_elimination.cpp b/torch/csrc/jit/passes/dead_code_elimination.cpp index d8341cbb99c6aa..6424eb70a6cafc 100644 --- a/torch/csrc/jit/passes/dead_code_elimination.cpp +++ b/torch/csrc/jit/passes/dead_code_elimination.cpp @@ -13,12 +13,13 @@ bool hasSideEffects(Node * node, bool_memo_type& memo) { auto it = memo.find(node); if (it != memo.end()) return it->second; - bool has_side_effects = node->kind() == prim::Print || - std::any_of(node->blocks().begin(), node->blocks().end(), - [&](Block *b) { - return std::any_of(b->nodes().begin(), b->nodes().end(), - [&](Node *n) { return hasSideEffects(n, memo); }); - }); + bool has_side_effects = + node->kind() == prim::Print || node->kind() == prim::StoreWorld || + std::any_of(node->blocks().begin(), node->blocks().end(), [&](Block* b) { + return std::any_of(b->nodes().begin(), b->nodes().end(), [&](Node* n) { + return hasSideEffects(n, memo); + }); + }); memo.emplace(node, has_side_effects); return has_side_effects; } diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 6c7166f3b43552..c8a1ef566f2a3c 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -208,16 +208,12 @@ struct GraphFuser { /*const=*/attr::alpha) || node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*const=*/{attr::other, attr::alpha}) || - node->matches("aten::add(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", /*const=*/attr::alpha) || node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*const=*/{attr::other, attr::alpha}) || - node->matches("aten::sub(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::mul(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::mul(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::div(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::div(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::clamp(Tensor self, Scalar min, Scalar max) -> Tensor", /*const=*/{attr::min, attr::max})) { auto inputs = tensorInputs(node); return haveSupportedType(inputs); @@ -225,22 +221,16 @@ struct GraphFuser { else if ( node->matches("aten::lt(Tensor self, Tensor other) -> Tensor") || node->matches("aten::lt(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::lt(Scalar other, 
Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::le(Tensor self, Tensor other) -> Tensor") || node->matches("aten::le(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::le(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::gt(Tensor self, Tensor other) -> Tensor") || node->matches("aten::gt(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::gt(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::ge(Tensor self, Tensor other) -> Tensor") || node->matches("aten::ge(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::ge(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::eq(Tensor self, Tensor other) -> Tensor") || node->matches("aten::eq(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::eq(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::ne(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::ne(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::ne(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other)) { + node->matches("aten::ne(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other)) { // comparison operators produce Byte type, and it's ok, check only inputs auto inputs = tensorInputs(node); return haveSupportedType(inputs); diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index 8045a46a4af1ba..6e90780ecbf695 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -159,7 +159,7 @@ void eliminateNopTranspose(Block *b) { } if (n->kind() == onnx::Transpose) { if (isNopTranspose(n->is(attr::perm))) { - n->replaceAllUsesWith(n->input()->node()); + n->output()->replaceAllUsesWith(n->input()); it.destroyCurrent(); continue; } diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index b01d9d3b61359c..eedc7fd0a8a686 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -557,10 +557,6 @@ bool PropagateTensorShapeOnNode(Node * node, bool insert_expands) { "aten::pow(Tensor self, Scalar exponent) -> Tensor", "aten::fmod(Tensor self, Scalar other) -> Tensor", "aten::remainder(Tensor self, Scalar other) -> Tensor", - "aten::add(Scalar other, Tensor self) -> Tensor", - "aten::sub(Scalar other, Tensor self) -> Tensor", - "aten::mul(Scalar other, Tensor self) -> Tensor", - "aten::div(Scalar other, Tensor self) -> Tensor", "aten::pow(Scalar base, Tensor self) -> Tensor", "aten::__and__(Tensor self, Scalar other) -> Tensor", "aten::__or__(Tensor self, Scalar other) -> Tensor", @@ -1139,10 +1135,7 @@ bool PropagateCompleteShapeOnNode(Node * node, bool insert_expands, } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor") || node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor") || node->matches("aten::mul(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::pow(Tensor self, Scalar exponent) -> Tensor") || - node->matches("aten::add(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::sub(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::mul(Scalar other, Tensor self) -> Tensor")) { + node->matches("aten::pow(Tensor self, Scalar exponent) -> Tensor")) { 
node->output()->setType(tensor_types.at(0)); return true; } else if (insert_expands && ( diff --git a/torch/csrc/jit/passes/to_batch.cpp b/torch/csrc/jit/passes/to_batch.cpp index f78da9b92baccc..0d56ca2255286f 100644 --- a/torch/csrc/jit/passes/to_batch.cpp +++ b/torch/csrc/jit/passes/to_batch.cpp @@ -525,11 +525,10 @@ void ToBatch::toBatch(Block* block, Block* res_block) { } std::shared_ptr to_batch_graph(std::shared_ptr& graph){ - // std::cout<toString()< res_graph = std::make_shared(graph->scope_root()); ToBatch to_batch; to_batch.toBatch(graph->block(), res_graph->block()); - // std::cout<toString()< +#include #include #include @@ -86,7 +86,7 @@ inline IValue createGenericList(py::handle obj, const TypePtr& elem_type) { for(auto elem : obj) { elems.push_back(toIValue(elem, elem_type)); } - return ConstantList::create(std::move(elems)); + return List::create(std::move(elems)); } inline IValue toIValue(py::handle obj, const TypePtr& type) { @@ -140,8 +140,11 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { return createGenericList(obj, elem_type); } } + case TypeKind::WorldType: + AT_ERROR("World arguments should not be passed in by users"); case TypeKind::NumberType: case TypeKind::GeneratorType: + case TypeKind::VarType: break; } AT_ERROR("Missing cases in toIValue for type: ", type->str(), "! File a bug report."); @@ -199,6 +202,14 @@ inline py::object toPyObject(IValue&& ivalue) { return py::cast(ivalue.toDoubleListRef()); } else if (ivalue.isTensorList()) { return py::cast(ivalue.toTensorListRef()); + } else if (ivalue.isGenericList()) { + auto list = ivalue.toGenericList(); + const auto & elements = list->elements(); + py::list t { elements.size() }; + for (size_t i = 0; i < elements.size(); ++i) { + t[i] = toPyObject(IValue{elements[i]}); + } + return t; } else if (ivalue.isTuple()) { auto tuple = ivalue.toTuple(); const auto & elements = tuple->elements(); diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 5aa053f626faa1..ad03ac556cd272 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -455,6 +455,10 @@ void initPythonIRBindings(PyObject * module_) { return "StringType"; case TypeKind::GeneratorType: return "GeneratorType"; + case TypeKind::VarType: + return "VarType"; + case TypeKind::WorldType: + return "WorldType"; } // not reachable, but some compilers complain AT_ERROR("Unknown Type Kind"); diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 71168cd3ee3d4d..cdea4ab894b253 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -52,6 +52,13 @@ void checkImplicitTensorToNum(at::Tensor t, bool toInt) { } RegisterOperators reg({ + Operator( + prim::MemoryFence, + [](Node* node) { + return [](Stack& stack) { + return 0; + }; + }), Operator( prim::FusionGroup, [](Node* node) { @@ -204,6 +211,30 @@ RegisterOperators reg({ return 0; }; }), + Operator( + prim::LoadWorld, + [](Node* node) { + return [](Stack& stack) { + push(stack, World{0}); + return 0; + }; + }), + Operator( + prim::StoreWorld, + [](Node* node) { + return [](Stack& stack) { + drop(stack, 1); + return 0; + }; + }), + Operator( + prim::DummyWorld, + [](Node* node) { + return [](Stack& stack) { + AT_ERROR("Encountered a dummy world during graph execution."); + return 0; + }; + }), Operator( onnx::Reshape, [](Node* node) { @@ -399,9 +430,17 @@ RegisterOperators reg({ return 0; }; } else { - std::stringstream ss; - ss << "unsupported list type: " << 
*lt->getElementType(); - throw std::runtime_error(ss.str()); + return [=](Stack& stack) { + const size_t stack_size = stack.size(); + std::vector vals; + vals.reserve(num_inputs); + for (size_t i = stack_size - num_inputs; i < stack_size; ++i) { + vals.push_back(std::move(stack[i])); + } + drop(stack, num_inputs); + push(stack, std::move(vals)); + return 0; + }; } }), }); @@ -441,26 +480,6 @@ RegisterOperators reg({ #define DEFINE_BINARY_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, float) #define DEFINE_COMPARISON_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, int) -// define helpers for where aten is missing scalar overloads -// note: it would be better to define these in a standard library as -// script functions and have the compiler substitute them in -// however, we need to add type annotations to the parser in order for us -// to move them there. -// e.g. s + t ==> t + s -// e.g. s - d == -d + s - -#define DEFINE_ST_OP(aten_op, reverse_exp) \ - Operator("aten::" #aten_op "(Scalar other, Tensor self) -> Tensor", [](Node* node) { \ - return [=](Stack& stack) { \ - at::Scalar a; \ - at::Tensor b; \ - pop(stack, a, b); \ - at::DeviceGuard guard(b); \ - push(stack, reverse_exp); \ - return 0; \ - }; \ - }), - // Convert an python index (which may be negative) into an index usable for a // C++ container int64_t normalizeIndex(int64_t idx, int64_t list_size) { @@ -471,6 +490,19 @@ int64_t normalizeIndex(int64_t idx, int64_t list_size) { return idx; } +template +Operation listAppend(Node* node) { + return [](Stack& stack) { + TList a; + TElement el; + pop(stack, a, el); + + a->elements().push_back(el); + + return 0; + }; +} + template Operation listSelect(Node* node) { return [=](Stack& stack) { @@ -506,11 +538,7 @@ Operation listEq(Node* node) { T a; T b; pop(stack, a, b); - if (a->elements() == b->elements()) { - push(stack, 1); - } else { - push(stack, 0); - } + push(stack, a->elements() == b->elements() ? 
1 : 0); return 0; }; } @@ -604,32 +632,29 @@ Operation listSlice(Node* node) { } RegisterOperators reg2({ - Operator("aten::select(int[] a, int b) -> int", listSelect>), - Operator("aten::select(float[] a, int b) -> float", listSelect>), - Operator("aten::select(Tensor[] a, int b) -> Tensor", listSelect>), - Operator("aten::len(int[] a) -> int", listLen>), - Operator("aten::len(float[] a) -> int", listLen>), - Operator("aten::len(Tensor[] a) -> int", listLen>), +#define CREATE_LIST_OPS(decl_type, c_type) \ + Operator("aten::select(" decl_type "[] a, int b) -> " decl_type, listSelect>), \ + Operator("aten::len(" decl_type "[] a) -> int", listLen>), \ + Operator("aten::add(" decl_type "[] a, " decl_type "[] b) -> " decl_type "[]", listAdd, c_type::ElemType>), \ + Operator( \ + "aten::slice(" decl_type "[] l, int start, int end=9223372036854775807, int step=1) -> " decl_type "[]", \ + listSlice, c_type::ElemType>), \ + Operator( \ + "aten::append(World w, " decl_type "[] list, " decl_type " el) -> World", \ + listAppend, c_type::ElemType>), \ + + + CREATE_LIST_OPS("int", IntList) + CREATE_LIST_OPS("float", DoubleList) + CREATE_LIST_OPS("Tensor", TensorList) + CREATE_LIST_OPS("t", GenericList) + Operator("aten::eq(int[] a, int[] b) -> int", listEq>), Operator("aten::eq(float[] a, float[] b) -> int", listEq>), Operator("aten::eq(Tensor[] a, Tensor[] b) -> int", listEq>), - Operator("aten::add(int[] a, int[] b) -> int[]", listAdd, int64_t>), - Operator("aten::add(float[] a, float[] b) -> float[]", listAdd, double>), - Operator("aten::add(Tensor[] a, Tensor[] b) -> Tensor[]", listAdd, at::Tensor>), - - Operator( - "aten::slice(int[] l, int start, int end=9223372036854775807, int step=1) -> int[]", - listSlice, int64_t>), - Operator( - "aten::slice(float[] l, int start, int end=9223372036854775807, int step=1) -> float[]", - listSlice, double>), - Operator( - "aten::slice(Tensor[] l, int start, int end=9223372036854775807, int step=1) -> Tensor[]", - listSlice, at::Tensor>), - DEFINE_BINARY_OP(aten::add, a + b) DEFINE_BINARY_OP(aten::sub, a - b) DEFINE_BINARY_OP(aten::mul, a * b) @@ -748,21 +773,5 @@ RegisterOperators reg2({ return 0; }; }), - // commutative - DEFINE_ST_OP(mul, at::mul(b, a)) - DEFINE_ST_OP(add, at::add(b, a)) - DEFINE_ST_OP(ne, at::ne(b, a)) - DEFINE_ST_OP(eq, at::eq(b, a)) - - // comparisons, reverse the condition - DEFINE_ST_OP(lt, b > a) - DEFINE_ST_OP(le, b >= a) - DEFINE_ST_OP(gt, b < a) - DEFINE_ST_OP(ge, b <= a) - - // rsub - DEFINE_ST_OP(sub, at::add(b.neg(), a)) - // rdiv - DEFINE_ST_OP(div, at::mul(at::reciprocal(b), a)) }); }}} // torch::jit::anon diff --git a/torch/csrc/jit/script/builtin_functions.cpp b/torch/csrc/jit/script/builtin_functions.cpp new file mode 100644 index 00000000000000..ea82d06879d7c7 --- /dev/null +++ b/torch/csrc/jit/script/builtin_functions.cpp @@ -0,0 +1,83 @@ +#include "torch/csrc/jit/script/builtin_functions.h" +#include "torch/csrc/api/include/torch/jit.h" +#include "torch/csrc/jit/code_template.h" + +namespace torch { namespace jit { namespace script { + +auto scalar_operators_source = CodeTemplate( +R"SCRIPT( +def mul(a : ${Scalar}, b : Tensor) -> Tensor: + return b * a +def add(a : ${Scalar}, b : Tensor) -> Tensor: + return b + a +def ne(a : ${Scalar}, b : Tensor) -> Tensor: + return b != a +def eq(a : ${Scalar}, b : Tensor) -> Tensor: + return b == a +def lt(a : ${Scalar}, b : Tensor) -> Tensor: + return b > a +def le(a : ${Scalar}, b : Tensor) -> Tensor: + return b >= a +def gt(a : ${Scalar}, b : Tensor) -> Tensor: + return b < a +def 
ge(a : ${Scalar}, b : Tensor) -> Tensor: + return b <= a +def sub(a : ${Scalar}, b : Tensor) -> Tensor: + return torch.neg(b) + a +def div(a : ${Scalar}, b : Tensor) -> Tensor: + return torch.reciprocal(b) * a +)SCRIPT"); + +struct BuiltinFunctionRegistry { + + const std::vector<Method*>& getAllBuiltinFunctionsFor(Symbol name) { + const static std::vector<Method*> empty; + // when initializing the builtin function library, we will re-enter + // getAllBuiltinFunctionsFor since it is called in the compiler to + // look up builtins, and initializing the builtin functions calls the compiler. + // To avoid deadlocking, we use a recursive mutex (the same thread can re-lock + // the mutex without waiting), and report no loaded builtins during init. + std::lock_guard<std::recursive_mutex> guard(mutex); + if(state == INITIALIZING) { + return empty; + } else if (state == UNINITIALIZED) { + state = INITIALIZING; + loadBuiltinFunctions(); + state = INITIALIZED; + } + JIT_ASSERT(state == INITIALIZED); + auto it = builtins_by_name.find(name); + if(it == builtins_by_name.end()) + return empty; + return it->second; + } +private: + void loadSource(const std::string& source) { + auto module = std::make_shared<Module>(); + defineMethodsInModule( + *module, source, script::nativeResolver, /*self=*/nullptr); + modules.push_back(module); + for (auto& method : module->get_methods()) { + builtins_by_name[Symbol::fromQualString("aten::" + method.key)].push_back( + method.value.get()); + } + } + void loadBuiltinFunctions() { + for(auto scalar : {"float", "int"}) { + TemplateEnv env; + env.s("Scalar", scalar); + loadSource(scalar_operators_source.format(env)); + } + } + enum {UNINITIALIZED, INITIALIZING, INITIALIZED} state = UNINITIALIZED; + std::recursive_mutex mutex; + std::vector<std::shared_ptr<Module>> modules; + std::unordered_map<Symbol, std::vector<Method*>> builtins_by_name; +}; + +TORCH_API const std::vector<Method*>& getAllBuiltinFunctionsFor(Symbol name) { + static BuiltinFunctionRegistry registry; + return registry.getAllBuiltinFunctionsFor(name); +} + +}}} diff --git a/torch/csrc/jit/script/builtin_functions.h b/torch/csrc/jit/script/builtin_functions.h new file mode 100644 index 00000000000000..042dc96b1826f0 --- /dev/null +++ b/torch/csrc/jit/script/builtin_functions.h @@ -0,0 +1,13 @@ +#pragma once + +#include "torch/csrc/WindowsTorchApiMacro.h" +#include "torch/csrc/jit/script/module.h" + +namespace torch { namespace jit { namespace script { + + +TORCH_API const std::vector<Method*>& getAllBuiltinFunctionsFor(Symbol name); + + + +}}} diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index b66b96dd5eb6fb..28aa735fc37249 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1,5 +1,6 @@ #include "torch/csrc/jit/script/compiler.h" #include "torch/csrc/jit/passes/lower_tuples.h" +#include "torch/csrc/jit/passes/annotate_effects.h" #include "torch/csrc/jit/operator.h" #include "torch/csrc/jit/interpreter.h" #include "torch/csrc/jit/ir.h" @@ -7,6 +8,7 @@ #include "torch/csrc/jit/assertions.h" #include "torch/csrc/utils/object_ptr.h" #include "torch/csrc/jit/operator.h" +#include "torch/csrc/jit/script/builtin_functions.h" #include "torch/csrc/jit/constants.h" @@ -449,7 +451,8 @@ Value* tryMatchArgument( const SourceRange& loc, const NamedValue& named_value, std::function<std::ostream&()> err, - bool convert_tensors_to_nums) { + bool convert_tensors_to_nums, + TypeEnv & type_env) { Value* value = named_value.value(graph); // some functions that take lists of integers for fixed size arrays @@ -460,35 +463,44 @@ Value* tryMatchArgument( value =
graph.insertNode(graph.createList(IntType::get(), repeated))->output(); } + TypePtr concrete_type; + try { + concrete_type = matchTypeVariables(arg.type, value->type(), type_env); + } catch(TypeMatchError& e) { + err() << "could not match type " << value->type()->str() << " to " + << arg.type->str() << " in argument '" << arg.name << "': " << e.what() << "\n" + << named_value.locOr(loc); + return nullptr; + } + // Allow homogeneous tuples to be casted implicitly to lists of appropriate types - if (convertibleToList(value->type(), arg.type) && + if (convertibleToList(value->type(), concrete_type) && value->type()->kind() == TypeKind::TupleType) { auto unpacked = createTupleUnpack(value); - auto elem_type = arg.type->expect()->getElementType(); + auto elem_type = concrete_type->expect()->getElementType(); value = graph.insertNode(graph.createList(elem_type, unpacked))->output(); } if (value->node()->kind() == prim::None){ - if (arg.type->isSubtypeOf(NumberType::get())) + if (concrete_type->isSubtypeOf(NumberType::get())) value = graph.insertConstant(at::Scalar(NAN), loc); - else if (arg.type->isSubtypeOf(GeneratorType::get())) { - value = graph.insertNode(graph.createNoneGenerator()) - ->output()->setType(GeneratorType::get()); + else if (concrete_type->isSubtypeOf(GeneratorType::get())) { + value = graph.insertNode(graph.createNoneGenerator())->output(); } else value = graph.insertNode(graph.createUndefined())->output(); } //implicit conversion of tensors to scalars - if(convert_tensors_to_nums && arg.type->isSubtypeOf(NumberType::get()) + if(convert_tensors_to_nums && concrete_type->isSubtypeOf(NumberType::get()) && value->type()->isSubtypeOf(DynamicType::get())) { - auto n = graph.createImplicitTensorToNum(arg.type, value); + auto n = graph.createImplicitTensorToNum(concrete_type, value); value = graph.insertNode(n) ->setSourceLocation(std::make_shared(loc)) ->output(); } - if(!value->type()->isSubtypeOf(arg.type)) { - err() << "expected a value of type " << arg.type->str() << " for argument '" << arg.name << "' but found " + if(!value->type()->isSubtypeOf(concrete_type)) { + err() << "expected a value of type " << concrete_type->str() << " for argument '" << arg.name << "' but found " << value->type()->str() << "\n" << named_value.locOr(loc); return nullptr; @@ -510,11 +522,12 @@ Value* tryCreateList( const SourceRange& loc, at::ArrayRef varargs, std::function err, - bool convert_tensor_to_num) { - Argument elem_arg("", elem_type); + bool convert_tensor_to_num, + TypeEnv & type_env) { + Argument elem_arg("", elem_type); std::vector list_ctor; for(const auto& a : varargs) { - Value* av = tryMatchArgument(elem_arg, graph, loc, a, err, convert_tensor_to_num); + Value* av = tryMatchArgument(elem_arg, graph, loc, a, err, convert_tensor_to_num, type_env); if(!av) return nullptr; list_ctor.push_back(av); @@ -537,117 +550,129 @@ static Value* materializeConstant(T val, Graph& graph, return new_constant; } -at::optional> tryMatchSchema( - const FunctionSchema& schema, - const SourceRange& loc, - Graph& graph, - at::ArrayRef args, - at::ArrayRef kwargs, - std::ostream& failure_messages, - bool convert_tensors_to_nums) { - auto err = [&]() -> std::ostream& { - failure_messages << "\nfor operator " << schema << ":\n"; - return failure_messages; - }; - - std::vector positional_inputs; - std::vector used_kwarg(kwargs.size(), false); - - // if we finish the loop will we have consumed all arguments? 
- size_t used_args = 0; - - for(size_t schema_i = 0; schema_i < schema.arguments.size(); ++schema_i) { - const auto& arg = schema.arguments[schema_i]; - at::optional v; - if(!arg.kwarg_only && schema_i < args.size()) { +at::optional tryMatchSchema( + const FunctionSchema& schema, + const SourceRange& loc, + Graph& graph, + at::ArrayRef raw_args, + at::ArrayRef kwargs, + std::ostream& failure_messages, + bool convert_tensors_to_nums) { + // Match against a potentially mutable schema. + // + // We need to treat mutable schemas differently because the IR explicitly + // expresses effects by including a world token in mutable ops. Users do not + // know about the world token, so we need to generate a dummy one and add + // it to the inputs for schema matching. + // + // Example: + // append(int[] list, int el) + // becomes + // append(World w, int[] list, int el) + // + // NOTE: The dummy world token has no meaning; the AnnotateEffects pass is + // necessary to enforce linearization on effectful ops. + std::vector modifiedArgs(raw_args.begin(), raw_args.end()); + if (schema.is_mutable) { + // Add a dummy world token to be matched against + const auto worldToken = graph.insertDummyWorld(); + modifiedArgs.insert(modifiedArgs.begin(), worldToken); + } + auto err = [&]() -> std::ostream& { + failure_messages << "\nfor operator " << schema << ":\n"; + return failure_messages; + }; - // allow zeros(IntList sizes) to work with zeros(1, 2) or zeros(1) - if (arg.type->kind() == TypeKind::ListType && // the formal must be a list - !arg.N && // it must not be a broadcasting list like int[3], otherwise a single int is a valid input - (schema_i + 1 == schema.arguments.size() || schema.arguments[schema_i + 1].kwarg_only) && // must be the last position argument - !convertibleToList(args[schema_i].value(graph)->type(), arg.type)) { // and the actual should not be a list already + TypeEnv type_env; + std::vector positional_inputs; + std::vector used_kwarg(kwargs.size(), false); + + // if we finish the loop will we have consumed all arguments? 
+ size_t used_args = 0; + + for (size_t schema_i = 0; schema_i < schema.arguments.size(); ++schema_i) { + const auto& arg = schema.arguments[schema_i]; + at::optional v; + if (!arg.kwarg_only && schema_i < modifiedArgs.size()) { + // allow zeros(IntList sizes) to work with zeros(1, 2) or zeros(1) + if (arg.type->kind() == TypeKind::ListType && // the formal must be a list + !arg.N && // it must not be a broadcasting list like int[3], otherwise + // a single int is a valid input + (schema_i + 1 == schema.arguments.size() || + schema.arguments[schema_i + 1] + .kwarg_only)) { // must be the last position argument + auto actual_type = modifiedArgs[schema_i].value(graph)->type(); + if (actual_type->kind() != TypeKind::ListType && + !convertibleToList( + actual_type, + arg.type)) { // and the actual should not be a list already auto elem_type = arg.type->expect()->getElementType(); - Value* list = tryCreateList(elem_type, graph, loc, args.slice(schema_i), - err, convert_tensors_to_nums); - if(!list) + Value* list = tryCreateList( + elem_type, + graph, + loc, + at::ArrayRef(modifiedArgs).slice(schema_i), + err, + convert_tensors_to_nums, + type_env); + if (!list) return at::nullopt; - used_args = args.size(); + used_args = modifiedArgs.size(); positional_inputs.push_back(list); continue; } + } - v = args[schema_i]; - used_args++; - } else if(auto idx = findInputWithName(arg.name, kwargs)) { - const NamedValue& nv = kwargs[*idx]; - if(used_kwarg[*idx]) { - err() << "argument " << nv.name() << " specified twice in schema, submit a bug report!\n" << nv.locOr(loc); - return at::nullopt; - } - used_kwarg[*idx] = true; - v = nv; - } else if(arg.default_value) { - v = NamedValue(*arg.default_value); - } else { - err() << "argument " << schema.arguments[schema_i].name << " not provided.\n" << loc; + v = modifiedArgs[schema_i]; + used_args++; + } else if (auto idx = findInputWithName(arg.name, kwargs)) { + const NamedValue& nv = kwargs[*idx]; + if (used_kwarg[*idx]) { + err() << "argument " << nv.name() + << " specified twice in schema, submit a bug report!\n" + << nv.locOr(loc); return at::nullopt; } - Value * positional = tryMatchArgument(arg, graph, loc, *v, err, convert_tensors_to_nums); - if(!positional) - return at::nullopt; - positional_inputs.push_back(positional); - } - - // check for unused positional arguments - if(used_args < args.size()) { - err() << "expected at most " << used_args << " arguments " - << "but found " << args.size() << " positional arguments.\n" << loc << "\n"; + used_kwarg[*idx] = true; + v = nv; + } else if (arg.default_value) { + v = NamedValue(*arg.default_value); + } else { + err() << "argument " << schema.arguments[schema_i].name + << " not provided.\n" + << loc; return at::nullopt; } - // check for unused kwargs - for(size_t i = 0; i < kwargs.size(); ++i) { - const auto& nv = kwargs[i]; - if (!used_kwarg[i]) { - if(!schema.argumentIndexWithName(nv.name())) { - err() << "keyword argument " << nv.name() << " unknown\n"; - } else { - err() << "keyword argument " << nv.name() << " specified twice\n"; - } - return at::nullopt; + Value* positional = tryMatchArgument( + arg, graph, loc, *v, err, convert_tensors_to_nums, type_env); + if (!positional) + return at::nullopt; + positional_inputs.push_back(positional); + } + + // check for unused positional arguments + if (used_args < modifiedArgs.size()) { + err() << "expected at most " << used_args << " arguments " + << "but found " << modifiedArgs.size() << " positional arguments.\n" + << loc << "\n"; + return at::nullopt; + } + 
// check for unused kwargs + for (size_t i = 0; i < kwargs.size(); ++i) { + const auto& nv = kwargs[i]; + if (!used_kwarg[i]) { + if (!schema.argumentIndexWithName(nv.name())) { + err() << "keyword argument " << nv.name() << " unknown\n"; + } else { + err() << "keyword argument " << nv.name() << " specified twice\n"; } + return at::nullopt; } - return positional_inputs; -} - - -static Value* tryEmitBuiltin( - const std::shared_ptr& op, - std::stringstream& failure_messages, - const SourceRange& loc, - Graph& graph, - Symbol name, - at::ArrayRef inputs, - at::ArrayRef attributes, - bool convert_tensors_to_nums) { - - auto matched_inputs = tryMatchSchema(op->schema(), loc, graph, inputs, attributes, - failure_messages, convert_tensors_to_nums); - if(!matched_inputs) - return nullptr; - // we successfully matched this schema, construct the node - - auto n = graph.insertNode(graph.create(name, *matched_inputs, 0)) - ->setSourceLocation(std::make_shared(loc)); - - for(auto & ret : op->schema().returns) { - n->addOutput()->setType(ret.type); } - - // assert that we did indeed create an op that has implementation - // otherwise schema and dispatch are not in sync - getOperation(n); - - return packOutputs(graph, n->outputs()); + auto return_types = fmap(schema.returns, [&](const Argument& r) { + return evalTypeVariables(r.type, type_env); + }); + return MatchedSchema{std::move(positional_inputs), std::move(return_types)}; } static std::string prefixLine(const std::string& str, std::string prefix) { @@ -662,6 +687,29 @@ static std::string prefixLine(const std::string& str, std::string prefix) { return ss.str(); } +// Given a successful match between operator schema and symbol, emit a node +// with the appropriate inputs and outputs. +static Value* emitBuiltinNode( + const MatchedSchema& matched_schema, + const SourceRange& loc, + Graph& graph, + Symbol name) { + auto n = graph.insertNode(graph.create(name, matched_schema.inputs, 0)) + ->setSourceLocation(std::make_shared(loc)); + + for(auto & ret : matched_schema.return_types) { + n->addOutput()->setType(ret); + } + + // assert that we did indeed create an op that has implementation + // otherwise schema and dispatch are not in sync + getOperation(n); + + return packOutputs(graph, n->outputs()); +} + +// Search for operators matching the provided symbol name and input types. +// If one is found, emit a node to the graph for that operator. 
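+// Registered operators for the symbol are tried first; if none of their
+// schemas match, the builtin script functions from builtin_functions.cpp
+// (e.g. the reversed Scalar-Tensor overloads) are tried via try_emit_call_to.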
Value* emitBuiltinCall( const SourceRange& loc, Graph& graph, @@ -674,23 +722,45 @@ Value* emitBuiltinCall( const auto& variants = getAllOperatorsFor(name); + const auto& builtin_functions = getAllBuiltinFunctionsFor(name); + std::stringstream failure_messages; //first we try to match the schema without any conversion //if no schema matches then insert ImplicitTensorToNum - for(bool convert_tensors_to_nums : {false, true}) { - //clear previous error messages + for (bool convert_tensors_to_nums : {false, true}) { + // clear previous error messages failure_messages.str(""); for (const std::shared_ptr& op : variants) { - if (auto result = tryEmitBuiltin( - op, failure_messages, loc, graph, name, inputs, attributes, + const auto matched_schema = tryMatchSchema( + op->schema(), + loc, + graph, + inputs, + attributes, + failure_messages, + convert_tensors_to_nums); + + if (matched_schema) { + return emitBuiltinNode(*matched_schema, loc, graph, name); + } + } + for (Method* method : builtin_functions) { + if (auto result = try_emit_call_to( + graph, + loc, + *method, + inputs, + attributes, + failure_messages, + nullptr, convert_tensors_to_nums)) { - return result; + return packOutputs(graph, *result); } } } // none of the options worked - if(!required) { + if (!required) { return nullptr; } if(variants.size() == 0) { @@ -719,8 +789,8 @@ std::shared_ptr BuiltinFunction::call( if (value) inputs.push_back(*value); inputs.insert(inputs.end(), inputs_.begin(), inputs_.end()); - return std::make_shared( - emitBuiltinCall(loc, *m.graph(), symbol, inputs, attributes, true)); + return std::make_shared(emitBuiltinCall( + loc, *m.graph(), symbol, inputs, attributes, true)); } inline bool isSupportedListElementType(TypePtr type) { @@ -728,19 +798,6 @@ inline bool isSupportedListElementType(TypePtr type) { type->isSubtypeOf(NumberType::get()); } -// guard for List types we do not currently have operations for -inline void ensureLegalType(const SourceRange& range, TypePtr ptr) { - if(TupleTypePtr tt = ptr->cast()) { - for(auto elem : tt->elements()) { - ensureLegalType(range, elem); - } - } else if(ListTypePtr lt = ptr->cast()) { - if(!isSupportedListElementType(lt->getElementType())) { - throw ErrorReport(range) << "Lists can only contain numbers or Tensors, but found " << lt->getElementType()->str(); - } - } -} - struct to_ir { to_ir( Def def, @@ -791,7 +848,6 @@ struct to_ir { // Record the type for the schema and set the Type on the Value* arguments.push_back(schema.arguments.at(arg_annotation_idx++)); new_input->setType(arguments.back().type); - ensureLegalType((*it).ident().range(), arguments.back().type); } // body auto stmts = def.statements(); @@ -841,6 +897,8 @@ struct to_ir { } method.setSchema({def.name().name(), std::move(arguments), std::move(returns)}); + // annotate effects to prevent reordering + AnnotateEffects(graph); // remove any uses of tuples that we inserted that are not needed LowerSimpleTuples(graph); } @@ -1577,7 +1635,6 @@ struct to_ir { } Value* result = graph->insertNode(graph->createList(elem_type, values)) ->output(); - ensureLegalType(tree->range(), result->type()); return result; } break; case TK_TUPLE_LITERAL: { diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index deef6a5c2ca8f5..745137d1a9ad05 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -164,7 +164,13 @@ TORCH_API void ensureTensors(const SourceRange& range, at::ArrayRef valu // if it returns nullopt, then failure_messages contains a good error 
report // set convert_tensor_to_num to true if ImplicitTensorToNums should be inserted to // match the schema -TORCH_API at::optional> tryMatchSchema( + +struct MatchedSchema { + std::vector inputs; + std::vector return_types; +}; + +TORCH_API at::optional tryMatchSchema( const FunctionSchema& schema, const SourceRange& loc, Graph& graph, diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index f0dfda81cc0926..4c7df820b13b4f 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -114,9 +114,9 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { auto schema = getSchema(inputs.size(), n_binders); std::stringstream failure_messages; - at::optional> all_inputs = + at::optional matched_schema = tryMatchSchema(schema, loc, *m.graph(), inputs_, attributes, failure_messages, /*conv_tensor_to_num*/true); - if (!all_inputs) + if (!matched_schema) throw ErrorReport(loc) << failure_messages.str(); // Release the function object so we can wrap it in a PythonOp @@ -125,12 +125,12 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { Node* new_node = m.graph()->insertNode(m.graph()->createPythonOp( THPObjectPtr(func.release().ptr()), cconv, {})); new_node->setSourceLocation(std::make_shared(loc)); - for(auto &i : *all_inputs) + for(auto &i : matched_schema->inputs) new_node->addInput(i); std::vector outputs; - for(auto & ret_arg : schema.returns) { - outputs.push_back(new_node->addOutput()->setType(ret_arg.type)); + for(auto & ret_arg : matched_schema->return_types) { + outputs.push_back(new_node->addOutput()->setType(ret_arg)); } return std::make_shared(packOutputs(*m.graph(), outputs)); } @@ -371,7 +371,14 @@ void initJitScriptBindings(PyObject* module) { // public. py::class_>(m, "ScriptModule") .def(py::init<>()) - .def("save", &Module::save) + .def("save", [](std::shared_ptr m, const std::string& filename) { + m->save(filename); + }) + .def("save_to_buffer", [](std::shared_ptr m) { + std::ostringstream buf; + m->save(buf); + return py::bytes(buf.str()); + }) .def("_set_optimized", &Module::set_optimized) .def( "_define", @@ -534,7 +541,13 @@ void initJitScriptBindings(PyObject* module) { }); m.def("merge_type_from_type_comment", &mergeTypesFromTypeComment); - m.def("import_ir_module", import_ir_module); + m.def("import_ir_module", [](ModuleLookup module_lookup, const std::string& filename) { + import_ir_module(module_lookup, filename); + }); + m.def("import_ir_module_from_buffer", [](ModuleLookup module_lookup, const std::string& buffer) { + std::istringstream in(buffer); + import_ir_module(module_lookup, in); + }); } } // namespace script diff --git a/torch/csrc/jit/script/module.cpp b/torch/csrc/jit/script/module.cpp index b1f6a6e220bbc9..61261a352d456e 100644 --- a/torch/csrc/jit/script/module.cpp +++ b/torch/csrc/jit/script/module.cpp @@ -37,8 +37,15 @@ const FunctionSchema& Method::getSchema() const { return *schema; } -std::vector Method::emit_call_to(SourceRange loc, Method & callee, ArrayRef args, ArrayRef kwargs) { - JIT_ASSERT(!executor); +at::optional> try_emit_call_to( + Graph& graph, + SourceRange loc, + Method& callee, + ArrayRef args, + ArrayRef kwargs, + std::stringstream& failure_messages, + Method* caller, + bool conv_tensors_to_nums) { try { callee.ensure_defined(); } catch (RecursiveMethodCallError&) { @@ -47,19 +54,38 @@ std::vector Method::emit_call_to(SourceRange loc, Method & callee, Array } auto fn = callee.graph(); - std::stringstream failure_messages; - auto all_inputs = tryMatchSchema( + auto 
matched_schema = tryMatchSchema( callee.getSchema(), - loc, *graph(), args, kwargs, failure_messages, /*conv_tensors_to_nums*/true); - if(!all_inputs) - throw ErrorReport(loc) << failure_messages.str(); + loc, graph, args, kwargs, failure_messages, conv_tensors_to_nums); + if(!matched_schema) + return at::nullopt; // parameters to callee method (which become parameters to _this_ method // if they were not already) - for(at::Tensor* member : callee.member_inputs) { - all_inputs->push_back(get_or_add_parameter(member)); + for(at::Tensor* member : callee.params()) { + if(!caller) { + throw ErrorReport(loc) << " attempting to call a method with parameters from a raw graph. File a bug report"; + } + matched_schema->inputs.push_back(caller->get_or_add_parameter(member)); } - return inlineCallTo(*graph(), *callee.graph(), *all_inputs); + return inlineCallTo(graph, *callee.graph(), matched_schema->inputs); +} + +std::vector Method::emit_call_to(SourceRange loc, Method & callee, ArrayRef args, ArrayRef kwargs) { + JIT_ASSERT(!executor); + std::stringstream failure_messages; + if (auto result = try_emit_call_to( + *graph(), + loc, + callee, + args, + kwargs, + failure_messages, + this, + /*conv_tensors_to_nums=*/true)) { + return *result; + } + throw ErrorReport(loc) << failure_messages.str(); } void Method::ensure_defined() { @@ -71,6 +97,10 @@ void Method::ensure_defined() { } } +void Module::save(std::ostream& out) { + ExportModule(*this, out); +} + void Module::save(const std::string& filename) { ExportModule(*this, filename); } diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 50ae9f48fb3c93..cafd084ce2cca3 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -20,6 +20,7 @@ #include #include #include +#include // This file contains classes which assist in desugaring Python style // modules and their methods into flattened graphs which don't have any @@ -84,6 +85,7 @@ struct Method { // defined here to keep details of member_input handling confined to this class std::vector emit_call_to(SourceRange loc, Method & callee, ArrayRef args, ArrayRef kwargs); + // if this isn't yet defined, run its method_creator function void ensure_defined(); @@ -376,6 +378,8 @@ struct Module { return get_method(method_name)({IValue(std::forward(args))...}); } + void save(std::ostream& out); + void save(const std::string& filename); private: @@ -390,4 +394,18 @@ struct Module { bool optimize; }; +// returns at::nullopt and fills in failure_messages if the callee does not +// match the functions schema +at::optional> try_emit_call_to( + Graph& graph, + SourceRange loc, + Method& callee, + ArrayRef args, + ArrayRef kwargs, + std::stringstream& failure_messages, + // when callee uses no parameters (e.g. it is a function in a compilation unit, + // and not a method), then nullptr can be passed as caller. 
+ Method* caller, + bool conv_tensors_to_nums); + }}} diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index 14e5e4f5ae1354..64f7f9c8db935a 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -64,6 +64,8 @@ struct Parser { std::vector exprs = { prefix }; while(L.cur().kind != end) { L.expect(','); + if (L.cur().kind == end) + break; exprs.push_back(parseExp()); } auto list = List::create(prefix.range(), exprs); diff --git a/torch/csrc/jit/serialization.h b/torch/csrc/jit/serialization.h index a4ebd864ac6cc3..9fc8d4a1b688dd 100644 --- a/torch/csrc/jit/serialization.h +++ b/torch/csrc/jit/serialization.h @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include namespace torch { namespace jit { @@ -75,25 +78,16 @@ namespace { static constexpr uint64_t kFileFormatVersion = 0x1L; static constexpr uint8_t kPadValue = 0xEF; - void wrapPErrorAndThrow(const std::string& msg) { - std::ostringstream oss; - oss << msg << " : " << strerror(errno); - throw std::runtime_error(oss.str()); - } } // namespace -class PyTorchFileReader { +class PyTorchStreamReader { public: - PyTorchFileReader(std::string filename) { - fp = std::fopen(filename.c_str(), "rb"); - if (!fp) { - wrapPErrorAndThrow("Couldn't open file for reading!"); - } + PyTorchStreamReader(std::istream& in_) : in(in_) { // Store file size so we know when we're done reading because the f* APIs // don't do a good job of that - std::fseek(fp, 0L, SEEK_END); - file_size = std::ftell(fp); - std::fseek(fp, 0L, SEEK_SET); + in.seekg(0L, in.end); + file_size = in.tellg(); + in.seekg(0L); readAndValidateFileHeader(); // Do this now since we're reasonably sure this is actually a PyT file from // the header. @@ -115,7 +109,7 @@ class PyTorchFileReader { } // Seek to the provided offset cursor = key; - std::fseek(fp, cursor, SEEK_SET); + in.seekg(cursor); auto tag = read64BitIntegerLittleEndian(); if (tag != RecordTags::STORAGE) { throw std::runtime_error("Attempted to read a record of non-storage type"); @@ -124,18 +118,16 @@ class PyTorchFileReader { seekToNextAlignmentBoundary(); auto ptr = malloc(size); at::DataPtr retval(ptr, ptr, free, at::kCPU); - if (!std::fread(ptr, size, 1, fp)) { - wrapPErrorAndThrow("Failed to read data from record"); - } + + in.read((char*)ptr, size); cursor += size; seekToNextAlignmentBoundary(); return std::tuple(std::move(retval), size); } - ~PyTorchFileReader() { - std::fclose(fp); + ~PyTorchStreamReader() { } private: - FILE *fp; + std::istream& in; size_t cursor = 0; size_t file_size; size_t last_record_offset; @@ -144,8 +136,9 @@ class PyTorchFileReader { uint64_t read64BitIntegerLittleEndian() { uint64_t retval; // TODO endian swap on platforms that need it? - size_t read_bytes = std::fread(&retval, 1u, 8u, fp); - if (read_bytes != 8u) { + in.read(reinterpret_cast(&retval), 8); + std::streamsize read_bytes = in.gcount(); + if (read_bytes != 8) { std::ostringstream errmsg; errmsg << "Expected to read 8 bytes but got " << read_bytes; throw std::runtime_error(errmsg.str()); @@ -158,7 +151,7 @@ class PyTorchFileReader { size_t next_offset = (cursor + kFieldAlignment) - (cursor % kFieldAlignment); size_t pad_amount = next_offset - cursor; cursor += pad_amount; - std::fseek(fp, cursor, SEEK_SET); + in.seekg(cursor); } // File format deserialization functions @@ -183,7 +176,7 @@ class PyTorchFileReader { // Seek to location of file footer. 
We've already validated that the file // length is a multiple of the alignment size cursor = file_size - kFieldAlignment; - std::fseek(fp, cursor, SEEK_SET); + in.seekg(cursor); auto tag = read64BitIntegerLittleEndian(); if (tag != RecordTags::FOOTER) { throw std::runtime_error("File footer has wrong record type. Is this" @@ -197,13 +190,9 @@ class PyTorchFileReader { } }; -class PyTorchFileWriter { +class PyTorchStreamWriter { public: - PyTorchFileWriter(const std::string& filename) { - fp = std::fopen(filename.c_str(), "wb"); - if (!fp) { - wrapPErrorAndThrow("Unable to open PyTorch file for writing!"); - } + PyTorchStreamWriter(std::ostream& out_) : out(out_) { writeFileHeader(); // In the case that we do not write any records into this file, the last // record index written into the footer will point to the footer itself. @@ -224,15 +213,14 @@ class PyTorchFileWriter { JIT_ASSERT(!finalized); writeFileFooter(); finalized = true; - std::fclose(fp); } - ~PyTorchFileWriter() { + ~PyTorchStreamWriter() { if (!finalized) { writeEndOfFile(); } } private: - FILE *fp; + std::ostream& out; size_t cursor = 0; bool finalized = false; size_t last_record_idx = 0; @@ -240,17 +228,13 @@ class PyTorchFileWriter { // Utility functions void write64BitIntegerLittleEndian(const uint64_t value) { // TODO endian swap on platforms that need it? - if (!std::fwrite(&value, 8u, 1u, fp)) { - wrapPErrorAndThrow("Unable to write to file!"); - } + out.write(reinterpret_cast(&value), 8); cursor += 8u; } void writePad(const size_t num_bytes) { static std::vector pad_buffer(kPadValue, kFieldAlignment); - if (!std::fwrite(pad_buffer.data(), num_bytes, 1u, fp)) { - wrapPErrorAndThrow("Unable to write to file!"); - } + out.write(pad_buffer.data(), num_bytes); cursor += num_bytes; } @@ -261,9 +245,7 @@ class PyTorchFileWriter { } void writeBuffer(const char* data, size_t size) { - if (!std::fwrite(data, size, 1u, fp)) { - wrapPErrorAndThrow("Unable to write to file!"); - } + out.write(data, size); cursor += size; } @@ -281,5 +263,43 @@ class PyTorchFileWriter { } }; +class PyTorchFileReader { + public: + PyTorchFileReader(const std::string& filename) : + in(filename, std::ios_base::binary), + stream_reader(in) {} + + std::tuple getLastRecord() { + return stream_reader.getLastRecord(); + } + + std::tuple getRecordWithKey(uint64_t key) { + return stream_reader.getRecordWithKey(key); + } + + private: + std::ifstream in; + PyTorchStreamReader stream_reader; +}; + +class PyTorchFileWriter { + public: + PyTorchFileWriter(const std::string& filename) : + out(filename, std::ios_base::binary), + stream_writer(out) {} + + uint64_t writeRecord(const char* data, size_t size) { + return stream_writer.writeRecord(data, size); + } + + void writeEndOfFile() { + stream_writer.writeEndOfFile(); + out.close(); + } + + private: + std::ofstream out; + PyTorchStreamWriter stream_writer; +}; }} // namespace torch::jit diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index a4ada2647af7f5..855adad429191f 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -55,6 +55,10 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "string"; } else if(t.kind() == TypeKind::GeneratorType) { out << "Generator"; + } else if(t.kind() == TypeKind::VarType) { + out << t.expect()->name(); + } else if(t.kind() == TypeKind::WorldType) { + out << "World"; } else { AT_ERROR("unknown type kind"); } @@ -89,6 +93,10 @@ GeneratorTypePtr GeneratorType::get() { static auto value = GeneratorType::create(); return value; } 
+WorldTypePtr WorldType::get() { + static auto value = WorldType::create(); + return value; +} StringTypePtr StringType::get() { static auto value = StringType::create(); return value; @@ -170,4 +178,71 @@ at::optional unifyTypes(const TypePtr& t1, const TypePtr& t2) { return at::nullopt; } +TypePtr matchTypeVariables(TypePtr formal, TypePtr actual, TypeEnv& type_env) { + if(!formal->hasFreeVariables()) + return formal; + if(auto vt = formal->cast()) { + auto it = type_env.find(vt->name()); + if(it == type_env.end()) { + type_env[vt->name()] = actual; + return actual; + } else if(auto unified = unifyTypes(it->second, actual)) { + type_env[vt->name()] = *unified; + return *unified; + } + std::stringstream ss; + ss << "type variable '" << vt->name() <<"' previously matched to type " << + it->second->str() << " is matched to type " << actual->str(); + throw TypeMatchError(ss.str()); + } else if(auto lt_formal = formal->cast()) { + if(auto lt_actual = actual->cast()) { + return ListType::create(matchTypeVariables(lt_formal->getElementType(), lt_actual->getElementType(), type_env)); + } else { + std::stringstream ss; + ss << "cannot match a list to " << actual->str(); + throw TypeMatchError(ss.str()); + } + } else if(auto tp_formal = formal->cast()) { + if(auto tp_actual = actual->cast()) { + if(tp_formal->elements().size() != tp_actual->elements().size()) { + std::stringstream ss; + throw TypeMatchError("cannot match tuples of mismatched size"); + } + std::vector elements; + for(size_t i = 0; i < tp_formal->elements().size(); ++i) { + TypePtr result = matchTypeVariables( + tp_formal->elements()[i], + tp_actual->elements()[i], + type_env); + elements.push_back(result); + } + return TupleType::create(std::move(elements)); + } else { + std::stringstream ss; + ss << "cannot match a tuple to " << actual->str(); + throw TypeMatchError(ss.str()); + } + } + AT_ERROR("unhandled free variable container: ", formal->str()); +} + +// change return types like List[List[t]] into List[List[int]] +TORCH_API TypePtr evalTypeVariables(TypePtr type, std::unordered_map& type_env) { + if(!type->hasFreeVariables()) + return type; + + if(auto vt = type->cast()) { + auto it = type_env.find(vt->name()); + AT_ASSERTM(it != type_env.end(), "schema has unbound type variable '", vt->name(), "' in its return type"); + return it->second; + } else if(auto lt = type->cast()) { + return ListType::create(evalTypeVariables(lt->getElementType(), type_env)); + } else if(auto tp = type->cast()) { + return TupleType::create(fmap(tp->elements(), [&](const TypePtr& typ) { + return evalTypeVariables(typ, type_env); + })); + } + return type; +} + }} // namespace torch::jit diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index 96e9f45496a34b..49748de239e2b2 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -27,6 +27,8 @@ _(IntType) \ _(NoneType) \ _(StringType) \ _(GeneratorType) \ +_(VarType) \ +_(WorldType) \ enum class TypeKind { #define DEFINE_TYPE(T) T, @@ -133,6 +135,9 @@ struct TORCH_API Type : std::enable_shared_from_this { return r; } virtual ~Type() = default; + virtual bool hasFreeVariables() const { + return false; + } }; inline bool operator!=(const Type & lhs, const Type & rhs) { @@ -366,6 +371,32 @@ struct TORCH_API CompleteTensorType : public TensorType { std::vector strides_; }; +// This type is a token used to represent effectful computation in the IR. +// See the AnnotateEffects pass for how it is used. 
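+// Mutable operators (e.g. aten::append) take a World token as an extra input
+// and return a new one, so the token's def-use chain orders effectful ops and
+// prevents optimization passes from reordering them.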
+struct WorldType; +using WorldTypePtr = std::shared_ptr; +struct TORCH_API WorldType : public Type { + template + static WorldTypePtr create(T&&... all) { + return WorldTypePtr(new WorldType(std::forward(all)...)); + } + bool operator==(const Type& rhs) const override { + return rhs.kind() == kind(); + } + std::string str() const override { + return "world"; + } + bool isSubtypeOf(const TypePtr rhs) const override { + return *this == *rhs; + } + static const TypeKind Kind = TypeKind::WorldType; + // global singleton + static WorldTypePtr get(); + + private: + WorldType() : Type(TypeKind::WorldType) {} +}; + struct ListType; using ListTypePtr = std::shared_ptr; @@ -400,6 +431,9 @@ struct TORCH_API ListType : public Type { TypePtr getElementType() const { return elem; } + bool hasFreeVariables() const override { + return has_free_variables_; + } // common cast List[Tensor] static ListTypePtr ofTensors(); static ListTypePtr ofInts(); @@ -408,8 +442,11 @@ struct TORCH_API ListType : public Type { static const TypeKind Kind = TypeKind::ListType; private: ListType(TypePtr elem) - : Type(TypeKind::ListType), elem(std::move(elem)) {} + : Type(TypeKind::ListType) + , elem(std::move(elem)) + , has_free_variables_(getElementType()->hasFreeVariables()) {} TypePtr elem; + bool has_free_variables_; }; struct TupleType; @@ -461,12 +498,20 @@ struct TORCH_API TupleType : public Type { ss << "]"; return ss.str(); } + bool hasFreeVariables() const override { + return has_free_variables_; + } static const TypeKind Kind = TypeKind::TupleType; private: TupleType(std::vector elements_) : Type(TypeKind::TupleType) - , elements_(std::move(elements_)) {} + , elements_(std::move(elements_)) { + has_free_variables_ = + std::any_of(elements_.begin(), elements_.end(), [](TypePtr v) { + return v->hasFreeVariables(); + }); + } bool compare(const Type& rhs, std::function fn) const { if(rhs.kind() != kind()) @@ -482,6 +527,7 @@ struct TORCH_API TupleType : public Type { return true; } std::vector elements_; + bool has_free_variables_; }; struct NumberType; @@ -631,6 +677,34 @@ struct GeneratorType : public Type { }; +// a type variable, used in FunctionSchema +struct VarType; +using VarTypePtr = std::shared_ptr; +struct VarType : public Type { + static constexpr bool is_singleton = false; + template + static VarTypePtr create(std::string name_) { + return VarTypePtr(new VarType(std::move(name_))); + } + bool operator==(const Type& rhs) const override { + return rhs.kind() == kind(); + } + std::string str() const override { + return name(); + } + static const TypeKind Kind = TypeKind::VarType; + const std::string& name() const { + return name_; + } + bool hasFreeVariables() const override { + return true; + } +private: + VarType(std::string name_) + : Type(TypeKind::VarType), name_(name_) {} + std::string name_; +}; + TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); // what is the type, ignoring extra size/shape information? // e.g. Tensor(2x3) -> Dynamic, and Tuple(Tensor(2x3),...) -> Tuple(Dynamic,...) 
@@ -689,4 +763,17 @@ template<> inline TypePtr getTypePtr>() { return ListType:: TORCH_API TypePtr inferTypeFrom(const IValue& value); +struct TORCH_API TypeMatchError : public std::exception { + TypeMatchError(std::string msg_) + : msg_(std::move(msg_)) {} + const char * what() const noexcept override { + return msg_.c_str(); + } +private: + std::string msg_; +}; +using TypeEnv = std::unordered_map; +TORCH_API TypePtr matchTypeVariables(TypePtr formal, TypePtr actual, TypeEnv & type_env); +TORCH_API TypePtr evalTypeVariables(TypePtr type, TypeEnv & type_env); + }} // namespace torch::jit diff --git a/torch/csrc/serialization.cpp b/torch/csrc/serialization.cpp index eaf93b92be14bb..de98d278d11a10 100644 --- a/torch/csrc/serialization.cpp +++ b/torch/csrc/serialization.cpp @@ -4,34 +4,41 @@ #include "THP.h" #include "serialization.h" -static ssize_t doPythonReadBuffered(PyObject* fildes, void* buf, size_t nbytes); -static ssize_t doPythonReadInto(PyObject* fildes, void* buf, size_t nbytes); -static ssize_t doPythonWrite(PyObject* fildes, void* buf, size_t nbytes); +template +ssize_t doPartialRead(io fildes, void* buf, size_t nbytes); + +template +ssize_t doPartialWrite(io fildes, void* buf, size_t nbytes); + +static ssize_t doPartialPythonReadBuffered(PyObject* fildes, void* buf, size_t nbytes); +static ssize_t doPartialPythonReadInto(PyObject* fildes, void* buf, size_t nbytes); +static ssize_t doPartialPythonWrite(PyObject* fildes, void* buf, size_t nbytes); template <> -ssize_t doRead(int fildes, void* buf, size_t nbytes) { +ssize_t doPartialRead(int fildes, void* buf, size_t nbytes) { return read(fildes, buf, nbytes); } template <> -ssize_t doRead(PyObject* fildes, void* buf, size_t nbytes) { +ssize_t doPartialRead(PyObject* fildes, void* buf, size_t nbytes) { // Try to use fildes.readinto() instead of fildes.read() // because it is more memory efficient. + // TODO: Stop calling PyObject_HasAttrString() in a loop on our read loop auto has_readinto = PyObject_HasAttrString(fildes, "readinto") == 1; if (has_readinto) { - return doPythonReadInto(fildes, buf, nbytes); + return doPartialPythonReadInto(fildes, buf, nbytes); } - return doPythonReadBuffered(fildes, buf, nbytes); + return doPartialPythonReadBuffered(fildes, buf, nbytes); } template <> -ssize_t doWrite(int fildes, void* buf, size_t nbytes) { +ssize_t doPartialWrite(int fildes, void* buf, size_t nbytes) { return write(fildes, buf, nbytes); } template <> -ssize_t doWrite(PyObject* fildes, void* buf, size_t nbytes) { - return doPythonWrite(fildes, buf, nbytes); +ssize_t doPartialWrite(PyObject* fildes, void* buf, size_t nbytes) { + return doPartialPythonWrite(fildes, buf, nbytes); } static inline bool isUnsupportedOperation() { @@ -43,39 +50,39 @@ static inline bool isUnsupportedOperation() { } // Call Python fildes.read(nbytes) and copy it to buf. -static inline ssize_t doPythonReadBuffered(PyObject* fildes, void* buf, size_t nbytes) { - const size_t buffer_size = 262144; // 2^18 - size_t read_bytes = 0; - - while (read_bytes < nbytes) { - auto remaining = nbytes - read_bytes; - auto to_read = remaining > buffer_size ? buffer_size : remaining; - THPObjectPtr r(PyObject_CallMethod(fildes, "read", "i", to_read)); - if (!r) throw python_error(); - - // read output is String (Python 2) / Bytes (Python 3) +static inline ssize_t doPartialPythonReadBuffered(PyObject* fildes, void* buf, size_t raw_nbytes) { + // If we request a large amount of data, f.read() will internally try to + // allocate a buffer of that size. 
This is counterproductive, because + // it's not the buffer we ultimately want to write the data into. Read + // less than that and avoid allocating too much extra memory. + // TODO: Maybe 260 KB is a bit small... + const size_t nbytes = std::min(raw_nbytes, 262144u); // 2^18 (~260 KB) + + THPObjectPtr r(PyObject_CallMethod(fildes, "read", "i", nbytes)); + if (!r) throw python_error(); + + // read output is String (Python 2) / Bytes (Python 3) #if PY_MAJOR_VERSION >= 3 - auto size = PyBytes_GET_SIZE(r.get()); - const void* bytes = PyBytes_AsString(r.get()); + auto size = PyBytes_GET_SIZE(r.get()); + const void* py_buf = PyBytes_AsString(r.get()); #else - auto size = PyString_GET_SIZE(r.get()); - const void* bytes = PyString_AsString(r.get()); + auto size = PyString_GET_SIZE(r.get()); + const void* py_buf = PyString_AsString(r.get()); #endif - // we read EOF - if (size == 0) { - return read_bytes; - } + // we read EOF + if (size == 0) { + return 0; + } - memcpy(reinterpret_cast(buf) + read_bytes, bytes, size); - read_bytes += size; - } // Reading loop + // Slurp it into the buffer we actually want + memcpy(buf, py_buf, size); - return read_bytes; + return size; } // Either does fildes.readinto(buf) or fildes.write(buf) -static inline ssize_t doPythonIO(PyObject* fildes, void* buf, size_t nbytes, bool is_read) { +static inline ssize_t doPartialPythonIO(PyObject* fildes, void* buf, size_t nbytes, bool is_read) { #if PY_MAJOR_VERSION >= 3 auto rw_flag = is_read ? PyBUF_WRITE : PyBUF_READ; THPObjectPtr memview(PyMemoryView_FromMemory( @@ -97,19 +104,77 @@ static inline ssize_t doPythonIO(PyObject* fildes, void* buf, size_t nbytes, boo // fildes.readinto can return UnsupportedOperation so fall back to fildes.read. if (is_read && isUnsupportedOperation()) { PyErr_Clear(); - return doPythonReadBuffered(fildes, buf, nbytes); + return doPartialPythonReadBuffered(fildes, buf, nbytes); } throw python_error(); } // Call Python fildes.readinto(buf) -static ssize_t doPythonReadInto(PyObject* fildes, void* buf, size_t nbytes) { - return doPythonIO(fildes, buf, nbytes, /* is_read */ true); +static ssize_t doPartialPythonReadInto(PyObject* fildes, void* buf, size_t nbytes) { + return doPartialPythonIO(fildes, buf, nbytes, /* is_read */ true); } // Call Python fildes.write(buf) -static ssize_t doPythonWrite(PyObject* fildes, void* buf, size_t nbytes) { - return doPythonIO(fildes, buf, nbytes, /* is_read */ false); +static ssize_t doPartialPythonWrite(PyObject* fildes, void* buf, size_t nbytes) { + return doPartialPythonIO(fildes, buf, nbytes, /* is_read */ false); +} + +// Requires that we read EXACTLY nbytes; fails if we don't. +template +void doRead(io fildes, void* raw_buf, size_t nbytes) { + char* buf = static_cast(raw_buf); + while (nbytes > 0) { + errno = 0; // doPartialRead may not set errno + // we read in 1GB blocks to avoid bugs on Mac OS X Lion + // see https://github.com/pytorch/pytorch/issues/1031 for more details + ssize_t r = doPartialRead(fildes, buf, std::min(nbytes, 1073741824)); + if (r < 0) { + int err = errno; + AT_ASSERTM(err != 0, "read(): impossible! r < 0, but no errno was set"); + AT_ASSERTM(err != EAGAIN, "read(): non-blocking fd ", fildes, + " read EAGAIN; cowardly refusing to spin-wait"); + if (err == EINTR) { + continue; + } else { + AT_ERROR("read(): fd ", fildes, " failed with ", strerror(err)); + } + } else if (r == 0) { + break; + } + buf += r; + // This is guaranteed by POSIX, but I just want to be double-sure + // to not underflow a signed integer. 
+ AT_ASSERT(static_cast(r) <= nbytes); + nbytes -= r; + } + if (nbytes != 0) { + AT_ERROR("unexpected EOF, expected ", nbytes, " more bytes. The file might be corrupted."); + } +} + +template +void doWrite(io fildes, void* raw_buf, size_t nbytes) { + char* buf = static_cast(raw_buf); + while (nbytes > 0) { + errno = 0; // doPartialWrite may not set errno + // we write in 1GB blocks to avoid bugs on Mac OS X Lion + // see https://github.com/pytorch/pytorch/issues/1031 for more details + ssize_t r = doPartialWrite(fildes, buf, std::min(nbytes, 1073741824)); + if (r < 0) { + int err = errno; + AT_ASSERTM(err != 0, "write(): impossible! r < 0, but no errno was set"); + AT_ASSERTM(err != EAGAIN, "write(): non-blocking fd ", fildes, + " read EAGAIN; cowardly refusing to spin-wait"); + if (err == EINTR) { + continue; + } else { + AT_ERROR("write(): fd ", fildes, " failed with ", strerror(err)); + } + } + buf += r; + AT_ASSERT(static_cast(r) <= nbytes); + nbytes -= r; + } } #include "generic/serialization.cpp" diff --git a/torch/csrc/serialization.h b/torch/csrc/serialization.h index 410619a68422c5..df811052fe7cda 100644 --- a/torch/csrc/serialization.h +++ b/torch/csrc/serialization.h @@ -8,9 +8,9 @@ #include template -ssize_t doRead(io fildes, void* buf, size_t nbytes); +void doRead(io fildes, void* buf, size_t nbytes); template -ssize_t doWrite(io fildes, void* buf, size_t nbytes); +void doWrite(io fildes, void* buf, size_t nbytes); #endif diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py index 014a07e53c9532..345fe35ceee614 100644 --- a/torch/distributions/multivariate_normal.py +++ b/torch/distributions/multivariate_normal.py @@ -125,27 +125,29 @@ def __init__(self, loc, covariance_matrix=None, precision_matrix=None, scale_tri if scale_tril.dim() < 2: raise ValueError("scale_tril matrix must be at least two-dimensional, " "with optional leading batch dimensions") - self._unbroadcasted_scale_tril = scale_tril self.scale_tril, loc_ = torch.broadcast_tensors(scale_tril, loc_) elif covariance_matrix is not None: if covariance_matrix.dim() < 2: raise ValueError("covariance_matrix must be at least two-dimensional, " "with optional leading batch dimensions") - self._unbroadcasted_scale_tril = _batch_potrf_lower(covariance_matrix) self.covariance_matrix, loc_ = torch.broadcast_tensors(covariance_matrix, loc_) else: if precision_matrix.dim() < 2: raise ValueError("precision_matrix must be at least two-dimensional, " "with optional leading batch dimensions") - covariance_matrix = _batch_inverse(precision_matrix) - self._unbroadcasted_scale_tril = _batch_potrf_lower(covariance_matrix) - self.covariance_matrix, self.precision_matrix, loc_ = torch.broadcast_tensors( - covariance_matrix, precision_matrix, loc_) + self.precision_matrix, loc_ = torch.broadcast_tensors(precision_matrix, loc_) self.loc = loc_[..., 0] # drop rightmost dim batch_shape, event_shape = self.loc.shape[:-1], self.loc.shape[-1:] super(MultivariateNormal, self).__init__(batch_shape, event_shape, validate_args=validate_args) + if scale_tril is not None: + self._unbroadcasted_scale_tril = scale_tril + else: + if precision_matrix is not None: + self.covariance_matrix = _batch_inverse(precision_matrix).expand_as(loc_) + self._unbroadcasted_scale_tril = _batch_potrf_lower(self.covariance_matrix) + def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(MultivariateNormal, _instance) batch_shape = torch.Size(batch_shape) diff --git a/torch/distributions/studentT.py 
b/torch/distributions/studentT.py index 6530940b328e7f..3e995d1477faed 100644 --- a/torch/distributions/studentT.py +++ b/torch/distributions/studentT.py @@ -41,8 +41,8 @@ def variance(self): def __init__(self, df, loc=0., scale=1., validate_args=None): self.df, self.loc, self.scale = broadcast_all(df, loc, scale) - self._chi2 = Chi2(df) - batch_shape = torch.Size() if isinstance(df, Number) else self.df.size() + self._chi2 = Chi2(self.df) + batch_shape = self.df.size() super(StudentT, self).__init__(batch_shape, validate_args=validate_args) def expand(self, batch_shape, _instance=None): diff --git a/torch/functional.py b/torch/functional.py index 0eac8f16741766..4290f78585a965 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -439,6 +439,8 @@ def unique(input, sorted=False, return_inverse=False, dim=None): before returning as output. return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. + dim (int): the dimension to apply unique. If ``None``, the unique of the + flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional)): A tensor or a tuple of tensors containing @@ -646,8 +648,9 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None): Args: input (Tensor): the input tensor - p ({int, float, inf, -inf, 'fro', 'nuc'}): the order of norm + p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. Default: ``'fro'`` The following norms can be calculated: + ===== ============================ ========================== ord matrix norm vector norm ===== ============================ ========================== @@ -656,20 +659,22 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None): 'nuc' nuclear norm -- Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) ===== ============================ ========================== - dim ({int, 2-tuple of ints, 2-list of ints}, optional): If it is an int, - vector norm will be calculated, if it is 2-tuple of ints, matrix norm - will be calculated. If the value is None, matrix norm will be calculated - when the input tensor only has two dimensions, vector norm will be - calculated when the input tensor only has one dimension. If the input - tensor has more than two dimensions, the vector norm will be applied to - last dimension. - keepdim (bool): whether the output tensors have :attr:`dim` - retained or not. Ignored if attr:`dim`=``None`` and - :attr:`out`=``None``. + + dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, + vector norm will be calculated, if it is 2-tuple of ints, matrix norm + will be calculated. If the value is None, matrix norm will be calculated + when the input tensor only has two dimensions, vector norm will be + calculated when the input tensor only has one dimension. If the input + tensor has more than two dimensions, the vector norm will be applied to + last dimension. + keepdim (bool, optional): whether the output tensors have :attr:`dim` + retained or not. Ignored if :attr:`dim` = ``None`` and + :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if - attr:`dim`=``None`` and :attr:`out`=``None``. + :attr:`dim` = ``None`` and :attr:`out` = ``None``. 
Example:: + >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f7cea00e6292fd..5fd90b5fd95382 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -20,6 +20,8 @@ import numbers import collections import re +if sys.version_info[0] > 2: + import pathlib def _parse_env(name, default, true_message, false_message): @@ -58,19 +60,27 @@ def scope(scope_name): tracing_state.pop_scope() -def load(filename): +def load(f): r""" - Load a ``ScriptModule`` previously saved with :func:`save ` + Load a ``ScriptModule`` previously saved with :func:`save ` .. DANGER:: All previously saved modules, no matter their device, are always loaded onto the CPU. This is different from :func:`torch.load`'s semantics and may change in the future. Arguments: - filename (string): the file to load + f: a file-like object (has to implement read, readline, tell, and seek), + or a string containing a file name Returns: A ``ScriptModule`` object. + + Example: + >>> torch.jit.load('scriptmodule.pt') + # Load ScriptModule from io.BytesIO object + >>> with open('scriptmodule.pt', 'rb') as f: + buffer = io.BytesIO(f.read()) + >>> torch.jit.load(buffer) """ m = ScriptModule() @@ -82,10 +92,48 @@ def module_lookup(names): curr = getattr(curr, name) return curr - torch._C.import_ir_module(module_lookup, filename) + if isinstance(f, str) or \ + (sys.version_info[0] == 2 and isinstance(f, unicode)) or \ + (sys.version_info[0] == 3 and isinstance(f, pathlib.Path)): + torch._C.import_ir_module(module_lookup, f) + else: + torch._C.import_ir_module_from_buffer(module_lookup, f.read()) return m +def save(m, f): + """ + Saves a ScriptModule to a file. + + Args: + m: a ScriptModule to save + f: a file-like object (has to implement write and flush) or a string + containing a file name + + .. warning:: + If you are using Python 2, torch.save does NOT support StringIO.StringIO + as a valid file-like object. This is because the write method should return + the number of bytes written; StringIO.write() does not do this. + + Please use something like io.BytesIO instead. + + Example: + >>> m = torch.jit.ScriptModule() + >>> # Save to file + >>> torch.jit.save(m, 'scriptmodule.pt') + >>> # Save to io.BytesIO buffer + >>> buffer = io.BytesIO() + >>> torch.jit.save(m, buffer) + """ + if isinstance(f, str) or \ + (sys.version_info[0] == 2 and isinstance(f, unicode)) or \ + (sys.version_info[0] == 3 and isinstance(f, pathlib.Path)): + m.save(f) + else: + ret = m.save_to_buffer() + f.write(ret) + + def get_trace_graph(f, args=(), kwargs=None): """ Trace a function or model, returning a tuple consisting of the both the @@ -317,6 +365,8 @@ def __init__(self, graph_diff_error, tensor_compare_error, extra_msg=None): # Check the traced module against a set of user-provided validation inputs @torch.no_grad() def _check_trace(check_inputs, func, executor_options, module, check_tolerance): + # Note: tracing is independent of optimizations, which consume the trace + executor_options['optimize'] = False for inputs in check_inputs: if isinstance(inputs, torch.Tensor): inputs = (inputs,) diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index a1bfcbc08e097e..313bad93fea4bd 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -664,10 +664,10 @@ class ConvTranspose2d(_ConvTransposeMixin, _ConvNd): - Input: :math:`(N, C_{in}, H_{in}, W_{in})` - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where - .. math:: + .. 
math:: H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] + \text{output\_padding}[0] - + .. math:: W_{out} = (W_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{kernel\_size}[1] + \text{output\_padding}[1] @@ -806,13 +806,13 @@ class ConvTranspose3d(_ConvTransposeMixin, _ConvNd): - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` where - .. math:: + .. math:: D_{out} = (D_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] + \text{output\_padding}[0] - + .. math:: H_{out} = (H_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{kernel\_size}[1] + \text{output\_padding}[1] - + .. math:: W_{out} = (W_{in} - 1) \times \text{stride}[2] - 2 \times \text{padding}[2] + \text{kernel\_size}[2] + \text{output\_padding}[2] diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index e1468637ba4ff2..eae2e7fe2cdab3 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -633,12 +633,12 @@ def lt(g, input, other): def ge(g, input, other): other = _maybe_get_scalar(other) - return g.op("Not", lt(g, _if_scalar_type_as(g, other, input), input)) + return g.op("Not", lt(g, input, _if_scalar_type_as(g, other, input))) def le(g, input, other): other = _maybe_get_scalar(other) - return g.op("Not", gt(g, _if_scalar_type_as(g, other, input), input)) + return g.op("Not", gt(g, input, _if_scalar_type_as(g, other, input))) @parse_args('v', 'i') @@ -975,13 +975,24 @@ def zeros_like(g, input): ] -@parse_args('v', 'i', 'i', 'v') +@parse_args('v', 'i', 'v', 'v') def zeros(g, shape, scalar_type, layout, device): # NOTE: no way to set device in ONNX, so we ignore it return g.op("ConstantFill", shape, dtype_i=scalar_type_to_onnx[scalar_type], input_as_shape_i=1, value_f=0) +def full(g, shape, value, scalar_type, layout, device): + const_value = _maybe_get_const(value, 't') + if _is_value(const_value): + tmp = zeros(shape, scalar_type, layout, device) + return add(tmp, value, g.op("Constant", value_t=torch.tensor(1))) + else: + scalar_type = _get_const(scalar_type, 'i', 'dtype') + return g.op("ConstantFill", shape, dtype_i=scalar_type_to_onnx[scalar_type], + input_as_shape_i=1, value_f=const_value) + + def full_like(g, input, fill_value): # TODO: a more efficient implementation (ConstantFill?) 
return add(g, zeros_like(g, input), fill_value, g.op("Constant", value_t=torch.tensor(1))) diff --git a/torch/optim/adam.py b/torch/optim/adam.py index 308ec0c8cf9150..a26de99ec02b93 100644 --- a/torch/optim/adam.py +++ b/torch/optim/adam.py @@ -87,7 +87,7 @@ def step(self, closure=None): state['step'] += 1 if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) + grad.add_(group['weight_decay'], p.data) # Decay the first and second moment running average coefficient exp_avg.mul_(beta1).add_(1 - beta1, grad) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 4b1c4cbc32bc09..eff79df6f29b84 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -10,8 +10,6 @@ import tempfile import warnings -from future.utils import raise_from - import torch from .file_baton import FileBaton from ._cpp_extension_versioner import ExtensionVersioner @@ -858,7 +856,7 @@ def _build_extension_module(name, build_directory, verbose): message = "Error building extension '{}'".format(name) if hasattr(error, 'output') and error.output: message += ": {}".format(error.output.decode()) - raise_from(RuntimeError(message), None) + raise RuntimeError(message) def _import_module_from_library(module_name, path): diff --git a/torch/utils/ffi/__init__.py b/torch/utils/ffi/__init__.py index 086cd99839eb1f..e47a4f8a341705 100644 --- a/torch/utils/ffi/__init__.py +++ b/torch/utils/ffi/__init__.py @@ -1,213 +1 @@ -import os -import glob -import tempfile -import shutil -from functools import wraps, reduce -from string import Template -import torch -import torch.cuda -from torch._utils import _accumulate - -try: - import cffi -except ImportError: - raise ImportError("torch.utils.ffi requires the cffi package") - - -if cffi.__version_info__ < (1, 4, 0): - raise ImportError("torch.utils.ffi requires cffi version >= 1.4, but " - "got " + '.'.join(map(str, cffi.__version_info__))) - - -def _generate_typedefs(): - typedefs = [] - for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte']: - for lib in ['TH', 'THCuda']: - for kind in ['Tensor', 'Storage']: - python_name = t + kind - if t == 'Float' and lib == 'THCuda': - th_name = 'THCuda' + kind - else: - th_name = lib + t + kind - th_struct = 'struct ' + th_name - - typedefs += ['typedef {} {};'.format(th_struct, th_name)] - # We have to assemble a string here, because we're going to - # do this lookup based on tensor.type(), which returns a - # string (not a type object, as this code was before) - python_module = 'torch.cuda' if lib == 'THCuda' else 'torch' - python_class = python_module + '.' 
+ python_name - _cffi_to_torch[th_struct] = python_class - _torch_to_cffi[python_class] = th_struct - return '\n'.join(typedefs) + '\n' -_cffi_to_torch = {} -_torch_to_cffi = {} -_typedefs = _generate_typedefs() - - -PY_MODULE_TEMPLATE = Template(""" -from torch.utils.ffi import _wrap_function -from .$cffi_wrapper_name import lib as _lib, ffi as _ffi - -__all__ = [] -def _import_symbols(locals): - for symbol in dir(_lib): - fn = getattr(_lib, symbol) - if callable(fn): - locals[symbol] = _wrap_function(fn, _ffi) - else: - locals[symbol] = fn - __all__.append(symbol) - -_import_symbols(locals()) -""") - - -def _setup_wrapper(with_cuda): - here = os.path.abspath(os.path.dirname(__file__)) - lib_dir = os.path.join(here, '..', '..', 'lib') - include_dirs = [ - os.path.join(lib_dir, 'include'), - os.path.join(lib_dir, 'include', 'TH'), - ] - - wrapper_source = '#include \n' - if with_cuda: - import torch.cuda - wrapper_source += '#include \n' - if os.sys.platform == 'win32': - cuda_include_dirs = glob.glob(os.getenv('CUDA_PATH', '') + '/include') - cuda_include_dirs += glob.glob(os.getenv('NVTOOLSEXT_PATH', '') + '/include') - else: - cuda_include_dirs = glob.glob('/usr/local/cuda/include') - cuda_include_dirs += glob.glob('/Developer/NVIDIA/CUDA-*/include') - include_dirs.append(os.path.join(lib_dir, 'include', 'THC')) - include_dirs.extend(cuda_include_dirs) - return wrapper_source, include_dirs - - -def _create_module_dir(base_path, fullname): - module, _, name = fullname.rpartition('.') - if not module: - target_dir = name - else: - target_dir = reduce(os.path.join, fullname.split('.')) - target_dir = os.path.join(base_path, target_dir) - try: - os.makedirs(target_dir) - except os.error: - pass - for dirname in _accumulate(fullname.split('.'), os.path.join): - init_file = os.path.join(base_path, dirname, '__init__.py') - open(init_file, 'a').close() # Create file if it doesn't exist yet - return name, target_dir - - -def _build_extension(ffi, cffi_wrapper_name, target_dir, verbose): - try: - tmpdir = tempfile.mkdtemp() - ext_suf = '.pyd' if os.sys.platform == 'win32' else '.so' - libname = cffi_wrapper_name + ext_suf - outfile = ffi.compile(tmpdir=tmpdir, verbose=verbose, target=libname) - shutil.copy(outfile, os.path.join(target_dir, libname)) - finally: - shutil.rmtree(tmpdir) - - -def _make_python_wrapper(name, cffi_wrapper_name, target_dir): - py_source = PY_MODULE_TEMPLATE.substitute(name=name, - cffi_wrapper_name=cffi_wrapper_name) - with open(os.path.join(target_dir, '__init__.py'), 'w') as f: - f.write(py_source) - - -def create_extension(name, headers, sources, verbose=True, with_cuda=False, - package=False, relative_to='.', **kwargs): - """Creates and configures a cffi.FFI object, that builds PyTorch extension. - - Arguments: - name (str): package name. Can be a nested module e.g. ``.ext.my_lib``. - headers (str or List[str]): list of headers, that contain only exported - functions - sources (List[str]): list of sources to compile. - verbose (bool, optional): if set to ``False``, no output will be printed - (default: True). - with_cuda (bool, optional): set to ``True`` to compile with CUDA headers - (default: False) - package (bool, optional): set to ``True`` to build in package mode (for modules - meant to be installed as pip packages) (default: False). - relative_to (str, optional): path of the build file. Required when - ``package is True``. It's best to use ``__file__`` for this argument. - kwargs: additional arguments that are passed to ffi to declare the - extension. 
See `Extension API reference`_ for details. - - .. _`Extension API reference`: https://docs.python.org/3/distutils/apiref.html#distutils.core.Extension - """ - base_path = os.path.abspath(os.path.dirname(relative_to)) - name_suffix, target_dir = _create_module_dir(base_path, name) - if not package: - cffi_wrapper_name = '_' + name_suffix - else: - cffi_wrapper_name = (name.rpartition('.')[0] + - '.{0}._{0}'.format(name_suffix)) - - wrapper_source, include_dirs = _setup_wrapper(with_cuda) - include_dirs.extend(kwargs.pop('include_dirs', [])) - - if os.sys.platform == 'win32': - library_dirs = glob.glob(os.getenv('CUDA_PATH', '') + '/lib/x64') - library_dirs += glob.glob(os.getenv('NVTOOLSEXT_PATH', '') + '/lib/x64') - - here = os.path.abspath(os.path.dirname(__file__)) - lib_dir = os.path.join(here, '..', '..', 'lib') - - library_dirs.append(os.path.join(lib_dir)) - else: - library_dirs = [] - library_dirs.extend(kwargs.pop('library_dirs', [])) - - if isinstance(headers, str): - headers = [headers] - all_headers_source = '' - for header in headers: - with open(os.path.join(base_path, header), 'r') as f: - all_headers_source += f.read() + '\n\n' - - ffi = cffi.FFI() - sources = [os.path.join(base_path, src) for src in sources] - # NB: TH headers are C99 now - kwargs['extra_compile_args'] = ['-std=c99'] + kwargs.get('extra_compile_args', []) - ffi.set_source(cffi_wrapper_name, wrapper_source + all_headers_source, - sources=sources, - include_dirs=include_dirs, - library_dirs=library_dirs, **kwargs) - ffi.cdef(_typedefs + all_headers_source) - - _make_python_wrapper(name_suffix, '_' + name_suffix, target_dir) - - def build(): - _build_extension(ffi, cffi_wrapper_name, target_dir, verbose) - ffi.build = build - return ffi - - -def _wrap_function(function, ffi): - @wraps(function) - def safe_call(*args, **kwargs): - args = tuple(ffi.cast(_torch_to_cffi.get(arg.type(), 'void') + '*', arg._cdata) - if isinstance(arg, torch.Tensor) or torch.is_storage(arg) - else arg - for arg in args) - args = (function,) + args - result = torch._C._safe_call(*args, **kwargs) - if isinstance(result, ffi.CData): - typeof = ffi.typeof(result) - if typeof.kind == 'pointer': - cdata = int(ffi.cast('uintptr_t', result)) - cname = typeof.item.cname - if cname in _cffi_to_torch: - # TODO: Maybe there is a less janky way to eval - # off of this - return eval(_cffi_to_torch[cname])(cdata=cdata) - return result - return safe_call +raise ImportError("torch.utils.ffi is deprecated. Please use cpp extensions instead.")
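
The doRead/doWrite rewrite above loops until the full byte count has been transferred, retries on EINTR, refuses to spin-wait on EAGAIN, and caps each call at 1 GiB to work around the Mac OS X write() issue referenced in pytorch/pytorch#1031. The same pattern, sketched in Python against the plain os file API purely for illustration (write_all and read_exactly are made-up names, not part of the patch):

    import os

    _CHUNK = 1 << 30  # 1 GiB per call, mirroring the C++ helpers above

    def write_all(fd, data):
        # Write all of `data` to `fd`, retrying short writes and EINTR.
        view = memoryview(data)
        while len(view) > 0:
            try:
                written = os.write(fd, view[:_CHUNK])
            except InterruptedError:
                continue  # EINTR: retry (Python 3.5+ already retries internally)
            except BlockingIOError:
                raise RuntimeError("non-blocking fd got EAGAIN; refusing to spin-wait")
            view = view[written:]

    def read_exactly(fd, nbytes):
        # Read exactly `nbytes` from `fd`, or fail on unexpected EOF.
        chunks = []
        while nbytes > 0:
            try:
                chunk = os.read(fd, min(nbytes, _CHUNK))
            except InterruptedError:
                continue
            if not chunk:
                raise RuntimeError("unexpected EOF, the file might be corrupted")
            chunks.append(chunk)
            nbytes -= len(chunk)
        return b"".join(chunks)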
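
The multivariate_normal.py hunk above reworks how the three mutually exclusive parameterizations are broadcast and how _unbroadcasted_scale_tril is derived, while the constructor still takes exactly one of covariance_matrix, precision_matrix or scale_tril. A minimal usage sketch (shapes chosen arbitrarily):

    import torch
    from torch.distributions import MultivariateNormal

    loc = torch.zeros(2)
    cov = torch.eye(2)

    d = MultivariateNormal(loc, covariance_matrix=cov)
    x = d.sample()                    # shape: torch.Size([2])
    lp = d.log_prob(torch.zeros(2))   # 0-dim tensor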
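
The studentT.py change builds the internal Chi2 from the broadcast self.df and takes batch_shape from it, so a Python-number df now broadcasts against a tensor loc/scale instead of yielding an empty batch shape. A small illustration of the resulting shapes (inferred from broadcast_all semantics, not spelled out in the patch):

    import torch
    from torch.distributions import StudentT

    # df is a plain float, loc is a batch of 5 locations; broadcast_all
    # promotes all three parameters to shape (5,).
    d = StudentT(df=3.0, loc=torch.zeros(5), scale=1.0)
    print(d.batch_shape)       # torch.Size([5])
    print(d.sample().shape)    # torch.Size([5])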
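
The updated torch.norm docstring describes p, dim and keepdim as independent, optional knobs; the calls below reuse the a and b tensors from the docstring's own example and exercise the combinations it documents (return values omitted):

    import torch

    a = torch.arange(9, dtype=torch.float) - 4   # 1-D: [-4., ..., 4.]
    b = a.reshape((3, 3))                        # 2-D

    torch.norm(a)                            # vector 2-norm (Frobenius on 1-D input)
    torch.norm(b)                            # Frobenius norm of the whole matrix
    torch.norm(b, p=2, dim=0)                # 2-norm of each column -> shape (3,)
    torch.norm(b, p=1, dim=1, keepdim=True)  # L1 norm per row      -> shape (3, 1)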
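
torch.jit.save and the extended torch.jit.load accept either a file name (str, or pathlib.Path on Python 3) or a file-like object, as their new docstrings describe; a minimal round-trip combining both paths (MyScriptModule is a made-up example module):

    import io
    import torch

    class MyScriptModule(torch.jit.ScriptModule):
        @torch.jit.script_method
        def forward(self, x):
            return x + x

    m = MyScriptModule()

    # Round-trip through a file on disk.
    torch.jit.save(m, 'scriptmodule.pt')
    loaded = torch.jit.load('scriptmodule.pt')

    # Round-trip through an in-memory buffer: save needs write/flush,
    # load needs read/readline/tell/seek, which io.BytesIO provides.
    buffer = io.BytesIO()
    torch.jit.save(m, buffer)
    buffer.seek(0)
    loaded = torch.jit.load(buffer)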
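
Since torch/utils/ffi/__init__.py now only raises the ImportError above, code that used create_extension has to move to torch.utils.cpp_extension. A rough setup.py equivalent under that assumption (my_ext and my_ext.cpp are placeholder names):

    # setup.py: builds a C++ extension via torch.utils.cpp_extension.
    from setuptools import setup
    from torch.utils.cpp_extension import CppExtension, BuildExtension

    setup(
        name='my_ext',
        ext_modules=[CppExtension('my_ext', ['my_ext.cpp'])],
        cmdclass={'build_ext': BuildExtension},
    )

    # Or build and import just-in-time, without a setup.py:
    #   from torch.utils.cpp_extension import load
    #   my_ext = load(name='my_ext', sources=['my_ext.cpp'], verbose=True)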