diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 16d34342c544c8..bc81326675d4ba 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -15,7 +15,7 @@ if [ ! -d "${PYTORCH_ENV_DIR}/miniconda3" ]; then fi export PATH="${PYTORCH_ENV_DIR}/miniconda3/bin:$PATH" source ${PYTORCH_ENV_DIR}/miniconda3/bin/activate -conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja +conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja future six if [ -z "${IN_CIRCLECI}" ]; then rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* fi diff --git a/.jenkins/pytorch/perf_test/compare_with_baseline.py b/.jenkins/pytorch/perf_test/compare_with_baseline.py index 0fbeda6339d1ef..5518e719513981 100644 --- a/.jenkins/pytorch/perf_test/compare_with_baseline.py +++ b/.jenkins/pytorch/perf_test/compare_with_baseline.py @@ -1,5 +1,6 @@ import sys import json +import math import numpy import argparse @@ -35,14 +36,25 @@ print("population mean: ", mean) print("population sigma: ", sigma) +# Let the test pass if baseline number is NaN (which happened in +# the past when we didn't have logic for catching NaN numbers) +if math.isnan(mean) or math.isnan(sigma): + mean = sys.maxsize + sigma = 0.001 + sample_stats_data = json.loads(args.sample_stats) -sample_mean = sample_stats_data['mean'] -sample_sigma = sample_stats_data['sigma'] +sample_mean = float(sample_stats_data['mean']) +sample_sigma = float(sample_stats_data['sigma']) print("sample mean: ", sample_mean) print("sample sigma: ", sample_sigma) +if math.isnan(sample_mean): + raise Exception('''Error: sample mean is NaN''') +elif math.isnan(sample_sigma): + raise Exception('''Error: sample sigma is NaN''') + z_value = (sample_mean - mean) / sigma print("z-value: ", z_value) diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh b/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh index 61d7585496b775..61a3bed0cbd5c9 100644 --- a/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh +++ b/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh @@ -20,6 +20,9 @@ test_gpu_speed_mnist () { SAMPLE_ARRAY=() NUM_RUNS=$1 + # Needs warm up to get accurate number + python main.py --epochs 1 --no-log + for (( i=1; i<=$NUM_RUNS; i++ )) do runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log) echo $runtime diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh index c0e04c849cc640..c3bee6adde22fa 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -45,7 +45,7 @@ curl https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe -O call C:\\Jenkins\\Miniconda3\\Scripts\\activate.bat C:\\Jenkins\\Miniconda3 call conda install -y -q numpy mkl cffi pyyaml boto3 -pip install ninja +pip install ninja future call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat" x86_amd64 diff --git a/README.md b/README.md index b23bc60aa19de6..39d02a3f7b0211 100644 --- a/README.md +++ b/README.md @@ -239,7 +239,7 @@ You can then build the documentation by running ``make `` from the ### Previous Versions Installation instructions and binaries for previous PyTorch versions may be found -on [our website](http://pytorch.org/previous-versions/). +on [our website](http://pytorch.org/previous-versions). 
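[Editor's note: for readers skimming the perf-test change above, the guarded comparison that compare_with_baseline.py now performs boils down to the check below. This is a minimal standalone re-rendering in C++ for illustration only; the helper name and signature are ours, not part of the patch.]

#include <cmath>
#include <limits>
#include <stdexcept>

// Hypothetical mirror of the Python logic: a NaN baseline (recorded before
// NaN-catching existed) must not fail the test, so substitute a huge mean and
// a tiny sigma; NaN sample statistics indicate a broken run and must error.
inline double guarded_z_value(double mean, double sigma,
                              double sample_mean, double sample_sigma) {
  if (std::isnan(mean) || std::isnan(sigma)) {
    mean = std::numeric_limits<double>::max();
    sigma = 0.001;
  }
  if (std::isnan(sample_mean) || std::isnan(sample_sigma)) {
    throw std::runtime_error("Error: sample mean/sigma is NaN");
  }
  return (sample_mean - mean) / sigma;
}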
## Getting Started
diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h
index 530b470f6c8be3..655b55754eecb6 100644
--- a/aten/src/ATen/CPUApplyUtils.h
+++ b/aten/src/ATen/CPUApplyUtils.h
@@ -207,7 +207,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
   for (size_t i = 0; i < tensors.size() - 1; i++) {
     oss << tensors[i].sizes() << ", ";
   }
-  oss << "and " << tensors[tensors.size() - 1]
+  oss << "and " << tensors[tensors.size() - 1].sizes()
      << " to have the same number of elements, but got ";
   for (size_t i = 0; i < tensors.size() - 1; i++) {
     oss << tensors[i].numel() << ", ";
@@ -220,7 +220,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
 inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
   checkBackend("CPU_tensor_apply", tensors, Backend::CPU);
   if (!_all_equal_numel(tensors))
-    throw std::runtime_error(_all_equal_numel_error(tensors));
+    AT_ERROR(_all_equal_numel_error(tensors));
   // An empty tensor has no elements
   for (auto& t : tensors)
     if (t.numel() == 0)
diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap
index d45815c5b600c9..2ae1d649c7cce3 100644
--- a/aten/src/ATen/Declarations.cwrap
+++ b/aten/src/ATen/Declarations.cwrap
@@ -3218,38 +3218,6 @@
       kwarg_only: True
     - double p
 ]]
-[[
-  name: _bernoulli_
-  backends:
-    - CPU
-    - CUDA
-  cname: bernoulli
-  return: self
-  variants: function
-  arguments:
-    - THTensor* self
-    - arg: THGenerator* generator
-      default: nullptr
-      kwarg_only: True
-    - double p
-]]
-[[
-  name: _th_bernoulli
-  types:
-    - Float
-    - Double
-  return: argument 0
-  variants: function
-  cname: bernoulli_Tensor
-  arguments:
-    - arg: THTensor* output
-      output: True
-      resize: self
-    - arg: THGenerator* generator
-      default: nullptr
-      kwarg_only: True
-    - THTensor* self
-]]
 [[
   name: _dirichlet_grad
   types:
diff --git a/aten/src/ATen/core/DeviceType.h b/aten/src/ATen/core/DeviceType.h
index 5614d247af7ae5..870b1e5bf9e538 100644
--- a/aten/src/ATen/core/DeviceType.h
+++ b/aten/src/ATen/core/DeviceType.h
@@ -8,6 +8,7 @@
 #include
 #include
+#include <functional>
 
 namespace at {
 
@@ -32,3 +33,11 @@ AT_CORE_API std::string DeviceTypeName(
 AT_CORE_API std::ostream& operator<<(std::ostream& stream, at::DeviceType type);
 
 } // namespace at
+
+namespace std {
+template <> struct hash<at::DeviceType> {
+  std::size_t operator()(const at::DeviceType &k) const {
+    return std::hash<int>()(static_cast<int>(k));
+  }
+};
+} // namespace std
diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h
index 1b2c0f0e288264..a9c3040b4ae01b 100644
--- a/aten/src/ATen/core/Tensor.h
+++ b/aten/src/ATen/core/Tensor.h
@@ -441,12 +441,10 @@ struct AT_API Tensor {
   Tensor & atan_();
   Tensor baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const;
   Tensor & baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
-  Tensor bernoulli(const Tensor & p, Generator * generator=nullptr) const;
-  Tensor bernoulli(double p, Generator * generator=nullptr) const;
-  Tensor bernoulli() const;
+  Tensor bernoulli(Generator * generator=nullptr) const;
   Tensor & bernoulli_(const Tensor & p, Generator * generator=nullptr);
-  Tensor & bernoulli_(double p, Generator * generator=nullptr);
-  Tensor & bernoulli_();
+  Tensor & bernoulli_(double p=0.5, Generator * generator=nullptr);
+  Tensor bernoulli(double p, Generator * generator=nullptr) const;
   Tensor bincount(const Tensor & weights={}, int64_t minlength=0) const;
   Tensor bmm(const Tensor & mat2) const;
   Tensor ceil() const;
diff --git 
a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index ff85267e78fb81..62017be6140b96 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -605,14 +605,8 @@ inline Tensor Tensor::baddbmm(const Tensor & batch1, const Tensor & batch2, Scal inline Tensor & Tensor::baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { return type().baddbmm_(*this, batch1, batch2, beta, alpha); } -inline Tensor Tensor::bernoulli(const Tensor & p, Generator * generator) const { - return type().bernoulli(*this, p, generator); -} -inline Tensor Tensor::bernoulli(double p, Generator * generator) const { - return type().bernoulli(*this, p, generator); -} -inline Tensor Tensor::bernoulli() const { - return type().bernoulli(*this); +inline Tensor Tensor::bernoulli(Generator * generator) const { + return type().bernoulli(*this, generator); } inline Tensor & Tensor::bernoulli_(const Tensor & p, Generator * generator) { return type().bernoulli_(*this, p, generator); @@ -620,8 +614,8 @@ inline Tensor & Tensor::bernoulli_(const Tensor & p, Generator * generator) { inline Tensor & Tensor::bernoulli_(double p, Generator * generator) { return type().bernoulli_(*this, p, generator); } -inline Tensor & Tensor::bernoulli_() { - return type().bernoulli_(*this); +inline Tensor Tensor::bernoulli(double p, Generator * generator) const { + return type().bernoulli(*this, p, generator); } inline Tensor Tensor::bincount(const Tensor & weights, int64_t minlength) const { return type().bincount(*this, weights, minlength); diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 1366f899c30b84..501508d16e972e 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -381,8 +381,6 @@ struct AT_API Type { virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual bool allclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0; virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim) const = 0; - AT_DEPRECATED(virtual Tensor arange(Scalar start, Scalar end, Scalar step) const = 0); - AT_DEPRECATED(virtual Tensor arange(Scalar end) const = 0); virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor argmax(const Tensor & self) const = 0; virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim) const = 0; @@ -397,12 +395,10 @@ struct AT_API Type { virtual Tensor & atan_(Tensor & self) const = 0; virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; - virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator) const = 0; - virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator) const = 0; - virtual Tensor bernoulli(const Tensor & self) const = 0; + virtual Tensor bernoulli(const Tensor & self, Generator * generator) const = 0; virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator) const = 0; virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator) const = 0; - virtual Tensor & bernoulli_(Tensor & self) const = 0; + virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator) const = 0; virtual Tensor bincount(const Tensor & self, const Tensor & weights, int64_t minlength) const = 
0; virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const = 0; virtual Tensor ceil(const Tensor & self) const = 0; @@ -430,7 +426,6 @@ struct AT_API Type { virtual Tensor div(const Tensor & self, Scalar other) const = 0; virtual Tensor & div_(Tensor & self, Scalar other) const = 0; virtual Tensor dot(const Tensor & self, const Tensor & tensor) const = 0; - AT_DEPRECATED(virtual Tensor empty(IntList size) const = 0); virtual Tensor erf(const Tensor & self) const = 0; virtual Tensor & erf_(Tensor & self) const = 0; virtual Tensor erfc(const Tensor & self) const = 0; @@ -441,13 +436,11 @@ struct AT_API Type { virtual Tensor & expm1_(Tensor & self) const = 0; virtual Tensor expand(const Tensor & self, IntList size, bool implicit) const = 0; virtual Tensor expand_as(const Tensor & self, const Tensor & other) const = 0; - AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m) const = 0); virtual Tensor flatten(const Tensor & self, int64_t start_dim, int64_t end_dim) const = 0; virtual Tensor & fill_(Tensor & self, Scalar value) const = 0; virtual Tensor & fill_(Tensor & self, const Tensor & value) const = 0; virtual Tensor floor(const Tensor & self) const = 0; virtual Tensor & floor_(Tensor & self) const = 0; - AT_DEPRECATED(virtual Tensor full(IntList size, Scalar fill_value) const = 0); virtual Tensor ger(const Tensor & self, const Tensor & vec2) const = 0; virtual std::tuple gesv(const Tensor & self, const Tensor & A) const = 0; virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0; @@ -469,7 +462,6 @@ struct AT_API Type { virtual bool is_signed(const Tensor & self) const = 0; virtual bool is_sparse(const Tensor & self) const = 0; virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim, bool keepdim) const = 0; - AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, int64_t steps) const = 0); virtual Tensor log(const Tensor & self) const = 0; virtual Tensor & log_(Tensor & self) const = 0; virtual Tensor log10(const Tensor & self) const = 0; @@ -479,7 +471,6 @@ struct AT_API Type { virtual Tensor log2(const Tensor & self) const = 0; virtual Tensor & log2_(Tensor & self) const = 0; virtual Tensor logdet(const Tensor & self) const = 0; - AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps) const = 0); virtual Tensor log_softmax(const Tensor & self, int64_t dim) const = 0; virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor matmul(const Tensor & self, const Tensor & other) const = 0; @@ -504,16 +495,9 @@ struct AT_API Type { virtual Tensor mvlgamma(const Tensor & self, int64_t p) const = 0; virtual Tensor & mvlgamma_(Tensor & self, int64_t p) const = 0; virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; - AT_DEPRECATED(virtual Tensor ones(IntList size) const = 0); virtual Tensor permute(const Tensor & self, IntList dims) const = 0; virtual Tensor pin_memory(const Tensor & self) const = 0; virtual Tensor pinverse(const Tensor & self, double rcond) const = 0; - AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator) const = 0); - AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator) const = 0); - AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator) const = 0); - AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator) const = 0); - AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * 
generator) const = 0);
-  AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step) const = 0);
   virtual Tensor repeat(const Tensor & self, IntList repeats) const = 0;
   virtual Tensor reshape(const Tensor & self, IntList shape) const = 0;
   virtual Tensor reshape_as(const Tensor & self, const Tensor & other) const = 0;
@@ -581,7 +565,6 @@ struct AT_API Type {
   virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0;
   virtual Tensor view_as(const Tensor & self, const Tensor & other) const = 0;
   virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const = 0;
-  AT_DEPRECATED(virtual Tensor zeros(IntList size) const = 0);
   virtual Tensor norm(const Tensor & self, Scalar p) const = 0;
   virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim) const = 0;
   virtual Tensor clone(const Tensor & self) const = 0;
diff --git a/aten/src/ATen/core/context_base.cpp b/aten/src/ATen/core/context_base.cpp
new file mode 100644
index 00000000000000..e34c6880c0210a
--- /dev/null
+++ b/aten/src/ATen/core/context_base.cpp
@@ -0,0 +1,22 @@
+#include <ATen/core/context_base.h>
+
+namespace caffe2 {
+
+// TODO: rename context.h -> context_cpu.h & context_base.h -> context.h
+StaticContextMap& GetStaticContexts() {
+  static StaticContextMap static_contexts;
+  return static_contexts;
+}
+
+void set_static_context(at::DeviceType t, BaseStaticContext* ptr) {
+  auto& static_contexts = GetStaticContexts();
+  static_contexts[t] = ptr;
+}
+
+BaseStaticContext* get_static_context(at::DeviceType t) {
+  auto* ptr = GetStaticContexts()[t];
+  AT_ASSERTM(ptr, "StaticContext for ", t, " is not registered yet.");
+  return ptr;
+}
+
+} // namespace caffe2
diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h
index 7cf1b7cc174980..0a653ba0a12379 100644
--- a/aten/src/ATen/core/context_base.h
+++ b/aten/src/ATen/core/context_base.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <unordered_map>
 
 namespace caffe2 {
 class Event;
@@ -184,3 +185,27 @@ class AT_CORE_API BaseContext {
 };
 
 } // namespace at
+
+namespace caffe2 {
+
+using at::BaseContext;
+using at::BaseStaticContext;
+
+using StaticContextMap = std::unordered_map<at::DeviceType, BaseStaticContext*>;
+AT_API StaticContextMap& GetStaticContexts();
+AT_API void set_static_context(at::DeviceType t, BaseStaticContext* ptr);
+AT_API BaseStaticContext* get_static_context(at::DeviceType t);
+
+template <at::DeviceType t>
+struct StaticContextFunctionRegisterer {
+  explicit StaticContextFunctionRegisterer(BaseStaticContext* ptr) {
+    set_static_context(t, ptr);
+  }
+};
+
+#define REGISTER_STATIC_CONTEXT(t, f)                                 \
+  namespace {                                                         \
+  static StaticContextFunctionRegisterer<t> g_static_context_##t(f); \
+  }
+
+} // namespace caffe2
diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h
index f1eba7e2d3c428..a6289deeb816f7 100644
--- a/aten/src/ATen/cpu/vec256/vec256_base.h
+++ b/aten/src/ATen/cpu/vec256/vec256_base.h
@@ -438,4 +438,14 @@ interleave2(const Vec256<T>& a, const Vec256<T>& b) {
           Vec256<T>::loadu(static_cast<void*>(buffer2)));
 }
 
+template <typename src_T, typename dst_T>
+void convert(const src_T *src, dst_T *dst, int64_t n) {
+#pragma unroll
+  for (int64_t i = 0; i < n; i++) {
+    *dst = static_cast<dst_T>(*src);
+    src++;
+    dst++;
+  }
+}
+
 }}}
diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h
index 2ca4d614c21e7b..5bca4ff57b810b 100644
--- a/aten/src/ATen/cpu/vec256/vec256_int.h
+++ b/aten/src/ATen/cpu/vec256/vec256_int.h
@@ -208,6 +208,38 @@ struct Vec256<int32_t> : public Vec256i {
   }
 };
 
+template <>
+void convert(const int32_t *src, 
float *dst, int64_t n) {
+  int64_t i;
+  // int32_t and float have same size
+#pragma unroll
+  for (i = 0; i <= (n - Vec256<float>::size); i += Vec256<float>::size) {
+    auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
+    auto output_vec = _mm256_cvtepi32_ps(input_vec);
+    _mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec);
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = static_cast<float>(src[i]);
+  }
+}
+
+template <>
+void convert(const int32_t *src, double *dst, int64_t n) {
+  int64_t i;
+  // int32_t has half the size of double
+#pragma unroll
+  for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
+    auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
+    auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
+    _mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec);
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = static_cast<double>(src[i]);
+  }
+}
+
 template <>
 struct Vec256<int16_t> : public Vec256i {
   static constexpr int size = 16;
diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh
index 88981fcc4d16fe..b6cfb1843fdef9 100644
--- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh
+++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh
@@ -5,6 +5,8 @@
 #include "THC/THCAtomics.cuh"
 #include "ATen/cuda/CUDAContext.h"
 
+#include
+
 //
 // This file contains pointwise operation functions and kernels that
 // work on both contiguous and non-contiguous tensor arguments of
@@ -12,12 +14,85 @@
 // copying or temporary storage.
 //
 
+/*
+  NOTE [ CUDA_tensor_applyN helpers ]
+
+  The following CUDA_tensor_applyN (where N currently can be 1, 2, 3, or 4)
+  functions apply a pointwise operator to N tensor(s).
+
+  The calling convention is
+
+  1. The template arguments should be, sequentially,
+    - First N typename args specify the scalar types of each of the N tensors.
+    - (Optional) `int step` arg specifies the number of elements processed
+      together at the same time.
+      Default is 1.
+    - A usually omitted (i.e., inferred) typename arg specifies the type of the
+      function/functor applied on `N * step` values in each iteration of each
+      CUDA thread.
+  2. The arguments should be, sequentially,
+    - N tensors
+    - op: a function/functor that processes `N * step` values at the same time.
+      - If `step == 1`, it must have signature
+        `void(*)(scalar1_t&, scalar2_t&, ..., scalarN_t&)`, where
+        `scalar*_t`s are the first N typename template args, and the inputs
+        are the `N` values from the `N` tensors retrieved at a common index.
+      - Otherwise, it must have signature
+          void(*)(int n, scalar1_t&, scalar1_t&, ..., scalar1_t&,  // repeat `step` times
+                         scalar2_t&, scalar2_t&, ..., scalar2_t&,  // repeat `step` times
+                         ...,
+                         scalarN_t&, scalarN_t&, ..., scalarN_t&)  // repeat `step` times
+        Different from the `step == 1` case, it processes `N * step` values taken
+        from `step` common indices. Moreover, the first input `n` represents the
+        number of valid indices (it will always have `0 < n <= step`). It will
+        almost always be `step`, but at the boundary we may not have full `step`
+        elements and `n` can be a lesser value.
+
+        E.g., if `step == 4` and `N == 2`, `op` could be
+
+          [](int n, scalar1_t &u1, scalar1_t &u2, scalar1_t &u3, scalar1_t &u4,
+                    scalar2_t &v1, scalar2_t &v2, scalar2_t &v3, scalar2_t &v4) {
+            // Only process u1, ..., un and v1, ..., vn.
+            // So if `n == 3`, `u4` and `v4` need not be considered.
+          }
+
+      In both cases, the references can actually be const, but at least one of
+      them should be non-const in order to write the output. 
+    - (Optional, but recommended) N TensorArgType args that specify for each
+      tensor whether `op` reads AND writes (i.e., TensorArgType::ReadWrite),
+      or only reads (i.e., TensorArgType::ReadOnly).
+      Default is TensorArgType::ReadWrite for first Tensor, and
+      TensorArgType::ReadOnly for the rest.
+
+  E.g.,
+
+  to compute a = b^2 for a and b of same dtype, we can call
+
+  CUDA_tensor_apply2<scalar, scalar>(
+    a, b,
+    [] __device__ (scalar &a_val, const scalar &b_val) { a_val = b_val * b_val; }
+  );
+
+  to work on 2 values at the same time, we can call
+
+  CUDA_tensor_apply2<scalar1, scalar2, 2>(
+    a, b,
+    [] __device__ (int n, scalar1 &a_val1, scalar1 &a_val2,
+                          const scalar2 &b_val1, const scalar2 &b_val2) {
+      // call special vectorized op here, or just do elementwise and enjoy unrolling...
+      // if n == 1, only process a_val1 and b_val1
+    }
+  );
+*/
+
 namespace at {
 namespace cuda {
 
 // TODO: combine with TensorArg? So far that's been for debugging, and this is functional...
 enum class TensorArgType { ReadWrite, ReadOnly };
 
+namespace {
+
 // Rearrange dimensions for pointwise operations so that strides are in
 // decreasing order as much as possible, so that kernels have better memory
 // access patterns.
@@ -47,10 +122,10 @@ enum class TensorArgType { ReadWrite, ReadOnly };
 // (exchanging them will not make any input worse).
 template <typename T1, typename IndexType,
           typename T2 = void, typename T3 = void, typename T4 = void>
-void rearrangeDims(detail::TensorInfo<T1, IndexType>* aInfo,
-                   detail::TensorInfo<T2, IndexType>* bInfo = nullptr,
-                   detail::TensorInfo<T3, IndexType>* cInfo = nullptr,
-                   detail::TensorInfo<T4, IndexType>* dInfo = nullptr) {
+inline void rearrangeDims(detail::TensorInfo<T1, IndexType>* aInfo,
+                          detail::TensorInfo<T2, IndexType>* bInfo = nullptr,
+                          detail::TensorInfo<T3, IndexType>* cInfo = nullptr,
+                          detail::TensorInfo<T4, IndexType>* dInfo = nullptr) {
   int numInfos = 1;
   int dims = aInfo->dims;
   IndexType *sizes[4] = { aInfo->sizes, };
@@ -126,11 +201,160 @@ void rearrangeDims(detail::TensorInfo<T1, IndexType>* aInfo,
 #define AT_APPLY_THREADS_PER_BLOCK 32 * 16
 #define AT_APPLY_BLOCKS_PER_SM 4
 
+// The `remaining_steps` argument is used to support Op that operates on
+// multiple elements at the same time. Generally, the strategy of ApplyOpN is to
+//  1. Initialize `remaining_steps = step`, where `step` is the template arg of
+//     CUDA_tensor_applyN helpers. The input arg `n` to `apply()` represents the
+//     number of elements in bound for this call. It will almost always equal
+//     `step` except at boundaries.
+//  2. If `remaining_steps > 0` convert the current linearIndex to offset (if in
+//     bound), and recursively call `ApplyOpN` with `remaining_steps - 1`.
+//  3. At `remaining_steps = 0`,
+//       if `step = 1`, call `op(tensor1_val, tensor2_val, ...)`;
+//       if `step > 1`, call `op(n, tensor1_val1, tensor1_val2, ..., tensor1_valstep,
+//                                  tensor2_val1, tensor2_val2, ..., tensor2_valstep,
+//                                  ...
+//                                  tensorN_val1, tensorN_val2, ..., tensorN_valstep);`
+//
+// See NOTE [ CUDA_tensor_applyN helpers ] above for how Op may look like.
+
+template <typename Op,
+          typename scalar,
+          typename IndexType,
+          int ADims,
+          int remaining_steps,
+          typename... Offsets>
+struct ApplyOp1 {
+__device__ __forceinline__
+static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op, int n,
+                  IndexType linearIndex, Offsets... aOffsets) {
+  // Convert `linearIndex` into an offset of `a`
+  const IndexType aOffset = sizeof...(Offsets) < n ?
+    detail::IndexToOffset<scalar, IndexType, ADims>::get(linearIndex, a) : 0;
+
+  ApplyOp1<Op, scalar, IndexType, ADims, remaining_steps - 1, const IndexType, Offsets...>::apply(
+    a, op, n, linearIndex + 1, aOffsets..., aOffset
+  );
+}
+};
+
+// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`).
+// We don't need to pass in how many elements need to be processed in this case.
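+// Illustration (editor's sketch, not part of the patch): with `step == 2` the
+// primary template above unrolls into two offset computations. ApplyOp1<..., 2>
+// computes the offset for `linearIndex` and recurses into ApplyOp1<..., 1>,
+// which computes the offset for `linearIndex + 1`; the `remaining_steps == 0`
+// specialization then bottoms out as op(n, a.data[offset0], a.data[offset1]).
+// Positions past the last valid index (sizeof...(Offsets) >= n) receive offset
+// 0, and `op` is expected to touch only its first `n` values.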
+template +struct ApplyOp1 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, int n, + const Op &op, IndexType linearIndex, Offset offset) { + op(a.data[offset]); +} +}; + +template +struct ApplyOp1 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, const Op &op, int n, + IndexType linearIndex, Offsets... offsets) { + op(n, a.data[offsets]...); +} +}; + +template +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void kernelPointwiseApply1(detail::TensorInfo a, + IndexType totalElements, const Op op) { + for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x * step) { + ApplyOp1::apply( + a, op, ::min(step, static_cast(totalElements - linearIndex)), linearIndex); + } +} + + +template +struct ApplyOp2 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + const Op &op, int n, IndexType linearIndex, + Offsets... aOffsets, Offsets... bOffsets) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = sizeof...(Offsets) < n ? + detail::IndexToOffset::get(linearIndex, a) : 0; + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = sizeof...(Offsets) < n ? + detail::IndexToOffset::get(linearIndex, b) : 0; + + ApplyOp2::apply( + a, b, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset + ); +} +}; + +// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`). +// We don't need to pass in how many elements need to processed in this case. template + int ADims, + int BDims, + typename Offset> +struct ApplyOp2 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + const Op &op, int n, IndexType linearIndex, + Offset aOffset, Offset bOffset) { + op(a.data[aOffset], b.data[bOffset]); +} +}; + +template +struct ApplyOp2 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + const Op &op, int n, IndexType linearIndex, + Offsets... aOffsets, Offsets... bOffsets) { + op(n, a.data[aOffsets]..., b.data[bOffsets]...); +} +}; + +template #if __CUDA_ARCH__ >= 350 __launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) #endif @@ -138,29 +362,108 @@ __global__ void kernelPointwiseApply2(detail::TensorInfo a, detail::TensorInfo b, IndexType totalElements, - Op op) { - for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + const Op op) { + for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step; linearIndex < totalElements; - linearIndex += gridDim.x * blockDim.x) { - // Convert `linearIndex` into an offset of `a` - const IndexType aOffset = - detail::IndexToOffset::get(linearIndex, a); + linearIndex += gridDim.x * blockDim.x * step) { + ApplyOp2::apply( + a, b, op, ::min(step, static_cast(totalElements - linearIndex)), + linearIndex); + } +} - // Convert `linearIndex` into an offset of `b` - const IndexType bOffset = - detail::IndexToOffset::get(linearIndex, b); - op(a.data[aOffset], b.data[bOffset]); - } +template +struct ApplyOp3 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + detail::TensorInfo &c, + const Op &op, int n, IndexType linearIndex, + Offsets... aOffsets, Offsets... bOffsets, + Offsets... cOffsets) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = sizeof...(Offsets) < n ? 
+ detail::IndexToOffset::get(linearIndex, a) : 0; + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = sizeof...(Offsets) < n ? + detail::IndexToOffset::get(linearIndex, b) : 0; + + // Convert `linearIndex` into an offset of `c` + const IndexType cOffset = sizeof...(Offsets) < n ? + detail::IndexToOffset::get(linearIndex, c) : 0; + + ApplyOp3::apply( + a, b, c, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset, + cOffsets..., cOffset + ); } +}; +// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`). +// We don't need to pass in how many elements need to processed in this case. +template +struct ApplyOp3 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + detail::TensorInfo &c, + const Op &op, int n, IndexType linearIndex, + Offset aOffset, Offset bOffset, Offset cOffset) { + op(a.data[aOffset], b.data[bOffset], c.data[cOffset]); +} +}; template + int ADims, + int BDims, + int CDims, + typename... Offsets> +struct ApplyOp3 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + detail::TensorInfo &c, + const Op &op, int n, IndexType linearIndex, + Offsets... aOffsets, Offsets... bOffsets, + Offsets... cOffsets) { + op(n, a.data[aOffsets]..., b.data[bOffsets]..., c.data[cOffsets]...); +} +}; + + +template #if __CUDA_ARCH__ >= 350 __launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) #endif @@ -169,25 +472,112 @@ kernelPointwiseApply3(detail::TensorInfo a, detail::TensorInfo b, detail::TensorInfo c, IndexType totalElements, - Op op) { - for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + const Op op) { + for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step; linearIndex < totalElements; - linearIndex += gridDim.x * blockDim.x) { - // Convert `linearIndex` into an offset of `a` - const IndexType aOffset = - detail::IndexToOffset::get(linearIndex, a); + linearIndex += gridDim.x * blockDim.x * step) { + ApplyOp3::apply( + a, b, c, op, ::min(step, static_cast(totalElements - linearIndex)), linearIndex); + } +} - // Convert `linearIndex` into an offset of `b` - const IndexType bOffset = - detail::IndexToOffset::get(linearIndex, b); - // Convert `linearIndex` into an offset of `c` - const IndexType cOffset = - detail::IndexToOffset::get(linearIndex, c); +template +struct ApplyOp4 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + detail::TensorInfo &c, + detail::TensorInfo &d, + const Op &op, int n, IndexType linearIndex, + Offsets... aOffsets, Offsets... bOffsets, + Offsets... cOffsets, Offsets... dOffsets) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = sizeof...(Offsets) < n ? + detail::IndexToOffset::get(linearIndex, a) : 0; + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = sizeof...(Offsets) < n ? + detail::IndexToOffset::get(linearIndex, b) : 0; + + // Convert `linearIndex` into an offset of `c` + const IndexType cOffset = sizeof...(Offsets) < n ? + detail::IndexToOffset::get(linearIndex, c) : 0; + + // Convert `linearIndex` into an offset of `d` + const IndexType dOffset = sizeof...(Offsets) < n ? 
+ detail::IndexToOffset::get(linearIndex, d) : 0; + + ApplyOp4::apply( + a, b, c, d, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset, + cOffsets..., cOffset, dOffsets..., dOffset + ); +} +}; - op(a.data[aOffset], b.data[bOffset], c.data[cOffset]); - } +// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`). +// We don't need to pass in how many elements need to processed in this case. +template +struct ApplyOp4 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + detail::TensorInfo &c, + detail::TensorInfo &d, + const Op &op, int n, IndexType linearIndex, + Offset aOffset, Offset bOffset, + Offset cOffset, Offset dOffset) { + op(a.data[aOffset], b.data[bOffset], c.data[cOffset], d.data[dOffset]); +} +}; + +template +struct ApplyOp4 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + detail::TensorInfo &c, + detail::TensorInfo &d, + const Op &op, int n, IndexType linearIndex, + Offsets... aOffsets, Offsets... bOffsets, + Offsets... cOffsets, Offsets... dOffsets) { + op(n, a.data[aOffsets]..., b.data[bOffsets]..., c.data[cOffsets]..., d.data[dOffsets]...); } +}; template + int ADims, int BDims, int CDims, int DDims, + int step> #if __CUDA_ARCH__ >= 350 __launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) #endif @@ -205,30 +596,18 @@ kernelPointwiseApply4(detail::TensorInfo a, detail::TensorInfo c, detail::TensorInfo d, IndexType totalElements, - Op op) { - for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + const Op op) { + for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step; linearIndex < totalElements; - linearIndex += gridDim.x * blockDim.x) { - // Convert `linearIndex` into an offset of `a` - const IndexType aOffset = - detail::IndexToOffset::get(linearIndex, a); - - // Convert `linearIndex` into an offset of `b` - const IndexType bOffset = - detail::IndexToOffset::get(linearIndex, b); - - // Convert `linearIndex` into an offset of `c` - const IndexType cOffset = - detail::IndexToOffset::get(linearIndex, c); - - // Convert `linearIndex` into an offset of `d` - const IndexType dOffset = - detail::IndexToOffset::get(linearIndex, d); - - op(a.data[aOffset], b.data[bOffset], c.data[cOffset], d.data[dOffset]); + linearIndex += gridDim.x * blockDim.x * step) { + ApplyOp4::apply( + a, b, c, d, op, ::min(step, static_cast(totalElements - linearIndex)), linearIndex); } } +} // namespace + /** Computes ceil(a / b) */ @@ -237,9 +616,11 @@ __host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { return (a + b - 1) / b; } +template inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice) { if (curDevice == -1) return false; - uint64_t numBlocks = ATenCeilDiv(totalElements, static_cast(AT_APPLY_THREADS_PER_BLOCK)); + uint64_t numel_per_thread = static_cast(AT_APPLY_THREADS_PER_BLOCK) * static_cast(step); + uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread); uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0]; if (numBlocks > maxGridX) numBlocks = maxGridX; @@ -251,20 +632,155 @@ inline dim3 getApplyBlock() { return dim3(AT_APPLY_THREADS_PER_BLOCK); } -/* - Apply a pointwise operator to two tensors. - The calling convention for op is a function/functor that takes takes two references to - type scalar; at least one of these references should be non-const in order to write the output. 
- For example, to compute a = b^2, op would be of the form: - [] __device__ (scalar &a_val, const scalar &b_val) { a_val = b_val * b_val; }; -*/ -template -bool CUDA_tensor_apply2(at::Tensor a, - at::Tensor b, - Op op, - TensorArgType aType = TensorArgType::ReadWrite, - TensorArgType bType = TensorArgType::ReadOnly) { +template +inline bool CUDA_tensor_apply1(at::Tensor a, + Op op, + TensorArgType aType = TensorArgType::ReadWrite) { + checkBackend("CUDA_tensor_apply1", {a}, Backend::CUDA); + auto dim = a.dim(); + + /* + Since this is a unary op, we can easily first check for expanded dimensions + (with stride 0), and remove them, to avoid calling .contiguous() in such + case when detail::maybeOverlappingIndices(a) returns true. + */ + std::vector collapsed_shape; + std::vector collapsed_strides; + collapsed_shape.reserve(dim); + collapsed_strides.reserve(dim); + for (int64_t i = 0; i < dim; i++) { + if (a.stride(i) != 0) { + collapsed_shape.push_back(a.size(i)); + collapsed_strides.push_back(a.stride(i)); + } + } + if (collapsed_shape.size() != dim) { + a = a.as_strided(collapsed_shape, collapsed_strides); + } + + int64_t totalElements = a.numel(); + + if (dim > MAX_TENSORINFO_DIMS) { + return false; + } + + if (totalElements == 0) { + // Empty tensor; do nothing + return true; + } + const dim3 block = getApplyBlock(); + + dim3 grid; + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + Tensor oldA; + + if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { + // Must perform in contiguous space + oldA = a; + a = a.contiguous(); + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. + +#define HANDLE_CASE(TYPE, A) \ + kernelPointwiseApply1 \ + <<>>( \ + aInfo, static_cast(totalElements), op); + +#define HANDLE_A_CASE(TYPE, A) { \ + switch (A) { \ + case 1: \ + HANDLE_CASE(TYPE, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, -1); \ + break; \ + } \ +} + + if (detail::canUse32BitIndexMath(a)) { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + rearrangeDims(&aInfo); + aInfo.collapseDims(); +#if CUDA_VERSION < 9000 + if (!aInfo.isContiguous()) + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + + HANDLE_A_CASE(unsigned int, aInfo.dims); + } else { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + rearrangeDims(&aInfo); + aInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. 
+ */ + if (aInfo.dims == 1) { + HANDLE_CASE(uint64_t, 1); + } else { +#if CUDA_VERSION < 9000 + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); +#endif + HANDLE_CASE(uint64_t, -1); + } + } +#undef HANDLE_CASE +#undef HANDLE_A_CASE + + if (oldA.defined()) { + // Ignore overlaps when copying back; if we use copy + // instead, it will recursively try and invoke ourselves to make + // oldA contiguous. + at::_copy_ignoring_overlaps_(oldA, a); + } + + return true; +} + +/* Provides default step = 1 to CUDA_tensor_apply1. */ +template +inline bool CUDA_tensor_apply1(at::Tensor a, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite) { + return CUDA_tensor_apply1(a, op, aType); +} + + +template +inline bool CUDA_tensor_apply2(at::Tensor a, + at::Tensor b, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly) { checkBackend("CUDA_tensor_apply2", {a, b}, Backend::CUDA); int64_t totalElements = a.numel(); @@ -286,7 +802,7 @@ bool CUDA_tensor_apply2(at::Tensor a, dim3 grid; int64_t curDevice = current_device(); if (curDevice == -1) return false; - if (!getApplyGrid(totalElements, grid, curDevice)) { + if (!getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -318,13 +834,13 @@ bool CUDA_tensor_apply2(at::Tensor a, // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. -#define HANDLE_CASE(TYPE, A, B) \ - kernelPointwiseApply2 \ +#define HANDLE_CASE(TYPE, A, B) \ + kernelPointwiseApply2 \ <<>>( \ - aInfo, bInfo, (TYPE) totalElements, op); + aInfo, bInfo, static_cast(totalElements), op); #define HANDLE_B_CASE(TYPE, A, B) { \ switch (B) { \ @@ -385,22 +901,12 @@ bool CUDA_tensor_apply2(at::Tensor a, large (64-bit indexed) tensors to reduce compilation time. */ if (aInfo.dims == 1 && bInfo.dims == 1) { - kernelPointwiseApply2 - <<>>( - aInfo, bInfo, (uint64_t) totalElements, op); + HANDLE_CASE(uint64_t, 1, 1); } else { #if CUDA_VERSION < 9000 grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif - kernelPointwiseApply2 - <<>>( - aInfo, bInfo, (uint64_t) totalElements, op); + HANDLE_CASE(uint64_t, -1, -1); } } #undef HANDLE_CASE @@ -412,7 +918,6 @@ bool CUDA_tensor_apply2(at::Tensor a, // instead, it will recursively try and invoke ourselves to make // oldA contiguous. at::_copy_ignoring_overlaps_(oldA, a); - a = oldA; } if (oldB.defined()) { @@ -420,30 +925,30 @@ bool CUDA_tensor_apply2(at::Tensor a, // instead, it will recursively try and invoke ourselves to make // oldB contiguous. at::_copy_ignoring_overlaps_(oldB, b); - b = oldB; } return true; } -/* - Apply a pointwise operator to three tensors. - - The calling convention for op is a function/functor that takes takes three references to - type scalar; at least one of these references should be non-const in order to write the output. - For example, to compute a = b + c, op would be of the form: - [] __device__ (scalar &a_val, const scalar &b_val, const scalar &c_val) { - a_val = b_val + c_val; - }; -*/ -template -bool CUDA_tensor_apply3(at::Tensor a, - at::Tensor b, - at::Tensor c, - const Op& op, - TensorArgType aType = TensorArgType::ReadWrite, - TensorArgType bType = TensorArgType::ReadOnly, - TensorArgType cType = TensorArgType::ReadOnly) { +/* Provides default step = 1 to CUDA_tensor_apply2. 
*/ +template +inline bool CUDA_tensor_apply2(at::Tensor a, + at::Tensor b, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly) { + return CUDA_tensor_apply2(a, b, op, aType, bType); +} + + +template +inline bool CUDA_tensor_apply3(at::Tensor a, + at::Tensor b, + at::Tensor c, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly, + TensorArgType cType = TensorArgType::ReadOnly) { checkBackend("CUDA_tensor_apply3", {a, b, c}, Backend::CUDA); int64_t totalElements = a.numel(); @@ -468,7 +973,7 @@ bool CUDA_tensor_apply3(at::Tensor a, dim3 grid; int64_t curDevice = current_device(); if (curDevice == -1) return false; - if (!getApplyGrid(totalElements, grid, curDevice)) { + if (!getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -497,14 +1002,14 @@ bool CUDA_tensor_apply3(at::Tensor a, c = c.contiguous(); } -#define HANDLE_CASE(TYPE, A, B, C) \ - kernelPointwiseApply3 \ +#define HANDLE_CASE(TYPE, A, B, C) \ + kernelPointwiseApply3 \ <<>>( \ - aInfo, bInfo, cInfo, (TYPE) totalElements, op); + aInfo, bInfo, cInfo, static_cast(totalElements), op); #define HANDLE_C_CASE(TYPE, A, B, C) { \ switch (C) { \ @@ -590,25 +1095,13 @@ bool CUDA_tensor_apply3(at::Tensor a, large (64-bit indexed) tensors to reduce compilation time. */ if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1) { - kernelPointwiseApply3 - <<>>( - aInfo, bInfo, cInfo, (uint64_t) totalElements, op); + HANDLE_CASE(uint64_t, 1, 1, 1); } else { #if CUDA_VERSION < 9000 grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif - kernelPointwiseApply3 - <<>>( - aInfo, bInfo, cInfo, (uint64_t) totalElements, op); + HANDLE_CASE(uint64_t, -1, -1, -1); } } #undef HANDLE_CASE @@ -643,26 +1136,31 @@ bool CUDA_tensor_apply3(at::Tensor a, return true; } -/* - Apply a pointwise operator to four tensors. - - The calling convention for op is a function/functor that takes takes four references to - type scalar; at least one of these references should be non-const in order to write the output. - For example, to compute a = b + c * d, op would be of the form: - [] __device__ (scalar &a_val, const scalar &b_val, const scalar &c_val, const scalar &d_val) { - a_val = b_val + c_val * d_val; - }; -*/ -template -bool CUDA_tensor_apply4(at::Tensor a, - at::Tensor b, - at::Tensor c, - at::Tensor d, - const Op& op, - TensorArgType aType = TensorArgType::ReadWrite, - TensorArgType bType = TensorArgType::ReadOnly, - TensorArgType cType = TensorArgType::ReadOnly, - TensorArgType dType = TensorArgType::ReadOnly) { +/* Provides default step = 1 to CUDA_tensor_apply3. 
*/ +template +inline bool CUDA_tensor_apply3(at::Tensor a, + at::Tensor b, + at::Tensor c, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly, + TensorArgType cType = TensorArgType::ReadOnly) { + return CUDA_tensor_apply3( + a, b, c, op, aType, bType, cType); +} + + +template +inline bool CUDA_tensor_apply4(at::Tensor a, + at::Tensor b, + at::Tensor c, + at::Tensor d, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly, + TensorArgType cType = TensorArgType::ReadOnly, + TensorArgType dType = TensorArgType::ReadOnly) { checkBackend("CUDA_tensor_apply4", {a, b, c, d}, Backend::CUDA); int64_t totalElements = a.numel(); @@ -689,7 +1187,7 @@ bool CUDA_tensor_apply4(at::Tensor a, dim3 grid; int64_t curDevice = current_device(); if (curDevice == -1) return false; - if (!getApplyGrid(totalElements, grid, curDevice)) { + if (!getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -724,15 +1222,15 @@ bool CUDA_tensor_apply4(at::Tensor a, d = d.contiguous(); } -#define HANDLE_CASE(TYPE, A, B, C, D) \ - kernelPointwiseApply4 \ +#define HANDLE_CASE(TYPE, A, B, C, D) \ + kernelPointwiseApply4 \ <<>>( \ - aInfo, bInfo, cInfo, dInfo, (TYPE) totalElements, op); + aInfo, bInfo, cInfo, dInfo, static_cast(totalElements), op); #define HANDLE_D_CASE(TYPE, A, B, C, D) { \ switch (D) { \ @@ -841,27 +1339,12 @@ bool CUDA_tensor_apply4(at::Tensor a, large (64-bit indexed) tensors to reduce compilation time. */ if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1 && dInfo.dims == 1) { - kernelPointwiseApply4 - <<>>( - aInfo, bInfo, cInfo, dInfo, (uint64_t) totalElements, op); + HANDLE_CASE(uint64_t, 1, 1, 1, 1); } else { #if CUDA_VERSION < 9000 grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif - - kernelPointwiseApply4 - <<>>( - aInfo, bInfo, cInfo, dInfo, (uint64_t) totalElements, op); + HANDLE_CASE(uint64_t, -1, -1, -1, -1); } } #undef HANDLE_CASE @@ -875,7 +1358,6 @@ bool CUDA_tensor_apply4(at::Tensor a, // instead, it will recursively try and invoke ourselves to make // oldA contiguous. at::_copy_ignoring_overlaps_(oldA, a); - a = oldA; } if (oldB.defined()) { @@ -883,7 +1365,6 @@ bool CUDA_tensor_apply4(at::Tensor a, // instead, it will recursively try and invoke ourselves to make // oldB contiguous. at::_copy_ignoring_overlaps_(oldB, b); - b = oldB; } if (oldC.defined()) { @@ -891,7 +1372,6 @@ bool CUDA_tensor_apply4(at::Tensor a, // instead, it will recursively try and invoke ourselves to make // oldC contiguous. at::_copy_ignoring_overlaps_(oldC, c); - c = oldC; } if (oldD.defined()) { @@ -899,11 +1379,26 @@ bool CUDA_tensor_apply4(at::Tensor a, // instead, it will recursively try and invoke ourselves to make // oldC contiguous. at::_copy_ignoring_overlaps_(oldD, c); - d = oldD; } return true; } +/* Provides default step = 1 to CUDA_tensor_apply4. 
*/
+template <typename scalar1, typename scalar2, typename scalar3, typename scalar4,
+          typename Op>
+inline bool CUDA_tensor_apply4(at::Tensor a,
+                               at::Tensor b,
+                               at::Tensor c,
+                               at::Tensor d,
+                               const Op op,
+                               TensorArgType aType = TensorArgType::ReadWrite,
+                               TensorArgType bType = TensorArgType::ReadOnly,
+                               TensorArgType cType = TensorArgType::ReadOnly,
+                               TensorArgType dType = TensorArgType::ReadOnly) {
+  return CUDA_tensor_apply4<scalar1, scalar2, scalar3, scalar4, 1, Op>(
+    a, b, c, d, op, aType, bType, cType, dType);
+}
+
 } // cuda
 } // at
diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py
index 5df218a89cc06d..01460fa2d0712f 100644
--- a/aten/src/ATen/function_wrapper.py
+++ b/aten/src/ATen/function_wrapper.py
@@ -1077,22 +1077,12 @@ def find_formal(formal_name, formals):
             # Factory methods are not dispatched over `Type`.
             if not is_factory_method:
-                if option['deprecated']:
-                    # Deprecated functions are always non-extended,
-                    # because they need to be made available from Type
-                    # (the public interface) so that code like
-                    # tensor.type().arange(...) keeps working. Once
-                    # we remove the deprecated functions, we can eliminate
-                    # these methods entirely.
-                    top_env['pure_virtual_type_method_declarations'].append(
-                        DEPRECATED_PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env))
+                if option['extended_method']:
+                    top_env['pure_virtual_extended_type_method_declarations'].append(
+                        PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env))
                 else:
-                    if option['extended_method']:
-                        top_env['pure_virtual_extended_type_method_declarations'].append(
-                            PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env))
-                    else:
-                        top_env['pure_virtual_type_method_declarations'].append(
-                            PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env))
+                    top_env['pure_virtual_type_method_declarations'].append(
+                        PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env))
             top_env['type_method_declarations'].append(TYPE_METHOD_DECLARATION_CONCRETE.substitute(env))
         dispatch = option['type_method_definition_dispatch']
         option['native_type_method_dispatch'] = dispatch
diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py
index 3112e5ff0424ab..af4ac96ea9994f 100644
--- a/aten/src/ATen/gen.py
+++ b/aten/src/ATen/gen.py
@@ -454,7 +454,7 @@ def generate_outputs():
         file_component = '{' + file_component + '}'
         update_cmd = "cp {}/{} {}".format(core_install_dir, file_component, core_source_path)
         raise RuntimeError("Source files: {} did not match generated files. To update the source files, "
-                           "run \"{}\"".format(mismatch, update_cmd))
+                           "set environment variable GEN_TO_SOURCE or run \"{}\"".format(mismatch, update_cmd))
 declare_outputs()
 if options.output_dependencies is not None:
diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp
index ccc6d8a0c04094..3a2d1da5bd9a5a 100644
--- a/aten/src/ATen/native/Distributions.cpp
+++ b/aten/src/ATen/native/Distributions.cpp
@@ -1,4 +1,5 @@
 #include "ATen/ATen.h"
+#include "ATen/Config.h"
 #include "ATen/CPUApplyUtils.h"
 #include "ATen/Dispatch.h"
 #include "ATen/ExpandUtils.h"
@@ -9,8 +10,13 @@
 #include "ATen/CheckGenerator.h"
 #include "ATen/core/Generator.h"
 #include "ATen/native/Distributions.h"
+#include "ATen/native/DispatchStub.h"
+#include "ATen/native/cpu/UnaryOpsKernel.h"
+#include
 #include
+#include
+#include
 #include "TH/THRandom.h"
 #include "TH/THGenerator.hpp"
 
@@ -48,11 +54,6 @@ namespace {
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -THGenerator* get_generator(at::Generator* gen) { - auto default_gen = &at::globalContext().defaultGenerator(at::kCPU); - auto gen_ = at::check_generator(gen, default_gen); - return gen_->generator; -} int64_t sample_poisson(double lambda, THGenerator* generator) { if (lambda >= 10) { @@ -109,49 +110,67 @@ int64_t sample_poisson(double lambda, THGenerator* generator) { namespace at { namespace native { -Tensor bernoulli(const Tensor& self, const Tensor& p, Generator* gen) { - Tensor result = self.type().tensor(); - result.resize_(self.sizes()); - return native::bernoulli_(result, p, gen); +Tensor bernoulli(const Tensor& self, Generator* gen) { + return at::empty_like(self).bernoulli_(self, gen); } Tensor bernoulli(const Tensor& self, double p, Generator* gen) { - Tensor result = self.type().tensor(); - result.resize_(self.sizes()); - return native::bernoulli_(result, p, gen); + return at::empty_like(self).bernoulli_(p, gen); } -Tensor bernoulli(const Tensor& self) { - Tensor result = self.type().tensor(); - result.resize_(self.sizes()); - return native::bernoulli(result, self, nullptr); +Tensor& bernoulli_out(Tensor& result, const Tensor& self, Generator* gen) { + // result.resize_as_(self) requires self to have same dtype as result, so we + // use resize_ instead. + // TODO: Fix resize_as_. See pytorch/pytorch#11665. + return result.resize_(self.sizes()).bernoulli_(self, gen); } -Tensor& bernoulli_(Tensor& self, const Tensor& p_, Generator* gen) { - if (!self.is_cuda() && !p_.is_cuda()) { - Tensor p = p_.toType(kDouble); - AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_", [&] { - THGenerator* generator = get_generator(gen); - std::lock_guard lock(generator->mutex); - CPU_tensor_apply2( - self, p, [generator](scalar_t& ret_val, double& p_val) { - ret_val = (scalar_t)THRandom_bernoulli(generator, p_val); +Tensor& bernoulli_tensor_cpu_(Tensor& self, const Tensor& p_, Generator* gen) { + AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_tensor_cpu_self_", [&] { + THGenerator* generator = get_generator(gen); + std::lock_guard lock(generator->mutex); + using self_t = scalar_t; + if (p_.type().scalarType() == kDouble) { + auto p = std::get<0>(expand_inplace(self, p_.to(kCPU))); + CPU_tensor_apply2( + self, p, [generator](self_t& ret_val, double& p_val) { + ret_val = static_cast(THRandom_bernoulli(generator, p_val)); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(p_.type(), "bernoulli_tensor_cpu_p_", [&] { + auto p = std::get<0>(expand_inplace(self, p_.to(kCPU))); + using p_t = scalar_t; + CPU_tensor_apply2( + self, p, [generator](self_t& ret_val, p_t& p_val) { + ret_val = static_cast(THRandom_bernoulliFloat(generator, static_cast(p_val))); }); - }); - return self; - } - self.copy_(at::_th_bernoulli(std::get<0>(expand_inplace(self, p_)), gen)); + }); + } + }); return self; } -Tensor& bernoulli_(Tensor& self, double p, Generator* gen) { - at::_bernoulli_(self, p, gen); +DEFINE_DISPATCH(bernoulli_mkl_stub); + +Tensor& bernoulli_scalar_cpu_(Tensor& self, double p, Generator* gen) { + AT_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); +#if AT_MKL_ENABLED() + if (cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { + bernoulli_mkl_stub(kCPU, self, p, gen); return self; + } +#endif + AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_scalar_cpu_", [&] { + THGenerator* generator = get_generator(gen); + std::lock_guard lock(generator->mutex); + CPU_tensor_apply1( + self, [generator, p](scalar_t& ret_val) { + ret_val = 
static_cast(THRandom_bernoulli(generator, p)); + }); + }); + return self; } -Tensor& bernoulli_(Tensor& self) { - return native::bernoulli_(self, 0.5, nullptr); -} Tensor _standard_gamma_grad_cpu(const Tensor& self, const Tensor& output) { Tensor ret = self.type().tensor(self.sizes()); diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index c374740a3ce7d1..5cee088d55d4ff 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,8 +1,26 @@ +#pragma once + #include "TH/THMath.h" #ifdef __CUDA_ARCH__ #include #endif +#include "ATen/ATen.h" +#include "ATen/CPUGenerator.h" +#include "ATen/CheckGenerator.h" +#include "ATen/Generator.h" +#include "TH/THGenerator.hpp" + +namespace at {namespace native { + +static inline THGenerator* get_generator(at::Generator* gen) { + auto default_gen = &at::globalContext().defaultGenerator(at::kCPU); + auto gen_ = at::check_generator(gen, default_gen); + return gen_->generator; +} + +}} // namespace at::native + namespace { #ifdef __CUDA_ARCH__ diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 50726cb99b81b9..04bf617081387b 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -307,7 +307,7 @@ static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& } else if (contraction_size == 0) { return self_or_result.zero_(); } - + auto batch_items_contiguous_or_transposed = [&](const Tensor& t) { return (t.stride(2) == 1 && t.stride(1) == t.size(2)) || (t.stride(1) == 1 && t.stride(2) == t.size(1)); @@ -536,5 +536,55 @@ Tensor matrix_power(const Tensor& a, int64_t n) { return result; } +Tensor frobenius_norm(const Tensor& self) { + return at::norm(self); +} + +Tensor frobenius_norm(const Tensor& self, IntList dim, bool keepdim) { + AT_CHECK( + dim.size() <= 2, + "Expected at most 2 dimensions, but got ", + dim.size(), + " dimensions instead."); + if (dim.size() == 1) { + return at::norm(self, 2, dim[0], keepdim); + } + return at::sqrt(at::sum(self * self, dim, keepdim)); +} + +Tensor &frobenius_norm_out( + Tensor& result, + const Tensor& self, + IntList dim, + bool keepdim) { + AT_CHECK( + dim.size() <= 2, + "Expected at most 2 dimensions, but got ", + dim.size(), + " dimensions instead."); + if (dim.size() == 1) { + return at::norm_out(result, self, 2, dim[0], keepdim); + } + return at::sqrt_out(result, at::sum(self * self, dim, keepdim)); +} + +Tensor nuclear_norm(const Tensor& self, bool keepdim) { + AT_CHECK( + self.dim() == 2, + "Expected a tensor with 2 dimensions, but got a ", + self.dim(), + " dimensions tensor instead."); + return at::sum(std::get<1>(at::svd(self)), 0, keepdim); +} + +Tensor &nuclear_norm_out(Tensor& result, const Tensor& self, bool keepdim) { + AT_CHECK( + self.dim() == 2, + "Expected a tensor with 2 dimensions, but got a ", + self.dim(), + " dimensions tensor instead."); + return at::sum_out(result, std::get<1>(at::svd(self)), 0, keepdim); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/RNN.h b/aten/src/ATen/native/RNN.h index 3fc89993404a9c..a4a359a073802e 100644 --- a/aten/src/ATen/native/RNN.h +++ b/aten/src/ATen/native/RNN.h @@ -19,5 +19,19 @@ DECLARE_DISPATCH(rnn_packed_fn, gru_packed_cudnn_stub); DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_cudnn_stub); DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_cudnn_stub); -}} // namespace at::native +inline void check_device(const Tensor& input, const TensorList& 
params, const TensorList& hiddens) { + auto input_device = input.device(); + + auto check_tensors = [&](const std::string& name, const Tensor& t) { + if (!t.defined()) return; + auto t_device = t.device(); + AT_CHECK(input_device == t_device, + "Input and ", name, " tensors are not at the same device, found input tensor at ", + input_device, " and ", name, " tensor at ", t_device); + }; + for (auto h : hiddens) check_tensors("hidden", h); + for (auto p : params) check_tensors("parameter", p); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index aebb021696084f..ee2e5c98706442 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -51,13 +51,14 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { } if (LogSoftMax) - tmpsum = max_input + std::log(tmpsum); + tmpsum = std::log(tmpsum); else tmpsum = 1 / tmpsum; for (int64_t d = 0; d < dim_size; d++) if (LogSoftMax) - output_data[d * dim_stride] = input_data[d * dim_stride] - tmpsum; + output_data[d * dim_stride] = + input_data[d * dim_stride] - max_input - tmpsum; else output_data[d * dim_stride] *= tmpsum; } diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 6733a94db3454d..57618f0fdafe12 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -1,14 +1,27 @@ #include "ATen/native/cpu/UnaryOpsKernel.h" #include +#include +#include "ATen/Config.h" #include "ATen/Dispatch.h" +#include "ATen/CPUGenerator.h" +#include "ATen/CheckGenerator.h" +#include "ATen/Generator.h" #include "ATen/cpu/vml.h" #include "ATen/CPUApplyUtils.h" #include "ATen/native/DispatchStub.h" +#include "ATen/native/Distributions.h" #ifdef __AVX2__ #include "ATen/native/cpu/avx_mathfun.h" #endif +#if AT_MKL_ENABLED() +#include +#endif + +#include "TH/THGenerator.hpp" +#include "TH/THRandom.h" + namespace at { namespace native { namespace { @@ -102,6 +115,65 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) { }); } + +#if !AT_MKL_ENABLED() +void bernoulli_mkl_kernel(Tensor &output, const double p, Generator* gen) { + // Use AT_ASSERTM because this should never be reached, and AT_ASSERTM tells + // users to report this as a bug. 
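+ // (Reaching this stub would indicate a dispatch bug: bernoulli_scalar_cpu_ in Distributions.cpp only calls bernoulli_mkl_stub behind an AT_MKL_ENABLED() guard.)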
+ AT_ASSERTM(false, "ATen not compiled with MKL"); +} +#else +void bernoulli_mkl_kernel(Tensor &self, const double p, Generator* gen) { + THGenerator* generator = get_generator(gen); + int64_t seed; + { + std::lock_guard<std::mutex> lock(generator->mutex); + seed = THRandom_random(generator); + } + int64_t n = self.numel(); + bool contig = self.is_contiguous(); + + AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_scalar_cpu_", [&] { + at::Tensor tmp_int_tensor; + if (std::is_same<scalar_t, int>::value && contig) { + tmp_int_tensor = self; + } else { + tmp_int_tensor = at::empty(self.sizes(), self.options().dtype(at::kInt)); + } + + scalar_t *self_ptr = self.data<scalar_t>(); + int *sample_int_ptr = tmp_int_tensor.data<int>(); + + auto sample = [&](int64_t begin, int64_t end) { + int64_t len = end - begin; + if (len > 0) { + VSLStreamStatePtr stream; + vslNewStream(&stream, VSL_BRNG_MCG31, seed); + vslSkipAheadStream(stream, begin); + viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, len, + sample_int_ptr + begin, p); + vslDeleteStream(&stream); + + // vectorized copy from the int buffer, used when self is a non-int + // type but contiguous + if (!std::is_same<scalar_t, int>::value && contig) { + scalar_t *self_seg = self_ptr + begin; + int* tmp_seg = sample_int_ptr + begin; + at::vec256::convert<int, scalar_t>(tmp_seg, self_seg, len); + } + } + }; + + parallel_for(0, n, /* grain_size= */ 800, sample); + + // copy_ from the int buffer when self is non-contiguous + if (!contig) { + self.copy_(tmp_int_tensor); + } + }); +} +#endif + #define IMPLEMENT_FLOAT_KERNEL(dispatchtypes, op) \ static void op##_kernel(Tensor& result, const Tensor& self) { \ checkBackend(#op, {result}, Backend::CPU); \ @@ -143,6 +215,7 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) { } // anonymous namespace REGISTER_DISPATCH(sigmoidImpl, &sigmoid_kernel) +REGISTER_DISPATCH(bernoulli_mkl_stub, &bernoulli_mkl_kernel); // IMPLEMENT_FLOAT_KERNEL(ALL, abs) IMPLEMENT_FLOAT_KERNEL(FLOATING, acos) diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.h b/aten/src/ATen/native/cpu/UnaryOpsKernel.h index 157dda8b2598ae..e1809d770d2c18 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.h +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace at { namespace native { @@ -34,6 +35,8 @@ DECLARE_DISPATCH(unary_fn, tanImpl); DECLARE_DISPATCH(unary_fn, tanhImpl); DECLARE_DISPATCH(unary_fn, truncImpl); +DECLARE_DISPATCH(void(*)(Tensor&, const double, Generator *), bernoulli_mkl_stub); + // Missing unary functions // digamma diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index 4b346cae42c109..fc908714f18f28 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -1,4 +1,5 @@ #include "ATen/Dispatch.h" +#include "ATen/ExpandUtils.h" #include "ATen/NativeFunctions.h" #include "ATen/cuda/CUDAApplyUtils.cuh" #include "ATen/AccumulateType.h" @@ -16,14 +17,18 @@ #include #include #include +#include #include #include #include +#include THCGenerator* THCRandom_getGenerator(THCState* state); namespace { +// increment should be at least the number of curand() random numbers used in +// each thread.
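+// For example, each thread in the Bernoulli kernels below consumes four random numbers from a single curand_uniform4() call, so any increment of at least 4 keeps the per-thread subsequences disjoint; the callers below conservatively pass 10.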
std::pair<uint64_t, uint64_t> next_philox_seed(at::Generator* gen, uint64_t increment) { auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState()); uint64_t offset = gen_->state.philox_seed_offset.fetch_add(increment); @@ -92,6 +97,87 @@ void gamma_grad_cuda_kernel( }); } +template <typename scalar_t, typename prob_t> +void bernoulli_tensor_cuda_kernel( + at::Tensor& ret, const at::Tensor& p, + std::pair<uint64_t, uint64_t> seeds) { + // The template argument `4` below indicates that we want to operate on four + // elements at a time. See NOTE [ CUDA_tensor_applyN helpers ] for details. + at::cuda::CUDA_tensor_apply2<scalar_t, prob_t, 4>( + ret, p, + [seeds] __device__( + int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4, + const prob_t& p1, const prob_t& p2, const prob_t& p3, const prob_t& p4) { + curandStatePhilox4_32_10_t state; + curand_init( + seeds.first, + blockIdx.x * blockDim.x + threadIdx.x, + seeds.second, + &state); + float4 rand = curand_uniform4(&state); + switch (n) { + case 4: { + assert(0 <= p4 && p4 <= 1); + v4 = static_cast<scalar_t>(rand.w <= p4); + // fallthrough + } + case 3: { + assert(0 <= p3 && p3 <= 1); + v3 = static_cast<scalar_t>(rand.z <= p3); + // fallthrough + } + case 2: { + assert(0 <= p2 && p2 <= 1); + v2 = static_cast<scalar_t>(rand.y <= p2); + // fallthrough + } + case 1: { + assert(0 <= p1 && p1 <= 1); + v1 = static_cast<scalar_t>(rand.x <= p1); + } + } + } + ); +} + +template <typename scalar_t> +void bernoulli_scalar_cuda_kernel( + at::Tensor& ret, double p_, + std::pair<uint64_t, uint64_t> seeds) { + float p = static_cast<float>(p_); + // The template argument `4` below indicates that we want to operate on four + // elements at a time. See NOTE [ CUDA_tensor_applyN helpers ] for details. + at::cuda::CUDA_tensor_apply1<scalar_t, 4>( + ret, [seeds, p] __device__( + int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4) { + curandStatePhilox4_32_10_t state; + curand_init( + seeds.first, + blockIdx.x * blockDim.x + threadIdx.x, + seeds.second, + &state); + float4 rand = curand_uniform4(&state); + switch (n) { + case 4: { + v4 = static_cast<scalar_t>(rand.w <= p); + // fallthrough + } + case 3: { + v3 = static_cast<scalar_t>(rand.z <= p); + // fallthrough + } + case 2: { + v2 = static_cast<scalar_t>(rand.y <= p); + // fallthrough + } + case 1: { + v1 = static_cast<scalar_t>(rand.x <= p); + } + } + } + ); +} + } // namespace namespace at { namespace native { @@ -119,4 +205,28 @@ Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { return ret; } +Tensor& bernoulli_tensor_cuda_(Tensor &self, const Tensor& p_, Generator* gen) { + auto p = std::get<0>(expand_inplace(self, p_.to(kCUDA))); + AT_DISPATCH_ALL_TYPES_AND_HALF(self.type(), "bernoulli_tensor_cuda_self_", [&] { + const at::Type& p_type = p.type(); + using self_t = scalar_t; + auto seeds = next_philox_seed(gen, 10); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(p.type(), "bernoulli_tensor_cuda_p_", [&] { + using p_t = scalar_t; + return bernoulli_tensor_cuda_kernel<self_t, p_t>(self, p, seeds); + }); + }); + return self; +} + +Tensor& bernoulli_scalar_cuda_(Tensor &self, double p, Generator* gen) { + AT_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); + AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_scalar_cuda_", [&] { + auto seeds = next_philox_seed(gen, 10); + bernoulli_scalar_cuda_kernel<scalar_t>(self, p, seeds); + }); + return self; +} + + }} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 876590409c43c4..09c9365793ec75 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #if !AT_CUDNN_ENABLED() @@ -655,6 +656,7 @@
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _cudnn_rnn( const Tensor& fn_dropout_state ) { + check_device(input_r, weight, {hx, cx}); auto input = input_r; auto weight_buf = weight_buf_r; if (fn_dropout_state.defined()) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 5c99d7c97e9b3e..fd9d8d6e1fed2b 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -165,12 +165,6 @@ - func: arange_out(Tensor result, Scalar end) -> Tensor -- func: arange(Type dtype, Scalar start, Scalar end, Scalar step=1) -> Tensor - deprecated: true - -- func: arange(Type dtype, Scalar end) -> Tensor - deprecated: true - # This function is a temporary hack to allow tracing of arange like constructs with dynamic # bounds on arange. Normal arange is not traceable because it does not take any tensor inputs; # if the range you need is based on another tensor, calling this function directly will @@ -272,22 +266,29 @@ - func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double momentum, double eps, bool cudnn_enabled) -> Tensor -- func: bernoulli(Tensor self, Tensor p, Generator* generator=nullptr) -> Tensor +# Sample Bernoulli, using the values in `self` as probabilities. +- func: bernoulli(Tensor self, *, Generator* generator=nullptr) -> Tensor variants: function, method -- func: bernoulli(Tensor self, double p, Generator* generator=nullptr) -> Tensor - variants: function, method - -- func: bernoulli(Tensor self) -> Tensor - variants: function, method +- func: bernoulli_out(Tensor result, Tensor self, *, Generator* generator=nullptr) -> Tensor + variants: function -- func: bernoulli_(Tensor self, Tensor p, Generator* generator=nullptr) -> Tensor - variants: function, method +- func: bernoulli_(Tensor self, Tensor p, *, Generator* generator=nullptr) -> Tensor + variants: method + dispatch: + CPU: bernoulli_tensor_cpu_ + CUDA: bernoulli_tensor_cuda_ -- func: bernoulli_(Tensor self, double p, Generator* generator=nullptr) -> Tensor - variants: function, method +- func: bernoulli_(Tensor self, double p=0.5, *, Generator* generator=nullptr) -> Tensor + variants: method + dispatch: + CPU: bernoulli_scalar_cpu_ + CUDA: bernoulli_scalar_cuda_ -- func: bernoulli_(Tensor self) -> Tensor +# This out-of-place version isn't used explicitly, but is needed by the JIT. +# There is no default value for `p` here because it would introduce ambiguity +# with the `bernoulli(Tensor self, *, Generator* generator=nullptr)` declaration. +- func: bernoulli(Tensor self, double p, *, Generator* generator=nullptr) -> Tensor variants: function, method - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? 
bias) -> Tensor @@ -648,9 +649,6 @@ - func: empty_like(Tensor self, *, TensorOptions options) -> Tensor -- func: empty(Type dtype, IntList size) -> Tensor - deprecated: true - - func: erf(Tensor self) -> Tensor variants: function, method @@ -727,9 +725,6 @@ CPU: eye_out_cpu CUDA: eye_out_cuda -- func: eye(Type dtype, int64_t n, int64_t m=-1) -> Tensor - deprecated: true - - func: flatten(Tensor self, int64_t start_dim=0, int64_t end_dim=-1) -> Tensor variants: function, method @@ -761,9 +756,6 @@ - func: full_like(Tensor self, Scalar fill_value, *, TensorOptions options) -> Tensor -- func: full(Type dtype, IntList size, Scalar fill_value) -> Tensor - deprecated: true - # NOTE [ grid_sampler Native Functions ] # `grid_sampler` does all the shape checking and then dispatches to one of # `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which @@ -940,9 +932,6 @@ - func: linspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor -- func: linspace(Type dtype, Scalar start, Scalar end, int64_t steps=100) -> Tensor - deprecated: true - - func: log(Tensor self) -> Tensor variants: function, method @@ -1014,9 +1003,6 @@ - func: logspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor -- func: logspace(Type dtype, Scalar start, Scalar end, int64_t steps=100) -> Tensor - deprecated: true - - func: log_softmax(Tensor self, int64_t dim) -> Tensor variants: function, method dispatch: @@ -1199,9 +1185,6 @@ - func: ones_like(Tensor self, *, TensorOptions options) -> Tensor -- func: ones(Type dtype, IntList size) -> Tensor - deprecated: true - - func: pairwise_distance(Tensor x1, Tensor x2, double p=2, double eps=1e-6, bool keepdim=false) -> Tensor - func: pdist(Tensor self, double p=2) -> Tensor @@ -1233,9 +1216,6 @@ - func: rand_like(Tensor self, *, TensorOptions options) -> Tensor -- func: rand(Type dtype, IntList size, *, Generator* generator=nullptr) -> Tensor - deprecated: true - - func: randint(int64_t high, IntList size, *, TensorOptions options={}) -> Tensor - func: randint(int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor @@ -1244,12 +1224,6 @@ - func: randint(int64_t low, int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor -- func: randint(Type dtype, int64_t high, IntList size, *, Generator* generator=nullptr) -> Tensor - deprecated: true - -- func: randint(Type dtype, int64_t low, int64_t high, IntList size, *, Generator* generator=nullptr) -> Tensor - deprecated: true - - func: randint_out(Tensor result, int64_t high, IntList size, *) -> Tensor - func: randint_out(Tensor result, int64_t high, IntList size, *, Generator* generator) -> Tensor @@ -1278,9 +1252,6 @@ - func: randn_like(Tensor self, *, TensorOptions options) -> Tensor -- func: randn(Type dtype, IntList size, *, Generator* generator=nullptr) -> Tensor - deprecated: true - - func: randperm(int64_t n, *, TensorOptions options={}) -> Tensor - func: randperm(int64_t n, *, Generator* generator, TensorOptions options={}) -> Tensor @@ -1292,9 +1263,6 @@ CPU: randperm_out_cpu CUDA: randperm_out_cuda -- func: randperm(Type dtype, int64_t n, *, Generator* generator=nullptr) -> Tensor - deprecated: true - - func: range(Scalar start, Scalar end, TensorOptions options={}) -> Tensor - func: range(Scalar start, Scalar end, Scalar step, TensorOptions options={}) -> Tensor @@ -1303,9 +1271,6 @@ - func: range_out(Tensor result, Scalar start, Scalar end, Scalar step) -> Tensor -- func: range(Type dtype, Scalar 
start, Scalar end, Scalar step=1) -> Tensor - deprecated: true - - func: repeat(Tensor self, IntList repeats) -> Tensor variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. @@ -1780,9 +1745,6 @@ - func: zeros_like(Tensor self, *, TensorOptions options) -> Tensor -- func: zeros(Type dtype, IntList size) -> Tensor - deprecated: true - - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor variants: function dispatch: @@ -1820,6 +1782,21 @@ python_default_init: p: 2 +- func: frobenius_norm(Tensor self) -> Tensor + variants: function + +- func: frobenius_norm(Tensor self, IntList[1] dim, bool keepdim=false) -> Tensor + variants: function + +- func: frobenius_norm_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim=false) -> Tensor + variants: function + +- func: nuclear_norm(Tensor self, bool keepdim=false) -> Tensor + variants: function + +- func: nuclear_norm_out(Tensor result, Tensor self, bool keepdim=false) -> Tensor + variants: function + - func: native_clone(Tensor self) -> Tensor dispatch: SparseCPU: clone_sparse diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 25fd4fc5df4326..49efed2a1e066b 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -257,12 +257,16 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { AT_ASSERT(!self.is_variable()); AT_ASSERT(self.is_sparse()); - if (self._nnz() < 2) { - _get_sparse_impl(self)->set_coalesced(true); - } if (self.is_coalesced()) { return self; } + // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, + // we should keep the original tensor intact and do coalesce on a copy of the tensor + if (self._nnz() < 2) { + SparseTensor dst = self.clone(); + _get_sparse_impl(dst)->set_coalesced(true); + return dst; + } LongTensor indices = self._indices(); Tensor values = self._values().contiguous(); @@ -306,13 +310,17 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { int64_t pos = indicesPermutationAccessor[j]; int64_t curr = indicesBufferAccessor[j]; if (curr == prev) { - THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy + THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } } else { ++i; for (int64_t d = 0; d < sparseDims; d++) { newIndicesAccessor[d][i] = indicesAccessor[d][pos]; } - THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy + THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } } prev = curr; } @@ -345,6 +353,10 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); int64_t r_nnz = mask._nnz(); _get_sparse_impl(r)->set_nnz_and_narrow(r_nnz); + if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements + return r; + } + // NB: Relies on mask._nnz() == 0 test above auto mask_indices_accessor = mask_indices.accessor(); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index afd8001734a9a8..ec074b5a6c8a88 100644 --- 
a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -250,18 +250,22 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; } - THBlas_axpy(blockSize, 1, - t_values_ptr + t_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); + if (t_values.numel() > 0) { // We add all elements from t_values to r_values only if t_values is not an empty tensor + THBlas_axpy(blockSize, 1, + t_values_ptr + t_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + } t_i++; } if (cmp <= 0) { for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = src_indices_accessor[d][s_i]; } - THBlas_axpy(blockSize, cast_value, - s_values_ptr + s_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); + if (s_values.numel() > 0) { // We add all elements from s_values to r_values only if s_values is not an empty tensor + THBlas_axpy(blockSize, cast_value, + s_values_ptr + s_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + } s_i++; } r_i++; @@ -368,6 +372,7 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); if (src_._nnz() == 0 || t_._nnz() == 0) { + r.resize_as_(src_); return r.zero_(); } @@ -519,7 +524,6 @@ Tensor& s_addmm_out_sparse_dense_cpu( AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D tensor"); AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); - AT_CHECK(dense.numel() != 0, "addmm: matrices expected, got empty tensor"); AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); SparseTensor sparse = sparse_.coalesce(); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index fc53c41b56f727..107a30f51c2a97 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -25,6 +25,9 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars _alias_into_sparse(r, mask_indices.clone(), r_values); _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); _get_sparse_impl(r)->set_nnz_and_narrow(mask._nnz()); + if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements + return r; + } LongTensor indices = at::zeros({mask._nnz()}, mask_indices.options()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index 751aaf7cae291e..23cc9a944a35c1 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -26,12 +26,16 @@ namespace at { namespace native { SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { int64_t nnz = self._nnz(); - if (nnz < 2) { - _get_sparse_impl(self)->set_coalesced(true); - } if (self.is_coalesced()) { return self; } + // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, + // we should keep the original tensor intact and do coalesce on a copy of the tensor + if (nnz < 2) { + SparseTensor dst = self.clone(); + _get_sparse_impl(dst)->set_coalesced(true); + return dst; + } cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = 
THCThrustAllocator(globalContext().lazyInitCUDA()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index c5664e8ba3afbe..15d9afc04307a2 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -103,37 +103,38 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT r__.transpose_(0, 1); } - /* dense */ - Tensor dense_; - char transpose_dense; - if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { - transpose_dense = 'n'; - dense_ = dense; - } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { - transpose_dense = 't'; - dense_ = dense; - } else { - transpose_dense = 't'; - dense_ = dense.contiguous(); - } - - sparse::cuda::csrmm2( - 'n', - transpose_dense, - m, - n, - k, - nnz, - cast_alpha, - values.data(), - csr.data(), - colIndicesInt.data(), - dense_.data(), - (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), - cast_beta, - r__.data(), - r__.stride(1)); + if (nnz > 0) { + /* dense */ + Tensor dense_; + char transpose_dense; + if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { + transpose_dense = 'n'; + dense_ = dense; + } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { + transpose_dense = 't'; + dense_ = dense; + } else { + transpose_dense = 't'; + dense_ = dense.contiguous(); + } + sparse::cuda::csrmm2( + 'n', + transpose_dense, + m, + n, + k, + nnz, + cast_alpha, + values.data(), + csr.data(), + colIndicesInt.data(), + dense_.data(), + (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), + cast_beta, + r__.data(), + r__.stride(1)); + } }); r_.copy_(r__); @@ -270,6 +271,10 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR int64_t nDim = dense.dim(); int64_t nDimI = sparse._sparseDims(); + if (sparse._values().numel() == 0) { + return r_; + } + if (sparse.is_coalesced()) { // TODO benchmark to decide whether to remove this special case const dim3 block = cuda::getApplyBlock(); @@ -404,6 +409,7 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons SparseTensor src = src_.coalesce(); if (src_._nnz() == 0 || t_._nnz() == 0) { + r_.resize_as_(src_); return r_.zero_(); } diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py index 98b22c7f8e1d6a..d27d0da7240fc9 100644 --- a/aten/src/ATen/preprocess_declarations.py +++ b/aten/src/ATen/preprocess_declarations.py @@ -220,8 +220,6 @@ def signature(option, i=None, value=None): def is_extended_method(option): if 'method' in option['variants']: return False - elif option.get('deprecated', False): - return False elif not option['variants']: return False else: diff --git a/aten/src/TH/THRandom.cpp b/aten/src/TH/THRandom.cpp index 6477b13fc46b34..167e1418d3b551 100644 --- a/aten/src/TH/THRandom.cpp +++ b/aten/src/TH/THRandom.cpp @@ -228,14 +228,14 @@ static uint32_t FLOAT_MASK = (1 << 24) - 1; static float FLOAT_DIVISOR = 1.0f / (1 << 24); /* generates a random number on [0,1)-double-interval */ -static double uniform_double(THGenerator *_generator) +static inline double uniform_double(THGenerator *_generator) { uint64_t x = THRandom_random64(_generator); return (x & DOUBLE_MASK) * DOUBLE_DIVISOR; } /* generates a random number on [0,1)-double-interval */ -static float uniform_float(THGenerator *_generator) +static inline float uniform_float(THGenerator *_generator) { uint32_t x = 
(uint32_t)THRandom_random(_generator); return (x & FLOAT_MASK) * FLOAT_DIVISOR; @@ -314,3 +314,9 @@ int THRandom_bernoulli(THGenerator *_generator, double p) THArgCheck(p >= 0 && p <= 1, 1, "must be >= 0 and <= 1"); return(uniform_double(_generator) <= p); } + +int THRandom_bernoulliFloat(THGenerator *_generator, float p) +{ + THArgCheck(p >= 0 && p <= 1, 1, "must be >= 0 and <= 1"); + return(uniform_float(_generator) <= p); +} diff --git a/aten/src/TH/THRandom.h b/aten/src/TH/THRandom.h index 3641397635521b..68f0ceb3ed4ab0 100644 --- a/aten/src/TH/THRandom.h +++ b/aten/src/TH/THRandom.h @@ -77,7 +77,10 @@ TH_API double THRandom_logNormal(THGenerator *_generator, double mean, double st */ TH_API int THRandom_geometric(THGenerator *_generator, double p); -/* Returns true with probability $p$ and false with probability $1-p$ (p > 0). */ +/* Returns true with double probability $p$ and false with probability $1-p$ (p > 0). */ TH_API int THRandom_bernoulli(THGenerator *_generator, double p); +/* Returns true with float probability $p$ and false with probability $1-p$ (p > 0). */ +TH_API int THRandom_bernoulliFloat(THGenerator *_generator, float p); + #endif diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index fd2c9d0341df8b..84ead952385978 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -59,107 +59,6 @@ void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p) TH_TENSOR_APPLY(scalar_t, self, *self_data = (scalar_t)THRandom_geometric(_generator, p);); } -#ifdef TH_BLAS_MKL -#define BERNOULLI_OMP 800 -#define TH_OMP_OVERHEAD_THRESHOLD_COPY 20000 - -void THTensor_(iBernoulli_generate_copy)(THTensor *self, THGenerator *_generator, const double p) -{ - int64_t seed = THRandom_random(_generator); - int64_t n = THTensor_(nElement)(self); - int contig = THTensor_(isContiguous)(self); - int *tmp = NULL; - THIntTensor* intTensor = NULL; - - if (contig) { -#ifdef TH_REAL_IS_INT - tmp = THIntTensor_data(self); -#else - tmp = (int*)THAlloc(n*sizeof(int)); -#endif - } else { - intTensor = THIntTensor_new(); - THIntTensor_resizeNd(intTensor, self->dim(), THTensor_getSizePtr(self), nullptr); - tmp = THIntTensor_data(intTensor); - } - -#ifdef _OPENMP - size_t nthr = !omp_in_parallel() && n >= BERNOULLI_OMP ? omp_get_num_threads() : 1; -#pragma omp parallel num_threads(nthr) firstprivate(nthr) - { - size_t tid = omp_get_thread_num(); - int64_t seg_len_tmp = n / nthr; - int64_t line_index_offset = tid * seg_len_tmp; - int64_t line_seg_len = (tid == nthr - 1)? 
(n-line_index_offset) : seg_len_tmp; -#else - { - int64_t line_index_offset = 0; - int64_t line_seg_len = n; -#endif - - if (line_seg_len > 0) { - VSLStreamStatePtr stream; - vslNewStream(&stream, VSL_BRNG_MCG31, seed); - vslSkipAheadStream(stream, line_index_offset); - viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, line_seg_len, - tmp + line_index_offset, p); - vslDeleteStream(&stream); - -#ifndef TH_REAL_IS_INT - if (contig) { - scalar_t* self_seg = self->data() + line_index_offset; - int* tmp_seg = tmp + line_index_offset; - THVector_(cvtFromInt)(self_seg, tmp_seg, line_seg_len); - } -#endif - } - } - - if(contig) { -#ifndef TH_REAL_IS_INT - THFree(tmp); -#endif - } else { -#ifdef _OPENMP - TH_TENSOR_APPLY2_OMP(n, 1, 0, int, intTensor, scalar_t, self, *self_data = *intTensor_data;, TH_OMP_OVERHEAD_THRESHOLD_COPY) -#else - TH_TENSOR_APPLY2(int, intTensor, scalar_t, self, *self_data = *intTensor_data;) -#endif - THIntTensor_free(intTensor); - } - -} - -#endif - -void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p) -{ -#ifdef TH_BLAS_MKL - if(cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { - std::lock_guard lock(_generator->mutex); - THTensor_(iBernoulli_generate_copy)(self, _generator, p); - } else { - std::lock_guard lock(_generator->mutex); - TH_TENSOR_APPLY(scalar_t, self, *self_data = (scalar_t)THRandom_bernoulli(_generator, p);); - } -#else - std::lock_guard lock(_generator->mutex); - TH_TENSOR_APPLY(scalar_t, self, *self_data = (scalar_t)THRandom_bernoulli(_generator, p);); -#endif -} - -void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p) -{ - std::lock_guard lock(_generator->mutex); - TH_TENSOR_APPLY2(scalar_t, self, float, p, *self_data = (scalar_t)THRandom_bernoulli(_generator, (double)*p_data);); -} - -void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p) -{ - std::lock_guard lock(_generator->mutex); - TH_TENSOR_APPLY2(scalar_t, self, double, p, *self_data = (scalar_t)THRandom_bernoulli(_generator, (double)*p_data);); -} - #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) #if defined(TH_REAL_IS_FLOAT) @@ -168,15 +67,6 @@ void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, #define TH_REAL_MIN DBL_MIN #endif -void THTensor_(bernoulli_Tensor)(THTensor *self, THGenerator *_generator, THTensor* p) -{ -#if defined(TH_REAL_IS_FLOAT) - THTensor_(bernoulli_FloatTensor)(self, _generator, p); -#else - THTensor_(bernoulli_DoubleTensor)(self, _generator, p); -#endif -} - void THTensor_(uniform)(THTensor *self, THGenerator *_generator, double a, double b) { std::lock_guard lock(_generator->mutex); diff --git a/aten/src/TH/generic/THTensorRandom.h b/aten/src/TH/generic/THTensorRandom.h index dc6bdafaae703c..fb68141930e168 100644 --- a/aten/src/TH/generic/THTensorRandom.h +++ b/aten/src/TH/generic/THTensorRandom.h @@ -6,9 +6,6 @@ TH_API void THTensor_(random)(THTensor *self, THGenerator *_generator); TH_API void THTensor_(clampedRandom)(THTensor *self, THGenerator *_generator, int64_t min, int64_t max); TH_API void THTensor_(cappedRandom)(THTensor *self, THGenerator *_generator, int64_t max); TH_API void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p); -TH_API void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p); -TH_API void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p); -TH_API void 
THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p); #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) TH_API void THTensor_(bernoulli_Tensor)(THTensor *self, THGenerator *_generator, THTensor *p); diff --git a/aten/src/TH/generic/THVector.h b/aten/src/TH/generic/THVector.h index df92994e3ffaf1..32dfec0b054e0b 100644 --- a/aten/src/TH/generic/THVector.h +++ b/aten/src/TH/generic/THVector.h @@ -15,13 +15,10 @@ TH_API void THVector_(divs)(scalar_t *y, const scalar_t *x, const scalar_t c, co TH_API void THVector_(copy)(scalar_t *y, const scalar_t *x, const ptrdiff_t n); TH_API void THVector_(neg)(scalar_t *y, const scalar_t *x, const ptrdiff_t n); TH_API void THVector_(normal_fill)(scalar_t *data, - const int64_t size, - struct THGenerator *generator, - const scalar_t mean, - const scalar_t stddev); -#ifndef TH_REAL_IS_INT -TH_API void THVector_(cvtFromInt)(scalar_t *y, const int *x, const ptrdiff_t n); -#endif + const int64_t size, + struct THGenerator *generator, + const scalar_t mean, + const scalar_t stddev); #if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) TH_API void THVector_(abs)(scalar_t *y, const scalar_t *x, const ptrdiff_t n); diff --git a/aten/src/TH/generic/THVectorDefault.cpp b/aten/src/TH/generic/THVectorDefault.cpp index dba380c6b84bb9..f1c0c8f57cb047 100644 --- a/aten/src/TH/generic/THVectorDefault.cpp +++ b/aten/src/TH/generic/THVectorDefault.cpp @@ -130,24 +130,6 @@ void THVector_(divs_DEFAULT)(scalar_t *y, const scalar_t *x, const scalar_t c, c y[i] = x[i] / c; } -#ifndef TH_REAL_IS_INT -void THVector_(cvtFromInt_DEFAULT)(scalar_t *y, const int *x, const ptrdiff_t n) -{ - ptrdiff_t i = 0; - - for(; i -struct is_same { static const bool value = false; }; - -template -struct is_same { static const bool value = true; }; - -template -__global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size, - T *result, prob_type *probs) -{ - int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; - for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { - if (is_same::value) { - double x = curand_uniform_double(&state[blockIdx.x]); - if (i < size) - result[i] = ScalarConvert::to(x <= probs[i]); - } else { - float x = curand_uniform(&state[blockIdx.x]); - if (i < size) - result[i] = ScalarConvert::to(x <= probs[i]); - } - } -} - // NOTE: curand_uniform is (0, 1] and we want [a, b) GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu index 4cbc6dd1a29999..74032faa842d53 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -382,62 +382,6 @@ void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, T #endif -#if defined(THC_REAL_IS_DOUBLE) -GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_double, x <= p) -#else -GENERATE_KERNEL1(generate_bernoulli, scalar_t, double p, float, curand_uniform, (ScalarConvert::to(x <= p))) -#endif - -void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); - ptrdiff_t size = THCTensor_(nElement)(state, self_); - if (size == 0) return; - 
THCGenerator* gen = THCRandom_getGenerator(state); - THCTensor *self = THCTensor_(newContiguous)(state, self_); - scalar_t *data = THCTensor_(data)(state, self); - - generate_bernoulli<<>>( - gen->state.gen_states, size, data, p); - - THCTensor_(freeCopyTo)(state, self, self_); -}; - -void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p) -{ -#if defined(THC_REAL_IS_FLOAT) - THCTensor_(bernoulli_FloatTensor)(state, self, p); -#elif defined(THC_REAL_IS_DOUBLE) - THCTensor_(bernoulli_DoubleTensor)(state, self, p); -#endif -} - -#define DEFINE_BERNOULLI_TENSOR(NAME, PROB_TYPE, PROB_DATA_TYPE) \ -void THCTensor_(NAME)(THCState* state, \ - THCTensor *self_, PROB_TYPE *probs_) \ -{ \ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, probs_)); \ - ptrdiff_t size = THCTensor_(nElement)(state, self_); \ - if (size == 0) return; \ - THCGenerator* gen = THCRandom_getGenerator(state); \ - THCTensor *self = THCTensor_(newContiguous)(state, self_); \ - PROB_TYPE *probs = PROB_TYPE##_newContiguous(state, probs_); \ - ptrdiff_t prob_size = PROB_TYPE##_nElement(state, probs); \ - scalar_t *result_data = THCTensor_(data)(state, self); \ - PROB_DATA_TYPE *probs_data = PROB_TYPE##_data(state, probs); \ - \ - THArgCheck(size == prob_size, 3, "inconsistent tensor size"); \ - \ - generate_bernoulli_tensor<<>>( \ - gen->state.gen_states, size, result_data, probs_data); \ - \ - PROB_TYPE##_free(state, probs); \ - THCTensor_(freeCopyTo)(state, self, self_); \ -} - -DEFINE_BERNOULLI_TENSOR(bernoulli_FloatTensor, THCudaTensor, float) -DEFINE_BERNOULLI_TENSOR(bernoulli_DoubleTensor, THCudaDoubleTensor, double) - #if defined(THC_REAL_IS_DOUBLE) GENERATE_KERNEL1(generate_geometric, double, double p, double, curand_uniform_double, ceil(log(x) / log(1-p))) #else diff --git a/aten/src/THC/generic/THCTensorRandom.h b/aten/src/THC/generic/THCTensorRandom.h index 1deb2db76ef9cc..4494dba37e7e31 100644 --- a/aten/src/THC/generic/THCTensorRandom.h +++ b/aten/src/THC/generic/THCTensorRandom.h @@ -21,10 +21,6 @@ THC_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor THC_API void THCTensor_(random)(struct THCState *state, THCTensor *self); THC_API void THCTensor_(clampedRandom)(struct THCState *state, THCTensor *self, int64_t min, int64_t max); THC_API void THCTensor_(cappedRandom)(struct THCState *state, THCTensor *self, int64_t max); -THC_API void THCTensor_(bernoulli)(struct THCState *state, THCTensor *self, double p); -THC_API void THCTensor_(bernoulli_FloatTensor)(struct THCState *state, THCTensor *self, THCudaTensor *p); -THC_API void THCTensor_(bernoulli_DoubleTensor)(struct THCState *state, THCTensor *self, THCudaDoubleTensor *p); -THC_API void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p); THC_API void THCTensor_(geometric)(struct THCState *state, THCTensor *self, double p); #endif diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 6f679dde9e180b..d7490686ab7570 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -394,12 +394,7 @@ endif() # ---[ Test binaries. 
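# NOTE: CPU and CUDA test binaries are registered in separate loops below, so
# that GPU tests can be created with cuda_add_executable (which routes .cu
# sources through nvcc) while CPU tests keep using plain add_executable.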
if (BUILD_TEST) - set(Caffe2_ALL_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}) - if (USE_CUDA) - list(APPEND Caffe2_ALL_TEST_SRCS ${Caffe2_GPU_TEST_SRCS}) - endif() - - foreach(test_src ${Caffe2_ALL_TEST_SRCS}) + foreach(test_src ${Caffe2_CPU_TEST_SRCS}) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) @@ -414,6 +409,23 @@ if (BUILD_TEST) endif() endforeach() + if (USE_CUDA) + foreach(test_src ${Caffe2_GPU_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + cuda_add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) + target_include_directories(${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) + target_compile_features(${test_name} PRIVATE cxx_range_for) + endif() + add_test(NAME ${test_name} COMMAND $) + if (INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + endif() + endforeach() + endif() + if(USE_ROCM) foreach(test_src ${Caffe2_HIP_TEST_SRCS}) set_source_files_properties(${test_src} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) diff --git a/caffe2/core/allocator.cc b/caffe2/core/allocator.cc index 10fa078cf4b820..bd0e99b20a0ca2 100644 --- a/caffe2/core/allocator.cc +++ b/caffe2/core/allocator.cc @@ -1,6 +1,5 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" -#include "caffe2/core/tensor.h" #include "caffe2/core/typeid.h" CAFFE2_DEFINE_bool( diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h index 2bbbce7df1e4ef..954446a45207f0 100644 --- a/caffe2/core/common_cudnn.h +++ b/caffe2/core/common_cudnn.h @@ -8,7 +8,6 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/types.h" -#include "caffe2/proto/caffe2_pb.h" #ifndef CAFFE2_USE_CUDNN #error("This Caffe2 install is not built with cudnn, so you should not include this file."); diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index b73a6aefa406a0..be71a41cc237c4 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -137,27 +137,27 @@ inline bool HasCudaGPU() { return NumCudaDevices() > 0; } /** * Gets the current GPU id. This is a simple wrapper around cudaGetDevice(). */ -int CaffeCudaGetDevice(); +CAFFE2_CUDA_API int CaffeCudaGetDevice(); /** * Gets the current GPU id. This is a simple wrapper around cudaGetDevice(). */ -void CaffeCudaSetDevice(const int id); +CAFFE2_CUDA_API void CaffeCudaSetDevice(const int id); /** * Gets the GPU id that the current pointer is located at. */ -int GetGPUIDForPointer(const void* ptr); +CAFFE2_CUDA_API int GetGPUIDForPointer(const void* ptr); /** * Gets the device property for the given device. This function is thread safe. */ -const cudaDeviceProp& GetDeviceProperty(const int device); +CAFFE2_CUDA_API const cudaDeviceProp& GetDeviceProperty(const int device); /** * Runs a device query function and prints out the results to LOG(INFO). */ -void DeviceQuery(const int deviceid); +CAFFE2_CUDA_API void DeviceQuery(const int deviceid); /** * Return a peer access pattern by returning a matrix (in the format of a @@ -166,22 +166,22 @@ void DeviceQuery(const int deviceid); * This function returns false if anything wrong happens during the query of * the GPU access pattern. 
*/ -bool GetCudaPeerAccessPattern(vector >* pattern); +CAFFE2_CUDA_API bool GetCudaPeerAccessPattern(vector >* pattern); /** * Return the availability of TensorCores for math */ -bool TensorCoreAvailable(); +CAFFE2_CUDA_API bool TensorCoreAvailable(); /** * Return a human readable cublas error string. */ -const char* cublasGetErrorString(cublasStatus_t error); +CAFFE2_CUDA_API const char* cublasGetErrorString(cublasStatus_t error); /** * Return a human readable curand error string. */ -const char* curandGetErrorString(curandStatus_t error); +CAFFE2_CUDA_API const char* curandGetErrorString(curandStatus_t error); // CUDA: various checks for different function calls. #define CUDA_ENFORCE(condition, ...) \ diff --git a/caffe2/core/context_base.cc b/caffe2/core/context_base.cc index 388a390d5aaa7f..b61b73cbad1cb5 100644 --- a/caffe2/core/context_base.cc +++ b/caffe2/core/context_base.cc @@ -1,22 +1,4 @@ #include "context_base.h" namespace caffe2 { - -// TODO: rename context.h -> context_cpu.h & context_base.h -> context.h -StaticContextMap& GetStaticContexts() { - static StaticContextMap static_contexts; - return static_contexts; -} - -void set_static_context(DeviceType t, BaseStaticContext* ptr) { - auto& static_contexts = GetStaticContexts(); - static_contexts[t] = ptr; -} - -BaseStaticContext* get_static_context(DeviceType t) { - auto* ptr = GetStaticContexts()[t]; - CAFFE_ENFORCE(ptr, "StaticContext is not registered yet."); - return ptr; -} - } // namespace caffe2 diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h index 6f3eca1b6fb51f..3a6dfad5b95cc3 100644 --- a/caffe2/core/context_base.h +++ b/caffe2/core/context_base.h @@ -5,26 +5,3 @@ #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/proto/caffe2_pb.h" - -namespace caffe2 { -using at::BaseContext; -using at::BaseStaticContext; - -using StaticContextMap = CaffeMap; -CAFFE2_API StaticContextMap& GetStaticContexts(); -CAFFE2_API void set_static_context(DeviceType t, BaseStaticContext* ptr); -CAFFE2_API BaseStaticContext* get_static_context(DeviceType t); - -template -struct StaticContextFunctionRegisterer { - explicit StaticContextFunctionRegisterer(BaseStaticContext* ptr) { - set_static_context(t, ptr); - } -}; - -#define REGISTER_STATIC_CONTEXT(t, f) \ - namespace { \ - static StaticContextFunctionRegisterer g_static_context_##d(f); \ - } - -} // namespace caffe2 diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 987c9ffe35299d..5fcdb98b100794 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -403,7 +403,6 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { } void ExtractDeviceOption(DeviceOption* device, const void* data) override { - CAFFE_ENFORCE(data, "data cannot be nullptr"); device->set_device_type(TypeToProto(GetDeviceType())); device->set_cuda_gpu_id(GetGPUIDForPointer(data)); } diff --git a/caffe2/core/event_cpu.h b/caffe2/core/event_cpu.h index 95143524635f15..dc99f99ab28a3a 100644 --- a/caffe2/core/event_cpu.h +++ b/caffe2/core/event_cpu.h @@ -1,7 +1,7 @@ #include "caffe2/core/event.h" -#include "caffe2/core/operator.h" #include +#include namespace caffe2 { diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc index 9b3c7f6fdec511..cd057444d31cf4 100644 --- a/caffe2/core/logging.cc +++ b/caffe2/core/logging.cc @@ -52,17 +52,6 @@ void ThrowEnforceNotMet( throw e; } -static std::function OperatorLogger = - [](const OperatorDef&) { return; }; - -void SetOperatorLogger(std::function tracer) { - 
OperatorLogger = tracer; -} - -std::function GetOperatorLogger() { - return OperatorLogger; -} - } // namespace caffe2 diff --git a/caffe2/core/logging.h b/caffe2/core/logging.h index 2392c4cbcfc446..67428df833dd54 100644 --- a/caffe2/core/logging.h +++ b/caffe2/core/logging.h @@ -9,7 +9,6 @@ #include #include "caffe2/core/flags.h" -#include "caffe2/proto/caffe2_pb.h" // CAFFE2_LOG_THRESHOLD is a compile time flag that would allow us to turn off // logging at compile time so no logging message below that level is produced @@ -106,9 +105,6 @@ size_t ReplaceAll(string& s, const char* from, const char* to); CAFFE2_API void SetStackTraceFetcher(std::function fetcher); -CAFFE2_API void SetOperatorLogger(std::function tracer); -std::function GetOperatorLogger(); - using EnforceNotMet = at::Error; #define CAFFE_ENFORCE(condition, ...) \ diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 38cc117141b1cf..51f61454643106 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -693,4 +693,15 @@ std::set GetRegisteredOperators() { return all_keys; } +static std::function OperatorLogger = + [](const OperatorDef&) { return; }; + +void SetOperatorLogger(std::function tracer) { + OperatorLogger = tracer; +} + +std::function GetOperatorLogger() { + return OperatorLogger; +} + } // namespace caffe2 diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index e75681ff3a9df6..9b428f9003d958 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -988,6 +988,10 @@ CAFFE2_API std::map> ValidateTenso // Get a set of registered operator names CAFFE2_API std::set GetRegisteredOperators(); +// Operator logging capabilities +CAFFE2_API void SetOperatorLogger(std::function tracer); +std::function GetOperatorLogger(); + } // namespace caffe2 #endif // CAFFE2_CORE_OPERATOR_H_ diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 32b9137b39cd72..e52d0335f9c94e 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -478,13 +478,13 @@ bool RunPlanOnWorkspace( Workspace* ws, const PlanDef& plan, ShouldContinue shouldContinue) { - LOG(INFO) << "Started executing plan."; + LOG(INFO) << "Started executing plan " << plan.name(); if (plan.execution_step_size() == 0) { LOG(WARNING) << "Nothing to run - did you define a correct plan?"; // We will do nothing, but the plan is still legal so we will return true. 
return true; } - LOG(INFO) << "Initializing networks."; + LOG(INFO) << "Initializing networks for plan " << plan.name(); NetDefMap net_defs; for (const NetDef& net_def : plan.network()) { @@ -508,11 +508,12 @@ bool RunPlanOnWorkspace( LOG(ERROR) << "Failed initializing step " << step.name(); return false; } - LOG(INFO) << "Step " << step.name() << " took " << step_timer.Seconds() - << " seconds."; + LOG(INFO) << "Step " << step.name() << " in plan " << plan.name() + << " took " << step_timer.Seconds() << " seconds."; } - LOG(INFO) << "Total plan took " << plan_timer.Seconds() << " seconds."; - LOG(INFO) << "Plan executed successfully."; + LOG(INFO) << "Total plan " << plan.name() << " took " << plan_timer.Seconds() + << " seconds."; + LOG(INFO) << "Plan " << plan.name() << " executed successfully."; return true; } } diff --git a/caffe2/core/qtensor.h b/caffe2/core/qtensor.h index 652f2987bea442..e4f373ab372295 100644 --- a/caffe2/core/qtensor.h +++ b/caffe2/core/qtensor.h @@ -14,7 +14,7 @@ namespace caffe2 { template -class CAFFE2_API QTensor { +class CAFFE2_EXPORT QTensor { public: QTensor() {} virtual ~QTensor() {} diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 3f42ed36b30954..5a2d2c821d2ba8 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -3,12 +3,12 @@ #include #include #include +#include #include "caffe2/core/allocator.h" #include "caffe2/core/common.h" #include "caffe2/core/flags.h" #include "caffe2/core/logging.h" -#include "caffe2/core/context_base.h" // A global boolean variable to control whether we free memory when a Tensor // is shrinked to a smaller size. As a result, a Tensor is always going to @@ -22,6 +22,9 @@ CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); namespace caffe2 { +// Defined by protobuf +class DeviceOption; + /** * A utility function to convert vector to vector. 
*/ @@ -152,17 +155,17 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if (size() > 0) { if (data_type_.copy()) { CAFFE_ENFORCE( - GetDeviceType() == CPU, + GetDeviceType() == ::at::DeviceType::CPU, "In CopyFrom source and dest tensors must both be CPU for meta copy"); CAFFE_ENFORCE( - src.GetDeviceType() == CPU, + src.GetDeviceType() == ::at::DeviceType::CPU, "In CopyFrom source and dest tensors must both be CPU for meta copy"); data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); } else { // We'll need to use a non-CPU context to perform the copy if // one of the context is not CPU since only non-CPU context // knows how to copy between CPU and that context - if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) { + if (src.GetDeviceType() != ::at::DeviceType::CPU || GetDeviceType() == ::at::DeviceType::CPU) { if (!context) { src.CreateContext()->CopyBytesToDevice( nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); diff --git a/caffe2/operators/hip/conv_op_miopen.cc b/caffe2/operators/hip/conv_op_miopen.cc index d29ff06fb30bda..5b9df03dc78953 100644 --- a/caffe2/operators/hip/conv_op_miopen.cc +++ b/caffe2/operators/hip/conv_op_miopen.cc @@ -259,7 +259,7 @@ bool MIOPENConvOp::DoRunWithType() { kernel_w())); MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, 1, C / group_, H, W)); + bottom_desc_, miopenTypeWrapper::type, N, C / group_, H, W)); MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( conv_desc_, @@ -286,9 +286,7 @@ bool MIOPENConvOp::DoRunWithType() { } int group_offset_X = C / group_ * H * W * D; - int batch_offset_X = group_offset_X * group_; int group_offset_Y = M / group_ * H_out * W_out * D_out; - int batch_offset_Y = group_offset_Y * group_; while (!bestAlgoFound_) { miopenConvAlgoPerf_t perf; @@ -325,27 +323,23 @@ bool MIOPENConvOp::DoRunWithType() { fwdAlgo_ = perf.fwd_algo; } - for (int b = 0; b < N; b++) { - for (int g = 0; g < group_; g++) { - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionForward( - state->miopen_handle(), - &alpha_, - bottom_desc_, - X.template data() + (b * batch_offset_X) + - (g * group_offset_X), - weight_desc_, - Weight.template data() + g * group_offset_filter, - conv_desc_, - fwdAlgo_, - &beta_, - top_desc_, - Y->template mutable_data() + (b * batch_offset_Y) + - (g * group_offset_Y), - fwdConvWs_, - fwdConvWsSize_)); - }); - } + for (int g = 0; g < group_; g++) { + miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { + MIOPEN_ENFORCE(miopenConvolutionForward( + state->miopen_handle(), + &alpha_, + bottom_desc_, + X.template data() + g * group_offset_X, + weight_desc_, + Weight.template data() + g * group_offset_filter, + conv_desc_, + fwdAlgo_, + &beta_, + top_desc_, + Y->template mutable_data() + g * group_offset_Y, + fwdConvWs_, + fwdConvWsSize_)); + }); } hipDeviceSynchronize(); @@ -539,6 +533,8 @@ bool MIOPENConvGradientOp::DoRunWithType() { "If you set group, the number of output channels should be divisible " "by group."); + bool doBwdDataComputation = (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))); + if (group_ > 1) { int group_offset_filter = Weight.size() / group_; MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( @@ -550,7 +546,7 @@ bool MIOPENConvGradientOp::DoRunWithType() { kernel_w())); MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, 1, C / group_, H, W)); + bottom_desc_, miopenTypeWrapper::type, N, C / group_, H, W)); 
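+ bottom_desc_, miopenTypeWrapper::type, N, C / group_, H, W));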
MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( conv_desc_, @@ -577,11 +573,9 @@ bool MIOPENConvGradientOp::DoRunWithType() { } int group_offset_X = C / group_ * H * W * D; - int batch_offset_X = group_offset_X * group_; int group_offset_Y = M / group_ * H_out * W_out * D_out; - int batch_offset_Y = group_offset_Y * group_; - while (!bestDataAlgoFound_) { + while ((!bestDataAlgoFound_) && doBwdDataComputation) { miopenConvAlgoPerf_t perf; MIOPEN_ENFORCE(miopenConvolutionBackwardDataGetWorkSpaceSize( @@ -652,46 +646,42 @@ bool MIOPENConvGradientOp::DoRunWithType() { bwdWeiAlgo_ = perf.bwd_weights_algo; } - for (int b = 0; b < N; b++) { - for (int g = 0; g < group_; g++) { + for (int g = 0; g < group_; g++) { + if (doBwdDataComputation) { miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenConvolutionBackwardData( state->miopen_handle(), &alpha_, top_desc_, - dY.template data() + (b * batch_offset_Y) + - (g * group_offset_Y), + dY.template data() + g * group_offset_Y, weight_desc_, Weight.template data() + g * group_offset_filter, conv_desc_, bwdDataAlgo_, &beta_, bottom_desc_, - dX->template mutable_data() + (b * batch_offset_X) + - (g * group_offset_X), + dX->template mutable_data() + g * group_offset_X, bwdDataWs_, bwdDataWsSize_)); }); - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardWeights( - state->miopen_handle(), - &alpha_, - top_desc_, - dY.template data() + (b * batch_offset_Y) + - (g * group_offset_Y), - bottom_desc_, - X.template data() + (b * batch_offset_X) + - (g * group_offset_X), - conv_desc_, - bwdWeiAlgo_, - &beta_, - weight_desc_, - dW->template mutable_data() + g * group_offset_filter, - bwdWeightWs_, - bwdWeightWsSize_)); - }); } + + miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { + MIOPEN_ENFORCE(miopenConvolutionBackwardWeights( + state->miopen_handle(), + &alpha_, + top_desc_, + dY.template data() + g * group_offset_Y, + bottom_desc_, + X.template data() + g * group_offset_X, + conv_desc_, + bwdWeiAlgo_, + &beta_, + weight_desc_, + dW->template mutable_data() + g * group_offset_filter, + bwdWeightWs_, + bwdWeightWsSize_)); + }); } // Synchronize the work across groups. 
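(Aside: the `doBwdDataComputation` flag used in the hunks above encodes when the gradient w.r.t. the input is actually requested. A standalone restatement of that predicate, with a hypothetical helper name for illustration only:)

// dX is the third output when a bias gradient is also produced (dW, db, dX),
// or the second output when the op is configured with no_bias (dW, dX).
inline bool needs_input_grad(int output_size, bool no_bias) {
  return output_size == 3 || (no_bias && output_size == 2);
}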
@@ -742,7 +732,7 @@ bool MIOPENConvGradientOp::DoRunWithType() { bias_desc_, miopenTypeWrapper::type, 1, M, 1, 1)); } - while (!bestDataAlgoFound_) { + while ((!bestDataAlgoFound_) && doBwdDataComputation) { miopenConvAlgoPerf_t perf; MIOPEN_ENFORCE(miopenConvolutionBackwardDataGetWorkSpaceSize( @@ -813,22 +803,24 @@ bool MIOPENConvGradientOp::DoRunWithType() { bwdWeiAlgo_ = perf.bwd_weights_algo; } - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardData( - state->miopen_handle(), - &alpha_, - top_desc_, - dY.template data(), - weight_desc_, - Weight.template data(), - conv_desc_, - bwdDataAlgo_, - &beta_, - bottom_desc_, - dX->template mutable_data(), - bwdDataWs_, - bwdDataWsSize_)); - }); + if (doBwdDataComputation) { + miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { + MIOPEN_ENFORCE(miopenConvolutionBackwardData( + state->miopen_handle(), + &alpha_, + top_desc_, + dY.template data(), + weight_desc_, + Weight.template data(), + conv_desc_, + bwdDataAlgo_, + &beta_, + bottom_desc_, + dX->template mutable_data(), + bwdDataWs_, + bwdDataWsSize_)); + }); + } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenConvolutionBackwardWeights( diff --git a/caffe2/operators/hip/pool_op_miopen.cc b/caffe2/operators/hip/pool_op_miopen.cc index 6f1c6d6c4295dc..3973a3fa609e6b 100644 --- a/caffe2/operators/hip/pool_op_miopen.cc +++ b/caffe2/operators/hip/pool_op_miopen.cc @@ -26,8 +26,6 @@ class MIOPENPoolOp : public ConvPoolOpBase { miopen_wrapper_(&context_), alpha_(OperatorBase::GetSingleArgument("alpha", 1.0)), beta_(OperatorBase::GetSingleArgument("beta", 0.0)), - do_backward_( - OperatorBase::GetSingleArgument("do_backward", true)), poolWs_(nullptr), poolWsSize_(0) @@ -95,12 +93,6 @@ class MIOPENPoolOp : public ConvPoolOpBase { MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - MIOPEN_ENFORCE(miopenPoolingGetWorkSpaceSize(top_desc_, &poolWsSize_)); - - if ((poolWsSize_ > 0) && (poolWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&poolWs_, poolWsSize_)); - } - const T* Xdata = X.template data(); T* Ydata = Y->template mutable_data(); MIOPEN_ENFORCE(miopenPoolingForward( @@ -112,9 +104,9 @@ class MIOPENPoolOp : public ConvPoolOpBase { &beta_, top_desc_, Ydata, - do_backward_, - poolWs_, - poolWsSize_)); + false, + nullptr, + 0)); return true; } @@ -139,7 +131,6 @@ class MIOPENPoolOp : public ConvPoolOpBase { miopenTensorDescriptor_t top_desc_; miopenPoolingDescriptor_t pooling_desc_; miopenPoolingMode_t mode_; - bool do_backward_; const float alpha_; const float beta_; }; @@ -208,6 +199,21 @@ class MIOPENPoolGradientOp : public ConvPoolOpBase { W_out = Y.ndim() > 3 ? Y.dim32(3) : 1; D_out = Y.ndim() > 4 ? 
Y.dim32(4) : 1; + switch (kernel_.size()) + { + case 1: + ConvPoolOpBase::ComputePads({H}); + break; + case 2: + ConvPoolOpBase::ComputePads({H, W}); + break; + case 3: + ConvPoolOpBase::ComputePads({H, W, D}); + break; + default: + CAFFE_THROW("Unsupported kernel size :", kernel_.size()); + } + CAFFE_ENFORCE(kernel_.size() == 2, "MIOpen supports only 2D pooling"); MIOPEN_ENFORCE(miopenSet2dPoolingDescriptor( pooling_desc_, @@ -225,23 +231,24 @@ class MIOPENPoolGradientOp : public ConvPoolOpBase { MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - MIOPEN_ENFORCE(miopenPoolingGetWorkSpaceSize(top_desc_, &poolWsSize_)); - - if ((poolWsSize_ > 0) && (poolWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&poolWs_, poolWsSize_)); - } - - if (bwdPoolScratch_ == nullptr) { - HIP_CHECK(hipMalloc(&bwdPoolScratch_, Y.size() * sizeof(float))); - } - // Carry out the pooling computation. const T* Xdata = X.template data(); const T* Ydata = Y.template data(); const T* dYdata = dY.template data(); T* dXdata = dX->template mutable_data(); - MIOPEN_ENFORCE(miopenPoolingForward( + if (mode_ == miopenPoolingMax) { + MIOPEN_ENFORCE(miopenPoolingGetWorkSpaceSize(top_desc_, &poolWsSize_)); + + if ((poolWsSize_ > 0) && (poolWs_ == nullptr)) { + HIP_CHECK(hipMalloc(&poolWs_, poolWsSize_)); + } + + if (bwdPoolScratch_ == nullptr) { + HIP_CHECK(hipMalloc(&bwdPoolScratch_, Y.size() * sizeof(float))); + } + + MIOPEN_ENFORCE(miopenPoolingForward( miopen_wrapper_.inline_miopen_handle(), pooling_desc_, &alpha_, @@ -253,6 +260,7 @@ class MIOPENPoolGradientOp : public ConvPoolOpBase { true, poolWs_, poolWsSize_)); + } MIOPEN_ENFORCE(miopenPoolingBackward( miopen_wrapper_.inline_miopen_handle(), diff --git a/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc b/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc index 4ec3ed1021719d..0a8b1d9b1cbfa6 100644 --- a/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc +++ b/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc @@ -115,6 +115,7 @@ bool MIOpenSpatialBNOp::DoRunWithType() { auto& X = Input(INPUT); auto& scale = Input(SCALE); auto& bias = Input(BIAS); + auto* Y = Output(OUTPUT); // Only 2D BatchNorm is supported in MIopen for now // @petrex will follow up on adding 1D and 3D support @@ -131,12 +132,14 @@ bool MIOpenSpatialBNOp::DoRunWithType() { CAFFE_ENFORCE_EQ(bias.ndim(), 1); CAFFE_ENFORCE_EQ(scale.dim32(0), C); CAFFE_ENFORCE_EQ(bias.dim32(0), C); + + Y->ResizeLike(X); + T* Y_data = Y->template mutable_data(); + // See if we need to reshape. if (N > 0 && X.dims() != miopen_input_dims_) { VLOG(1) << "Setting descriptors."; miopen_input_dims_ = X.dims(); - vector dims = {N, C, H, W, D}; - vector strides = {C * H * W * D, H * W * D, W * D, D, 1}; MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( data_desc_, miopenTypeWrapper::type, N, C, H, W)); @@ -154,9 +157,6 @@ bool MIOpenSpatialBNOp::DoRunWithType() { CAFFE_ENFORCE_EQ(est_mean.dim32(0), C); CAFFE_ENFORCE_EQ(est_var.dim32(0), C); - auto* Y = Output(OUTPUT); - Y->ResizeLike(X); - T* Y_data = Y->template mutable_data(); if (N == 0) { return true; } @@ -178,9 +178,6 @@ bool MIOpenSpatialBNOp::DoRunWithType() { epsilon_)); } else { // Run training mode. - auto* Y = Output(OUTPUT); - Y->ResizeLike(X); - T* Y_data = Y->template mutable_data(); // obtain running mean and running inv var, and see if we need to // initialize them. 
auto* running_mean = Output(RUNNING_MEAN); @@ -276,8 +273,7 @@ bool MIOpenSpatialBNGradientOp::DoRunWithType() { CAFFE_ENFORCE_EQ(scale.dim32(0), C); // See if we need to reshape. if (N > 0 && X.dims() != miopen_input_dims_) { - vector dims = {N, C, H, W, D}; - vector strides = {C * H * W * D, H * W * D, W * D, D, 1}; + miopen_input_dims_ = X.dims(); MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( data_desc_, miopenTypeWrapper::type, N, C, H, W)); diff --git a/caffe2/predictor/predictor_config.cc b/caffe2/predictor/predictor_config.cc index aabff0daffcd73..d49632a069bba7 100644 --- a/caffe2/predictor/predictor_config.cc +++ b/caffe2/predictor/predictor_config.cc @@ -1,8 +1,10 @@ #include "predictor_config.h" #include "caffe2/core/init.h" +#include "caffe2/utils/proto_utils.h" #ifdef CAFFE2_OPTIMIZER #include "caffe2/opt/optimizer.h" #endif + namespace caffe2 { namespace { @@ -69,7 +71,8 @@ PredictorConfig makePredictorConfig( #if CAFFE2_MOBILE GlobalInit(); #endif - if (optimization) { + if (optimization && + !ArgumentHelper::HasArgument(*config.predict_net, "disable_nomnigraph")) { #ifdef CAFFE2_OPTIMIZER try { *config.predict_net = diff --git a/caffe2/proto/torch.proto b/caffe2/proto/torch.proto index 9e626d8d845260..ddf9e5a2066116 100644 --- a/caffe2/proto/torch.proto +++ b/caffe2/proto/torch.proto @@ -101,7 +101,7 @@ message AttributeProto { // The name field MUST be present for this version of the IR. optional string name = 1; // namespace Attribute - + // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. // In this case, this AttributeProto does not contain data, and it's a reference of attribute // in parent scope. @@ -170,6 +170,7 @@ message NodeProto { repeated AttributeProto attribute = 5; // A human-readable documentation for this node. Markdown is allowed. + // Equivalent to string debug_info optional string doc_string = 6; // Additional annotations, attributes are defined in Schema @@ -177,7 +178,6 @@ message NodeProto { // string engine // string list control_input // int64 is_gradient_op - // string debug_info repeated AttributeProto annotations = 8; // Besides the node type, PyTorch also serializes the ATen function signature diff --git a/caffe2/proto/torch_pb.h b/caffe2/proto/torch_pb.h new file mode 100644 index 00000000000000..14a058e9652068 --- /dev/null +++ b/caffe2/proto/torch_pb.h @@ -0,0 +1,8 @@ +#ifndef CAFFE2_PROTO_TORCH_PB_H_ +#define CAFFE2_PROTO_TORCH_PB_H_ + +#include +#include +#include + +#endif // CAFFE2_PROTO_TORCH_PB_H_ diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py new file mode 100644 index 00000000000000..50eaf220c7f721 --- /dev/null +++ b/caffe2/python/convert.py @@ -0,0 +1,66 @@ +## @package convert +# Module caffe2.python.convert +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.proto import caffe2_pb2, torch_pb2 + +import caffe2.python._import_c_extension as C + + +def ArgumentToAttributeProto(arg): + serialized_arg = None + if hasattr(arg, 'SerializeToString') and callable(arg.SerializeToString): + serialized_arg = arg.SerializeToString() + elif isinstance(arg, bytes): + serialized_arg = arg + else: + raise ValueError('No SerializeToString method is detected, ' + 'and arg is not bytes.\ntype is {}'.format(type(arg))) + attr = torch_pb2.AttributeProto() + attr.ParseFromString(C.argument_to_attribute_proto(serialized_arg)) + return attr + + +def AttributeProtoToArgument(attr): + serialized_attr = None + if hasattr(attr, 'SerializeToString') and callable(attr.SerializeToString): + serialized_attr = attr.SerializeToString() + elif isinstance(attr, bytes): + serialized_attr = attr + else: + raise ValueError('No SerializeToString method is detected, ' + 'and attr is not bytes.\ntype is {}'.format(type(attr))) + arg = caffe2_pb2.Argument() + arg.ParseFromString(C.attribute_proto_to_argument(serialized_attr)) + return arg + + +def OperatorDefToNodeProto(op_def): + serialized_op_def = None + if hasattr(op_def, 'SerializeToString') and callable(op_def.SerializeToString): + serialized_op_def = op_def.SerializeToString() + elif isinstance(op_def, bytes): + serialized_op_def = op_def + else: + raise ValueError('No SerializeToString method is detected, ' + 'and op_def is not bytes.\ntype is {}'.format(type(op_def))) + node = torch_pb2.NodeProto() + node.ParseFromString(C.operator_def_to_node_proto(serialized_op_def)) + return node + + +def NodeProtoToOperatorDef(node_proto): + serialized_node_proto = None + if hasattr(node_proto, 'SerializeToString') and callable(node_proto.SerializeToString): + serialized_node_proto = node_proto.SerializeToString() + elif isinstance(node_proto, bytes): + serialized_node_proto = node_proto + else: + raise ValueError('No SerializeToString method is detected, ' + 'and node_proto is not bytes.\ntype is {}'.format(type(node_proto))) + op_def = caffe2_pb2.OperatorDef() + op_def.ParseFromString(C.node_proto_to_operator_def(serialized_node_proto)) + return op_def diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py new file mode 100644 index 00000000000000..c8de7e9750680f --- /dev/null +++ b/caffe2/python/convert_test.py @@ -0,0 +1,250 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import convert, workspace +from caffe2.proto import caffe2_pb2, torch_pb2 +import unittest +import numpy as np + +class TestOperator(unittest.TestCase): + def setUp(self): + workspace.ResetWorkspace() + + def testArgument2AttributeProto(self): + arg_f = caffe2_pb2.Argument() + arg_f.name = "TestArgF" + arg_f.f = 10.0 + attr_f = convert.ArgumentToAttributeProto(arg_f) + self.assertEqual(attr_f.name, arg_f.name) + self.assertEqual(attr_f.f, arg_f.f) + + arg_i = caffe2_pb2.Argument() + arg_i.name = "TestArgI" + arg_i.i = 100 + attr_i = convert.ArgumentToAttributeProto(arg_i) + self.assertEqual(attr_i.name, arg_i.name) + self.assertEqual(attr_i.i, arg_i.i) + + arg_s = caffe2_pb2.Argument() + arg_s.name = "TestArgS" + arg_s.s = "TestS".encode("utf-8") + attr_s = convert.ArgumentToAttributeProto(arg_s) + self.assertEqual(attr_s.name, arg_s.name) + self.assertEqual(attr_s.s, arg_s.s) + + # TODO: test net arg + + arg_floats = caffe2_pb2.Argument() + arg_floats.name = "TestArgFloats" + arg_floats.floats.extend([10.0, 11.0, 12.0]) + attr_floats = convert.ArgumentToAttributeProto(arg_floats) + self.assertEqual(attr_floats.name, arg_floats.name) + self.assertEqual(attr_floats.floats, arg_floats.floats) + + arg_ints = caffe2_pb2.Argument() + arg_ints.name = "TestArgInts" + arg_ints.ints.extend([100, 101, 102]) + attr_ints = convert.ArgumentToAttributeProto(arg_ints) + self.assertEqual(attr_ints.name, arg_ints.name) +
self.assertEqual(attr_ints.ints, arg_ints.ints) + + arg_strings = caffe2_pb2.Argument() + arg_strings.name = "TestArgStrings" + arg_strings.strings.extend([ + "TestStrings1".encode("utf-8"), + "TestStrings2".encode("utf-8"), + ]) + attr_strings = convert.ArgumentToAttributeProto(arg_strings) + self.assertEqual(attr_strings.name, arg_strings.name) + self.assertEqual(attr_strings.strings, arg_strings.strings) + + # TODO: test nets arg + + def testAttributeProto2Argument(self): + attr_f = torch_pb2.AttributeProto() + attr_f.type = torch_pb2.AttributeProto.FLOAT + attr_f.name = "TestAttrF" + attr_f.f = 10.0 + arg_f = convert.AttributeProtoToArgument(attr_f) + self.assertEqual(arg_f.name, attr_f.name) + self.assertEqual(arg_f.f, attr_f.f) + + attr_i = torch_pb2.AttributeProto() + attr_i.type = torch_pb2.AttributeProto.INT + attr_i.name = "TestArgI" + attr_i.i = 100 + arg_i = convert.AttributeProtoToArgument(attr_i) + self.assertEqual(arg_i.name, attr_i.name) + self.assertEqual(arg_i.i, attr_i.i) + + attr_s = torch_pb2.AttributeProto() + attr_s.type = torch_pb2.AttributeProto.STRING + attr_s.name = "TestArgS" + attr_s.s = "TestS".encode("utf-8") + arg_s = convert.AttributeProtoToArgument(attr_s) + self.assertEqual(arg_s.name, attr_s.name) + self.assertEqual(arg_s.s, attr_s.s) + + # TODO: test graph attribute + + attr_floats = torch_pb2.AttributeProto() + attr_floats.type = torch_pb2.AttributeProto.FLOATS + attr_floats.name = "TestAttrFloats" + attr_floats.floats.extend([10.0, 11.0, 12.0]) + arg_floats = convert.AttributeProtoToArgument(attr_floats) + self.assertEqual(arg_floats.name, attr_floats.name) + self.assertEqual(arg_floats.floats, attr_floats.floats) + + attr_ints = torch_pb2.AttributeProto() + attr_ints.type = torch_pb2.AttributeProto.INTS + attr_ints.name = "TestArgInts" + attr_ints.ints.extend([100, 101, 102]) + arg_ints = convert.AttributeProtoToArgument(attr_ints) + self.assertEqual(arg_ints.name, attr_ints.name) + self.assertEqual(arg_ints.ints, attr_ints.ints) + + attr_strings = torch_pb2.AttributeProto() + attr_strings.type = torch_pb2.AttributeProto.STRINGS + attr_strings.name = "TestArgStrings" + attr_strings.strings.extend([ + "TestStrings1".encode("utf-8"), + "TestStrings2".encode("utf-8"), + ]) + arg_strings = convert.AttributeProtoToArgument(attr_strings) + self.assertEqual(arg_strings.name, attr_strings.name) + self.assertEqual(arg_strings.strings, attr_strings.strings) + + # TODO: test graphs attribute + + + def testOperatorDef2NodeProto(self): + op_def = caffe2_pb2.OperatorDef() + op_def.input.extend(["A", "B", "C"]) + op_def.output.extend(["X", "Y"]) + op_def.name = "TestOpName" + op_def.type = "TestOp" + arg1 = caffe2_pb2.Argument() + arg1.name = "TestArg1" + arg1.i = 1 + arg2 = caffe2_pb2.Argument() + arg2.name = "TestArg2" + arg2.s = "TestInfo".encode("utf-8") + op_def.arg.extend([arg1, arg2]) + op_def.device_option.CopyFrom(caffe2_pb2.DeviceOption()) + op_def.engine = "TestEngine".encode("utf-8") + op_def.control_input.extend(["input1", "input2"]) + op_def.is_gradient_op = True + op_def.debug_info = "TestDebugInfo" + + node = convert.OperatorDefToNodeProto(op_def) + + self.assertEqual(node.input, op_def.input) + self.assertEqual(node.output, op_def.output) + self.assertEqual(node.name, op_def.name) + self.assertEqual(node.op_type, op_def.type) + self.assertEqual(node.attribute[0].name, op_def.arg[0].name) + self.assertEqual(node.attribute[1].name, op_def.arg[1].name) + self.assertEqual(node.device_option, op_def.device_option) + node_engine = [a.s.decode("utf-8")
for a in node.annotations if a.name == "engine"][0] + self.assertEqual(node_engine, op_def.engine) + node_control_input = [a.strings for a in node.annotations if a.name == "control_input"][0] + self.assertEqual(len(node_control_input), len(op_def.control_input)) + for x, y in zip(node_control_input, op_def.control_input): + self.assertEqual(x.decode("utf-8"), y) + self.assertEqual(node.doc_string, op_def.debug_info) + node_is_gradient_op = [a.i for a in node.annotations if a.name == "is_gradient_op"][0] + self.assertEqual(node_is_gradient_op, int(op_def.is_gradient_op)) + + def testNodeProto2OperatorDef(self): + node = torch_pb2.NodeProto() + node.input.extend(["A", "B", "C"]) + node.output.extend(["X", "Y"]) + node.name = "TestOpName" + node.op_type = "TestOp" + attr1 = torch_pb2.AttributeProto() + attr1.name = "TestAttr1" + attr1.type = torch_pb2.AttributeProto.STRING + attr1.s = "TestInfo".encode("utf-8") + attr2 = torch_pb2.AttributeProto() + attr2.name = "TestAttr2" + attr2.type = torch_pb2.AttributeProto.INT + attr2.i = 10 + node.attribute.extend([attr1, attr2]) + node.device_option.CopyFrom(caffe2_pb2.DeviceOption()) + anno1 = torch_pb2.AttributeProto() + anno1.name = "engine" + anno1.type = torch_pb2.AttributeProto.STRING + anno1.s = "TestEngine".encode("utf-8") + anno2 = torch_pb2.AttributeProto() + anno2.name = "control_input" + anno2.type = torch_pb2.AttributeProto.STRINGS + anno2.strings.extend(["input1".encode("utf-8"), "input2".encode("utf-8")]) + anno3 = torch_pb2.AttributeProto() + anno3.name = "is_gradient_op" + anno3.type = torch_pb2.AttributeProto.INT + anno3.i = 1 + node.annotations.extend([anno1, anno2, anno3]) + node.doc_string = "TestDocString".encode("utf-8") + + op_def = convert.NodeProtoToOperatorDef(node) + + self.assertEqual(op_def.input, node.input) + self.assertEqual(op_def.output, node.output) + self.assertEqual(op_def.name, node.name) + self.assertEqual(op_def.type, node.op_type) + self.assertEqual(op_def.arg[0].name, node.attribute[0].name) + self.assertEqual(op_def.arg[1].name, node.attribute[1].name) + self.assertEqual(op_def.device_option, node.device_option) + node_engine = [a.s for a in node.annotations if a.name == "engine"][0] + self.assertEqual(op_def.engine, node_engine.decode("utf-8")) + node_control_input = [a.strings for a in node.annotations if a.name == "control_input"][0] + for x, y in zip(op_def.control_input, node_control_input): + self.assertEqual(x, y.decode("utf-8")) + self.assertEqual(op_def.debug_info, node.doc_string) + node_is_gradient_op = [a.i for a in node.annotations if a.name == "is_gradient_op"][0] + self.assertEqual(int(op_def.is_gradient_op), node_is_gradient_op) + + def testEnd2End(self): + op_def = caffe2_pb2.OperatorDef() + op_def.type = "Add" + op_def.input.extend(["input1"]) + op_def.input.extend(["input2"]) + op_def.output.extend(["output1"]) + node = convert.OperatorDefToNodeProto(op_def) + + input1 = np.random.randn(1, 3, 1, 5).astype(np.float32) + input2 = np.random.randn(2, 1, 4, 1).astype(np.float32) + ref_output1 = input1 + input2 + workspace.FeedBlob("input1", input1) + workspace.FeedBlob("input2", input2) + self.assertEqual(workspace.RunOperatorOnce(node.SerializeToString(), legacy_proto=False), True) + + self.assertEqual(workspace.HasBlob("output1"), True) + fetched_back = workspace.FetchBlob("output1") + np.testing.assert_array_equal(fetched_back, ref_output1) + + def testRoundTrip(self): + op_def = caffe2_pb2.OperatorDef() + op_def.type = "Add" + op_def.input.extend(["input1"]) + 
op_def.input.extend(["input2"]) + op_def.output.extend(["output1"]) + node = convert.OperatorDefToNodeProto(op_def) + new_op_def = convert.NodeProtoToOperatorDef(node) + + input1 = np.random.randn(1, 3, 1, 5).astype(np.float32) + input2 = np.random.randn(2, 1, 4, 1).astype(np.float32) + ref_output1 = input1 + input2 + workspace.FeedBlob("input1", input1) + workspace.FeedBlob("input2", input2) + self.assertEqual(workspace.RunOperatorOnce(new_op_def.SerializeToString()), True) + + self.assertEqual(workspace.HasBlob("output1"), True) + fetched_back = workspace.FetchBlob("output1") + np.testing.assert_array_equal(fetched_back, ref_output1) + + +if __name__ == '__main__': + unittest.main() diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py index 91a08df74f165e..ebf3c3b8cd44a5 100644 --- a/caffe2/python/data_parallel_model_test.py +++ b/caffe2/python/data_parallel_model_test.py @@ -448,10 +448,16 @@ def add_model_ops(model, loss_scale): self.assertEqual(transform.call_count, 1) @given(seed=st.integers(0, 65535), batch_size=st.integers(1, 20)) - def test_multi_device_spatial_bn_cpu(self, seed, batch_size): - self._spatial_bn_check("cpu", seed, batch_size) + def test_multi_device_bn_op_level(self, seed, batch_size): + self._bn_check_op_level("cpu", seed, batch_size) - def _spatial_bn_check(self, device_type, seed, batch_size): + def _bn_check_op_level(self, device_type, seed, batch_size): + ''' + Test multi device batch normalization at the operation level. This is + done by checking the outputs of batch normalization and its gradient + operator. We compare values produced with our manually calculated + batch normalization values and gradients. + ''' devices = [0, 1, 2] epsilon = 1e-3 tolerance = 1e-3 @@ -465,7 +471,7 @@ def _test_forward_pass(x, devices, device_type, scale, bias, epsilon): x_hat = (x_i - mean) / (np.sqrt(var + epsilon)) expected_out = scale * x_hat + bias spatial_out = workspace.FetchBlob( - "{}_{}/sp_out".format(device_type, device)) + "{}_{}/bn_out".format(device_type, device)) rel_error = np.linalg.norm(spatial_out - expected_out) \ / np.linalg.norm(expected_out) self.assertTrue(rel_error < 0.005) @@ -476,22 +482,22 @@ def _test_backward_pass(x, devices, device_type, scale, tolerance): dGamma_arr = [] num_devices = len(devices) mean = np.array(workspace.FetchBlob( - "{}_0/sp_out_sm".format(device_type)), dtype=np.float32) + "{}_0/bn_out_sm".format(device_type)), dtype=np.float32) inv_var = np.array(workspace.FetchBlob( - "{}_0/sp_out_siv".format(device_type)), dtype=np.float32) + "{}_0/bn_out_siv".format(device_type)), dtype=np.float32) # dBias # Sum dBias values over all devices to find the average gradient for device in devices: dY_blob = workspace.FetchBlob( - "{}_{}/sp_out_grad".format(device_type, device)) + "{}_{}/bn_out_grad".format(device_type, device)) dY = np.array(dY_blob, dtype=np.float32) dY_arr.append(dY) dBias_arr.append(np.array(np.sum(dY, axis=0), dtype=np.float32)) dBias = np.sum(dBias_arr, dtype=np.float32) dBias_avg = dBias / num_devices for device in devices: - dBiasActual = np.sum(workspace.FetchBlob("{}_{}/sp_out_b_grad" + dBiasActual = np.sum(workspace.FetchBlob("{}_{}/bn_out_b_grad" .format(device_type, device)), dtype=np.float32) self.assertTrue(np.isclose([dBiasActual], [dBias], atol=tolerance)) @@ -505,7 +511,7 @@ def _test_backward_pass(x, devices, device_type, scale, tolerance): dGamma_avg = dGamma / num_devices for device in devices: dGammaActual = workspace.FetchBlob( - 
"{}_{}/sp_out_s_grad".format(device_type, device)) + "{}_{}/bn_out_s_grad".format(device_type, device)) self.assertTrue(np.isclose([dGamma], [dGammaActual], atol=tolerance)) # dX @@ -524,8 +530,8 @@ def add_input_ops(model): def add_model_ops(model, loss_scale): model.Tanh("data", "tanh") - model.SpatialBN("tanh", "sp_out", 1, epsilon=epsilon, is_test=False) - model.Sqr("sp_out", "sqr") + model.SpatialBN("tanh", "bn_out", 1, epsilon=epsilon, is_test=False) + model.Sqr("bn_out", "sqr") loss = model.SumElements("sqr", "loss") return [loss] @@ -550,8 +556,8 @@ def add_optimizer(model): ) workspace.RunNetOnce(model.param_init_net) - scale = workspace.FetchBlob("{}_0/sp_out_s".format(device_type)) - bias = workspace.FetchBlob("{}_0/sp_out_b".format(device_type)) + scale = workspace.FetchBlob("{}_0/bn_out_s".format(device_type)) + bias = workspace.FetchBlob("{}_0/bn_out_b".format(device_type)) workspace.RunNetOnce(model.net) x = [] @@ -563,6 +569,149 @@ def add_optimizer(model): _test_forward_pass(x, devices, device_type, scale, bias, epsilon) _test_backward_pass(x, devices, device_type, scale, tolerance) + @given(seed=st.integers(0, 65535), batch_size=st.integers(1, 20)) + def test_multi_device_bn_net_lvl_cpu(self, seed, batch_size): + if batch_size % 2 == 1: + batch_size += 1 + self._test_multi_device_bn_net_lvl("cpu", seed, batch_size) + + def _test_multi_device_bn_net_lvl(self, device_type, seed, batch_size): + ''' + Test multi device batch normalization at the net level. This is done + by verifying that the final batch normalization outputs and the + gradient outputs from multiple devices are the same as those produced + from a single device + ''' + + # Verify that the gradients calculated over multiple devices are the + # same as the gradients calculated over one device. 
These values should + # be equivalent because combine_spatial_bn sums values over all devices + def _verify_bn_outputs( + devices, + device_type, + tolerance, + single_device_bn_out, + two_device_bn_out_vals, + single_device_grads, + two_device_grads, + ): + two_device_bn_out = np.concatenate(two_device_bn_out_vals) + self.assertTrue(np.isclose( + [single_device_bn_out], [two_device_bn_out], atol=tolerance).all()) + + # Scale and bias gradients should be the same across devices + gradient_names = ["bn_out_s_grad", "bn_out_b_grad"] + for name in gradient_names: + expected_grad = single_device_grads[name] + for device in devices: + actual_grad = two_device_grads[device][name] + self.assertTrue( + np.isclose([actual_grad], [expected_grad], atol=tolerance)) + + # Expected tanh_grad should be the combined tanh_grad vectors + # across the devices + first_grad = two_device_grads[0]["tanh_grad"] + second_grad = two_device_grads[1]["tanh_grad"] + actual_grad = np.concatenate([first_grad, second_grad]) + expected_grad = single_device_grads["tanh_grad"] + rel_error = np.linalg.norm(actual_grad - expected_grad) \ + / np.linalg.norm(expected_grad) + self.assertTrue(rel_error < 1e-3) + + def _create_model(multiple_devices): + def add_input_ops_no_combine(model): + workspace.FeedBlob("{}_0/data".format(device_type), data) + + def add_input_ops_combine(model): + half = int(batch_size / 2) + workspace.FeedBlob("{}_0/data".format(device_type), data[:half]) + workspace.FeedBlob("{}_1/data".format(device_type), data[half:]) + + def add_model_ops(model, loss_scale): + model.Tanh("data", "tanh") + model.SpatialBN("tanh", "bn_out", 1, epsilon=epsilon, is_test=False) + model.Sqr("bn_out", "sqr") + loss = model.SumElements("sqr", "loss") + return [loss] + + def add_optimizer(model): + return optimizer.build_sgd(model, 0.1) + + if multiple_devices: + input_fun = add_input_ops_combine + devices = [0, 1] + combine_spatial_bn = True + else: + input_fun = add_input_ops_no_combine + devices = [0] + combine_spatial_bn = False + model = cnn.CNNModelHelper( + order="NCHW", + name="test" + ) + data_parallel_model.Parallelize( + model, + input_builder_fun=input_fun, + forward_pass_builder_fun=add_model_ops, + optimizer_builder_fun=add_optimizer, + devices=devices, + cpu_device=device_type == "cpu", + shared_model=False, + combine_spatial_bn=combine_spatial_bn, + ) + return model + + devices = [0, 1] + epsilon = 1e-3 + tolerance = 1e-3 + # We are generating random data + np.random.seed(seed) + data = np.random.rand(batch_size, 1, 1, 1).astype(np.float32) + data = np.reshape(data, (batch_size, 1, 1, 1)) + + # Get values calculated without combine_spatial_bn + workspace.ResetWorkspace() + model_no_combine = _create_model(multiple_devices=False) + workspace.RunNetOnce(model_no_combine.param_init_net) + workspace.RunNetOnce(model_no_combine.net) + single_device_bn_out = workspace.FetchBlob("{}_0/bn_out".format(device_type)) + single_device_grads = {} + single_device_grads["bn_out_s_grad"] = workspace.FetchBlob( + "{}_0/bn_out_s_grad".format(device_type)) + single_device_grads["bn_out_b_grad"] = workspace.FetchBlob( + "{}_0/bn_out_b_grad".format(device_type)) + single_device_grads["tanh_grad"] = workspace.FetchBlob( + "{}_0/tanh_grad".format(device_type)) + + # Get values calculated over multiple devices with combine_spatial_bn true + workspace.ResetWorkspace() + model_combine = _create_model(multiple_devices=True) + workspace.RunNetOnce(model_combine.param_init_net) + workspace.RunNetOnce(model_combine.net) + 
two_device_bn_out_vals = [] + two_device_grads = {} + for device in devices: + bn_out_blob = "{}_{}/bn_out".format(device_type, device) + two_device_bn_out_vals.append(workspace.FetchBlob(bn_out_blob)) + two_device_grads[device] = {} + two_device_grads[device]["bn_out_s_grad"] = workspace.FetchBlob( + "{}_{}/bn_out_s_grad".format(device_type, device)) + two_device_grads[device]["bn_out_b_grad"] = workspace.FetchBlob( + "{}_{}/bn_out_b_grad".format(device_type, device)) + two_device_grads[device]["tanh_grad"] = workspace.FetchBlob( + "{}_{}/tanh_grad".format(device_type, device)) + + # Check to see if the combined values are equivalent + _verify_bn_outputs( + devices, + device_type, + tolerance, + single_device_bn_out, + two_device_bn_out_vals, + single_device_grads, + two_device_grads + ) + class RecurrentNetworkParallelTest(TestCase): def run_model(self, devices, gpu): diff --git a/caffe2/python/operator_test/rank_loss_operator_test.py b/caffe2/python/operator_test/rank_loss_operator_test.py index 97e352e93f1807..8e0ac8b0ffac00 100644 --- a/caffe2/python/operator_test/rank_loss_operator_test.py +++ b/caffe2/python/operator_test/rank_loss_operator_test.py @@ -2,14 +2,16 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestPairWiseLossOps(hu.HypothesisTestCase): +class TestPairWiseLossOps(serial.SerializedTestCase): @given(X=hu.arrays(dims=[2, 1], elements=st.floats(min_value=0.0, max_value=10.0)), label=hu.arrays(dims=[2, 1], @@ -98,7 +100,7 @@ def test_pair_wise_loss_gradient(self, X, label, dY, gc, dc): (up_output_pred[0] - down_output_pred[0]) / delta), rtol=1e-2, atol=1e-2) - @given(n=st.integers(0, 10), k=st.integers(1, 5), **hu.gcs_cpu_only) + @serial.given(n=st.integers(0, 10), k=st.integers(1, 5), **hu.gcs_cpu_only) def test_pair_wise_loss_batch(self, n, k, gc, dc): lengths = np.random.randint(k, size=n).astype(np.int32) + 1 X = np.random.rand(sum(lengths)).astype(np.float32) diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index e37bd9092e8f8f..26c323866df80b 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -7,13 +7,14 @@ from caffe2.python.model_helper import ModelHelper from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -import unittest import os +import unittest -class RecurrentNetworkTest(hu.HypothesisTestCase): +class RecurrentNetworkTest(serial.SerializedTestCase): @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(T=st.integers(1, 4), n=st.integers(1, 5), @@ -34,7 +35,7 @@ def test_sum_mul(self, T, n, d): self.simple_rnn(T, n, d, model, step, input_t, output_t, output_t_prev, input_blob, initial_input_blob) - @given(T=st.integers(1, 4), + @serial.given(T=st.integers(1, 4), n=st.integers(1, 5), d=st.integers(1, 5)) def test_mul(self, T, n, d): diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index e5f3d13a51bc0e..a4ff61218cdb54 100644 --- 
a/caffe2/python/operator_test/reduce_ops_test.py +++ b/caffe2/python/operator_test/reduce_ops_test.py @@ -7,12 +7,13 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np import itertools as it -class TestReduceOps(hu.HypothesisTestCase): +class TestReduceOps(serial.SerializedTestCase): def run_reduce_op_test_impl( self, op_name, X, axes, keepdims, ref_func, gc, dc): if axes is None: @@ -54,8 +55,9 @@ def run_reduce_op_test( self.run_reduce_op_test_impl( op_name, X, axes, keepdims, ref_func, gc, dc) - @given(X=hu.tensor(max_dim=3, dtype=np.float32), keepdims=st.booleans(), - num_axes=st.integers(1, 3), **hu.gcs) + @serial.given( + X=hu.tensor(max_dim=3, dtype=np.float32), keepdims=st.booleans(), + num_axes=st.integers(1, 3), **hu.gcs) def test_reduce_min(self, X, keepdims, num_axes, gc, dc): X_dims = X.shape X_size = X.size @@ -65,8 +67,9 @@ def test_reduce_min(self, X, keepdims, num_axes, gc, dc): self.run_reduce_op_test( "ReduceMin", X, keepdims, num_axes, np.min, gc, dc) - @given(X=hu.tensor(max_dim=3, dtype=np.float32), keepdims=st.booleans(), - num_axes=st.integers(1, 3), **hu.gcs) + @serial.given( + X=hu.tensor(max_dim=3, dtype=np.float32), keepdims=st.booleans(), + num_axes=st.integers(1, 3), **hu.gcs) def test_reduce_max(self, X, keepdims, num_axes, gc, dc): X_dims = X.shape X_size = X.size @@ -84,7 +87,7 @@ def test_reduce_sum(self, n, m, k, t, keepdims, num_axes, gc, dc): self.run_reduce_op_test( "ReduceSum", X, keepdims, num_axes, np.sum, gc, dc) - @given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), + @serial.given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), num_axes=st.integers(1, 4), **hu.gcs) def test_reduce_mean(self, X, keepdims, num_axes, gc, dc): self.run_reduce_op_test( @@ -99,7 +102,7 @@ def test_reduce_l1(self, n, m, k, keepdims, num_axes, gc, dc): self.run_reduce_op_test( "ReduceL1", X, keepdims, num_axes, getNorm(1), gc, dc) - @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + @serial.given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), keepdims=st.booleans(), num_axes=st.integers(1, 3), **hu.gcs_cpu_only) def test_reduce_l2(self, n, m, k, keepdims, num_axes, gc, dc): X = np.random.randn(n, m, k).astype(np.float32) @@ -119,7 +122,7 @@ def norm(X, axis, keepdims): return norm -class TestReduceFrontReductions(hu.HypothesisTestCase): +class TestReduceFrontReductions(serial.SerializedTestCase): def grad_variant_input_test(self, grad_op_name, X, ref, num_reduce_dim): workspace.ResetWorkspace() @@ -200,7 +203,7 @@ def reduce_op_test(self, op_name, op_ref, in_data, in_names, self.assertGradientChecks( device, op, in_data, 0, [0], stepsize=1e-2, threshold=1e-2) - @given(num_reduce_dim=st.integers(0, 4), **hu.gcs) + @serial.given(num_reduce_dim=st.integers(0, 4), **hu.gcs) def test_reduce_front_sum(self, num_reduce_dim, gc, dc): X = np.random.rand(7, 4, 3, 5).astype(np.float32) @@ -265,7 +268,7 @@ def ref_sum(X, lengths): "ReduceFrontSum", ref_sum, [X, lengths], ["input", "lengths"], num_reduce_dim, gc) - @given(num_reduce_dim=st.integers(0, 4), **hu.gcs) + @serial.given(num_reduce_dim=st.integers(0, 4), **hu.gcs) def test_reduce_front_mean(self, num_reduce_dim, gc, dc): X = np.random.rand(6, 7, 8, 2).astype(np.float32) @@ -296,7 +299,7 @@ def ref_mean(X, lengths): "ReduceFrontMean", ref_mean, [X, lengths], ["input", "lengths"], num_reduce_dim, gc) - 
@given(num_reduce_dim=st.integers(0, 4), **hu.gcs) + @serial.given(num_reduce_dim=st.integers(0, 4), **hu.gcs) def test_reduce_front_max(self, num_reduce_dim, gc, dc): X = np.random.rand(6, 7, 8, 2).astype(np.float32) @@ -325,7 +328,7 @@ def ref_max(X, lengths): "ReduceFrontMax", num_reduce_dim, gc, dc, [X, lengths], ["X", "lengths"], ref_max) - @given(num_reduce_dim=st.integers(0, 4), **hu.gcs) + @serial.given(num_reduce_dim=st.integers(0, 4), **hu.gcs) def test_reduce_back_max(self, num_reduce_dim, gc, dc): X = np.random.rand(6, 7, 8, 2).astype(np.float32) @@ -386,7 +389,7 @@ def ref_sum(X, lengths): "ReduceBackSum", ref_sum, [X, lengths], ["input", "lengths"], num_reduce_dim, gc) - @given(num_reduce_dim=st.integers(0, 4), **hu.gcs) + @serial.given(num_reduce_dim=st.integers(0, 4), **hu.gcs) def test_reduce_back_mean(self, num_reduce_dim, dc, gc): X = np.random.rand(6, 7, 8, 2).astype(np.float32) diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py index 17bdce728ba696..a138fea59708f1 100644 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ b/caffe2/python/operator_test/reduction_ops_test.py @@ -3,17 +3,18 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.proto import caffe2_pb2 from caffe2.python import core from hypothesis import assume, given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -from caffe2.proto import caffe2_pb2 -class TestReductionOps(hu.HypothesisTestCase): +class TestReductionOps(serial.SerializedTestCase): - @given(n=st.integers(5, 8), **hu.gcs) + @serial.given(n=st.integers(5, 8), **hu.gcs) def test_elementwise_sum(self, n, gc, dc): X = np.random.rand(n).astype(np.float32) @@ -41,7 +42,7 @@ def sum_op(X): outputs_with_grads=[0], ) - @given(n=st.integers(5, 8), **hu.gcs) + @serial.given(n=st.integers(5, 8), **hu.gcs) def test_elementwise_int_sum(self, n, gc, dc): X = np.random.rand(n).astype(np.int32) @@ -61,7 +62,7 @@ def sum_op(X): reference=sum_op, ) - @given(n=st.integers(1, 65536), + @serial.given(n=st.integers(1, 65536), dtype=st.sampled_from([np.float32, np.float16]), **hu.gcs) def test_elementwise_sqrsum(self, n, dtype, gc, dc): @@ -120,7 +121,7 @@ def avg_op(X): outputs_with_grads=[0], ) - @given(batch_size=st.integers(1, 3), + @serial.given(batch_size=st.integers(1, 3), m=st.integers(1, 3), n=st.integers(1, 4), **hu.gcs) @@ -143,7 +144,7 @@ def rowwise_max(X): reference=rowwise_max, ) - @given(batch_size=st.integers(1, 3), + @serial.given(batch_size=st.integers(1, 3), m=st.integers(1, 3), n=st.integers(1, 4), **hu.gcs) diff --git a/caffe2/python/operator_test/segment_ops_test.py b/caffe2/python/operator_test/segment_ops_test.py index b732d91e66ad07..0353454ffaddb2 100644 --- a/caffe2/python/operator_test/segment_ops_test.py +++ b/caffe2/python/operator_test/segment_ops_test.py @@ -12,6 +12,7 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial class TesterBase: @@ -418,7 +419,7 @@ def test_unsorted_means_large(self, gc, dc): op = core.CreateOperator("UnsortedSegmentMean", ["X", "segments"], "out") self.assertDeviceChecks(dc, op, [X, segments], [0]) - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, @@ -447,7 +448,7 @@ def ref(D, L): self.assertDeviceChecks(dc, op, [X, Y], [0]) 
self.assertGradientChecks(gc, op, [X, Y], 0, [0]) - @given( + @serial.given( inputs=hu.sparse_lengths_tensor( dtype=np.float32, min_value=1, @@ -476,7 +477,7 @@ def ref(D, I, L): self.assertDeviceChecks(dc, op, [X, Y, Z], [0]) self.assertGradientChecks(gc, op, [X, Y, Z], 0, [0]) - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, @@ -511,7 +512,7 @@ def ref(D, L): self.assertDeviceChecks(dc, op, [X, Y], [0]) self.assertGradientChecks(gc, op, [X, Y], 0, [0]) - @given( + @serial.given( inputs=hu.sparse_lengths_tensor( dtype=np.float32, min_value=1, @@ -547,7 +548,7 @@ def ref(D, I, L): self.assertDeviceChecks(dc, op, [X, Y, Z], [0]) self.assertGradientChecks(gc, op, [X, Y, Z], 0, [0]) - @given( + @serial.given( grad_on_weights=st.booleans(), inputs=hu.sparse_lengths_tensor( dtype=np.float32, @@ -635,7 +636,7 @@ def test_sparse_lengths_sum_invalid_index(self, gc, dc): with self.assertRaises(RuntimeError): workspace.RunOperatorOnce(op) - @given(**hu.gcs_cpu_only) + @serial.given(**hu.gcs_cpu_only) def test_sparse_lengths_positional_weighted_sum( self, gc, dc): D = np.random.rand(50, 3, 4, 5).astype(np.float32) diff --git a/caffe2/python/operator_test/selu_op_test.py b/caffe2/python/operator_test/selu_op_test.py index a0321b138419f8..674cb47cd7e3a1 100644 --- a/caffe2/python/operator_test/selu_op_test.py +++ b/caffe2/python/operator_test/selu_op_test.py @@ -5,16 +5,17 @@ from caffe2.python import core from hypothesis import given -import hypothesis.strategies as st import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st import numpy as np import unittest -class TestSelu(hu.HypothesisTestCase): +class TestSelu(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_selu_1(self, X, gc, dc, engine): diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 28eb9278da1d34..e29c252345ab66 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -2,14 +2,16 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core +from functools import partial from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np import unittest import os -from functools import partial def _gen_test_add_padding(with_pad_data=True, @@ -99,8 +101,8 @@ def _gather_padding_ref(start_pad_width, end_pad_width, data, lengths): return (start_padding, end_padding) -class TestSequenceOps(hu.HypothesisTestCase): - @given(start_pad_width=st.integers(min_value=1, max_value=2), +class TestSequenceOps(serial.SerializedTestCase): + @serial.given(start_pad_width=st.integers(min_value=1, max_value=2), end_pad_width=st.integers(min_value=0, max_value=2), args=_gen_test_add_padding(with_pad_data=True), ret_lengths=st.booleans(), @@ -187,7 +189,7 @@ def test_remove_padding(self, start_pad_width, end_pad_width, args, gc, dc): reference=partial(_remove_padding_ref, start_pad_width, end_pad_width)) @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given(start_pad_width=st.integers(min_value=0, max_value=2), + @serial.given(start_pad_width=st.integers(min_value=0, 
max_value=2), end_pad_width=st.integers(min_value=0, max_value=2), args=_gen_test_add_padding(with_pad_data=True), **hu.gcs) @@ -208,7 +210,7 @@ def test_gather_padding(self, start_pad_width, end_pad_width, args, gc, dc): inputs=[padded_data, padded_lengths], reference=partial(_gather_padding_ref, start_pad_width, end_pad_width)) - @given(data=hu.tensor(min_dim=3, max_dim=3, dtype=np.float32, + @serial.given(data=hu.tensor(min_dim=3, max_dim=3, dtype=np.float32, elements=st.floats(min_value=-np.inf, max_value=np.inf), min_value=1, max_value=10), @@ -242,7 +244,7 @@ def op_grad_ref(grad_out, outputs, inputs): output_to_grad='reversed_data', grad_reference=op_grad_ref) - @given(data=hu.tensor(min_dim=1, max_dim=3, dtype=np.float32, + @serial.given(data=hu.tensor(min_dim=1, max_dim=3, dtype=np.float32, elements=st.floats(min_value=-np.inf, max_value=np.inf), min_value=10, max_value=10), @@ -270,7 +272,7 @@ def op_ref(data, indices): inputs=[data, indices], reference=op_ref) - @given(elements=st.lists(st.integers(min_value=0, max_value=9), + @serial.given(elements=st.lists(st.integers(min_value=0, max_value=9), min_size=0, max_size=10), **hu.gcs_cpu_only) diff --git a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py index 576c79892ca6ab..f3cb4486d87ed5 100644 --- a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py +++ b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py @@ -6,6 +6,7 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np import math @@ -19,8 +20,8 @@ MAX_TEST_AMPLITUDE = 10.0 -class TestSinusoidPositionEncodingOp(hu.HypothesisTestCase): - @given( +class TestSinusoidPositionEncodingOp(serial.SerializedTestCase): + @serial.given( positions_vec=hu.arrays( dims=[MAX_TEST_SEQUENCE_LENGTH], dtype=np.int32, diff --git a/caffe2/python/operator_test/softmax_ops_test.py b/caffe2/python/operator_test/softmax_ops_test.py index ff2b0a526a03f6..113a44e50f88e0 100644 --- a/caffe2/python/operator_test/softmax_ops_test.py +++ b/caffe2/python/operator_test/softmax_ops_test.py @@ -2,18 +2,20 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np import unittest -class TestSoftmaxOps(hu.HypothesisTestCase): +class TestSoftmaxOps(serial.SerializedTestCase): - @given(n=st.sampled_from([0, 2, 4, 71, 103]), + @serial.given(n=st.sampled_from([0, 2, 4, 71, 103]), D=st.sampled_from([4, 8, 64, 79, 256, 333]), engine=st.sampled_from([None, 'CUDNN']), **hu.gcs) @@ -51,7 +53,7 @@ def label_softmax(X): reference=label_softmax, ) - @given(n=st.sampled_from([0, 2, 4, 71, 103, 555, 751, 1201]), + @serial.given(n=st.sampled_from([0, 2, 4, 71, 103, 555, 751, 1201]), D=st.sampled_from([4, 8, 64, 79, 256, 333, 1000]), engine=st.sampled_from([None, 'CUDNN']), **hu.gcs) @@ -134,7 +136,7 @@ def label_softmax(X): self.assertGradientChecks( gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2) - @given(n=st.integers(2, 10), D=st.integers(4, 16), + @serial.given(n=st.integers(2, 10), D=st.integers(4, 16), only_loss=st.booleans(), 
**hu.gcs) def test_softmax_with_loss(self, n, D, gc, only_loss, dc): # n = number of examples, D = |labels| @@ -449,7 +451,7 @@ def label_softmax_crossent_weighted(X, label, weights): self.assertGradientChecks( gc, op, [X, label, weights], 0, [1], stepsize=1e-4, threshold=1e-2) - @given(n=st.integers(2, 5), D=st.integers(2, 4), + @serial.given(n=st.integers(2, 5), D=st.integers(2, 4), weighted=st.booleans(), **hu.gcs) def test_spatial_softmax_with_loss(self, n, D, weighted, gc, dc): # n = number of examples, D = |labels| diff --git a/caffe2/python/operator_test/sparse_ops_test.py b/caffe2/python/operator_test/sparse_ops_test.py index b9d7d40fdee3e2..50c127c141f500 100644 --- a/caffe2/python/operator_test/sparse_ops_test.py +++ b/caffe2/python/operator_test/sparse_ops_test.py @@ -2,16 +2,18 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -import numpy as np + from caffe2.python import core from caffe2.python.test_util import rand_array import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestScatterOps(hu.HypothesisTestCase): +class TestScatterOps(serial.SerializedTestCase): # TODO(dzhulgakov): add test cases for failure scenarios - @given(num_args=st.integers(1, 5), + @serial.given(num_args=st.integers(1, 5), first_dim=st.integers(1, 20), index_dim=st.integers(1, 10), extra_dims=st.lists(st.integers(1, 4), min_size=0, max_size=3), @@ -52,7 +54,7 @@ def ref(d, w0, ind, *args): inputs.extend([x,w]) self.assertReferenceChecks(gc, op, inputs, ref, threshold=1e-3) - @given(first_dim=st.integers(1, 20), + @serial.given(first_dim=st.integers(1, 20), index_dim=st.integers(1, 10), extra_dims=st.lists(st.integers(1, 4), min_size=0, max_size=3), data_type=st.sampled_from([np.float16, np.float32, np.int32, np.int64]), diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index bf87f06ef7fe76..e84323b4704680 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -3,15 +3,15 @@ from __future__ import print_function from __future__ import unicode_literals -from hypothesis import given -import hypothesis.strategies as st -import numpy as np +from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, workspace import caffe2.python.hypothesis_test_util as hu from caffe2.python.model_helper import ModelHelper -from caffe2.proto import caffe2_pb2 - +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given +import hypothesis.strategies as st +import numpy as np import unittest @@ -20,9 +20,9 @@ def _run_in_hip(gc, dc): caffe2_pb2.HIP in {d.device_type for d in dc}) -class TestSpatialBN(hu.HypothesisTestCase): +class TestSpatialBN(serial.SerializedTestCase): - @given(size=st.integers(7, 10), + @serial.given(size=st.integers(7, 10), input_channels=st.integers(1, 10), batch_size=st.integers(0, 3), seed=st.integers(0, 65535), diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py index 253184a4bb8ead..b2b0ff7b93c490 100644 --- a/caffe2/python/operator_test/square_root_divide_op_test.py +++ b/caffe2/python/operator_test/square_root_divide_op_test.py @@ -9,6 +9,7 @@ from hypothesis import strategies as st import 
caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import math import numpy as np @@ -55,8 +56,8 @@ def grad(output_grad, ref_outputs, inputs): None) -class TestSquareRootDivide(hu.HypothesisTestCase): - @given(data_and_scale=_data_and_scale(), +class TestSquareRootDivide(serial.SerializedTestCase): + @serial.given(data_and_scale=_data_and_scale(), **hu.gcs_cpu_only) def test_square_root_divide(self, data_and_scale, gc, dc): self.assertReferenceChecks( diff --git a/caffe2/python/operator_test/string_ops_test.py b/caffe2/python/operator_test/string_ops_test.py index 49c25bc2bd6a0d..a5ffb66011112c 100644 --- a/caffe2/python/operator_test/string_ops_test.py +++ b/caffe2/python/operator_test/string_ops_test.py @@ -2,9 +2,11 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -16,7 +18,7 @@ def _string_lists(alphabet=None): max_size=3) -class TestStringOps(hu.HypothesisTestCase): +class TestStringOps(serial.SerializedTestCase): @given(strings=_string_lists()) def test_string_prefix(self, strings): length = 3 @@ -67,7 +69,7 @@ def string_suffix_ref(strings): [strings], string_suffix_ref) - @given(strings=st.text(alphabet=['a', 'b'], average_size=3)) + @serial.given(strings=st.text(alphabet=['a', 'b'], average_size=3)) def test_string_starts_with(self, strings): prefix = 'a' strings = np.array( @@ -90,7 +92,7 @@ def string_starts_with_ref(strings): [strings], string_starts_with_ref) - @given(strings=st.text(alphabet=['a', 'b'], average_size=3)) + @serial.given(strings=st.text(alphabet=['a', 'b'], average_size=3)) def test_string_ends_with(self, strings): suffix = 'a' strings = np.array( diff --git a/caffe2/python/operator_test/thresholded_relu_op_test.py b/caffe2/python/operator_test/thresholded_relu_op_test.py index 6c50f02246101e..8d6f81492e2c56 100644 --- a/caffe2/python/operator_test/thresholded_relu_op_test.py +++ b/caffe2/python/operator_test/thresholded_relu_op_test.py @@ -7,17 +7,18 @@ from hypothesis import given import hypothesis.strategies as st import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np import unittest -class TestThresholdedRelu(hu.HypothesisTestCase): +class TestThresholdedRelu(serial.SerializedTestCase): # test case 1 - default alpha - we do reference and dc checks. # test case 2 does dc and reference checks over range of alphas. # test case 3 does gc over range of alphas. 
- @given(input=hu.tensor(), + @serial.given(input=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_thresholded_relu_1(self, input, gc, dc, engine): diff --git a/caffe2/python/operator_test/tile_op_test.py b/caffe2/python/operator_test/tile_op_test.py index b04f07f6f51f11..a9c5b02837fa29 100644 --- a/caffe2/python/operator_test/tile_op_test.py +++ b/caffe2/python/operator_test/tile_op_test.py @@ -11,10 +11,11 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestTile(hu.HypothesisTestCase): - @given(M=st.integers(min_value=1, max_value=10), +class TestTile(serial.SerializedTestCase): + @serial.given(M=st.integers(min_value=1, max_value=10), K=st.integers(min_value=1, max_value=10), N=st.integers(min_value=1, max_value=10), tiles=st.integers(min_value=1, max_value=3), diff --git a/caffe2/python/operator_test/top_k_test.py b/caffe2/python/operator_test/top_k_test.py index b1306c4250e431..5d43629afebc65 100644 --- a/caffe2/python/operator_test/top_k_test.py +++ b/caffe2/python/operator_test/top_k_test.py @@ -9,9 +9,10 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestTopK(hu.HypothesisTestCase): +class TestTopK(serial.SerializedTestCase): def top_k_ref(self, X, k, flatten_indices, axis=-1): in_dims = X.shape @@ -64,7 +65,7 @@ def top_k_ref(self, X, k, flatten_indices, axis=-1): else: return (values_ref, indices_ref) - @given( + @serial.given( X=hu.tensor(), flatten_indices=st.booleans(), seed=st.integers(0, 10), diff --git a/caffe2/python/operator_test/transpose_op_test.py b/caffe2/python/operator_test/transpose_op_test.py index e0112ff698cf1c..088cc407efb71c 100644 --- a/caffe2/python/operator_test/transpose_op_test.py +++ b/caffe2/python/operator_test/transpose_op_test.py @@ -6,14 +6,16 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np import unittest -class TestTransposeOp(hu.HypothesisTestCase): - @given(X=hu.tensor(dtype=np.float32), use_axes=st.booleans(), **hu.gcs) +class TestTransposeOp(serial.SerializedTestCase): + @serial.given( + X=hu.tensor(dtype=np.float32), use_axes=st.booleans(), **hu.gcs) def test_transpose(self, X, use_axes, gc, dc): ndim = len(X.shape) axes = np.arange(ndim) diff --git a/caffe2/python/operator_test/trigonometric_op_test.py b/caffe2/python/operator_test/trigonometric_op_test.py index 834491141b1d72..a17662780579c4 100644 --- a/caffe2/python/operator_test/trigonometric_op_test.py +++ b/caffe2/python/operator_test/trigonometric_op_test.py @@ -7,25 +7,34 @@ from hypothesis import given from hypothesis import strategies as st import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np import unittest -class TestTrigonometricOp(hu.HypothesisTestCase): - @given(X=hu.tensor(elements=st.floats(min_value=-0.7, max_value=0.7)), **hu.gcs) +class TestTrigonometricOp(serial.SerializedTestCase): + @serial.given( + X=hu.tensor(elements=st.floats(min_value=-0.7, max_value=0.7)), + **hu.gcs) def test_acos(self, X, gc, dc): self.assertTrigonometricChecks("Acos", X, lambda x: (np.arccos(X),), gc, dc) - @given(X=hu.tensor(elements=st.floats(min_value=-0.7, max_value=0.7)), 
**hu.gcs) + @serial.given( + X=hu.tensor(elements=st.floats(min_value=-0.7, max_value=0.7)), + **hu.gcs) def test_asin(self, X, gc, dc): self.assertTrigonometricChecks("Asin", X, lambda x: (np.arcsin(X),), gc, dc) - @given(X=hu.tensor(elements=st.floats(min_value=-100, max_value=100)), **hu.gcs) + @serial.given( + X=hu.tensor(elements=st.floats(min_value=-100, max_value=100)), + **hu.gcs) def test_atan(self, X, gc, dc): self.assertTrigonometricChecks("Atan", X, lambda x: (np.arctan(X),), gc, dc) - @given(X=hu.tensor(elements=st.floats(min_value=-0.5, max_value=0.5)), **hu.gcs) + @serial.given( + X=hu.tensor(elements=st.floats(min_value=-0.5, max_value=0.5)), + **hu.gcs) def test_tan(self, X, gc, dc): self.assertTrigonometricChecks("Tan", X, lambda x: (np.tan(X),), gc, dc) diff --git a/caffe2/python/operator_test/unique_ops_test.py b/caffe2/python/operator_test/unique_ops_test.py index dc7c7b2f9175db..01efe4214b55d4 100644 --- a/caffe2/python/operator_test/unique_ops_test.py +++ b/caffe2/python/operator_test/unique_ops_test.py @@ -20,11 +20,11 @@ import hypothesis.strategies as st import numpy as np -from hypothesis import given from functools import partial from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial def _unique_ref(x, return_inverse): @@ -34,8 +34,8 @@ def _unique_ref(x, return_inverse): return ret -class TestUniqueOps(hu.HypothesisTestCase): - @given( +class TestUniqueOps(serial.SerializedTestCase): + @serial.given( X=hu.tensor1d( # allow empty min_len=0, diff --git a/caffe2/python/operator_test/upsample_op_test.py b/caffe2/python/operator_test/upsample_op_test.py index c4dde062d09485..f8ec148a3d7047 100644 --- a/caffe2/python/operator_test/upsample_op_test.py +++ b/caffe2/python/operator_test/upsample_op_test.py @@ -17,16 +17,17 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given +import hypothesis.strategies as st +import numpy as np +import unittest -class TestUpSample(hu.HypothesisTestCase): - @given(height_scale=st.floats(1.0, 4.0) | st.just(2.0), +class TestUpSample(serial.SerializedTestCase): + @serial.given(height_scale=st.floats(1.0, 4.0) | st.just(2.0), width_scale=st.floats(1.0, 4.0) | st.just(2.0), height=st.integers(4, 32), width=st.integers(4, 32), @@ -86,7 +87,7 @@ def ref(X): self.assertDeviceChecks(dc, op, [X], [0]) self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=0.1, threshold=1e-2) - @given(height_scale=st.floats(1.0, 4.0) | st.just(2.0), + @serial.given(height_scale=st.floats(1.0, 4.0) | st.just(2.0), width_scale=st.floats(1.0, 4.0) | st.just(2.0), height=st.integers(4, 32), width=st.integers(4, 32), diff --git a/caffe2/python/operator_test/utility_ops_test.py b/caffe2/python/operator_test/utility_ops_test.py index 76d3dee4f967cb..701d846e532f0f 100644 --- a/caffe2/python/operator_test/utility_ops_test.py +++ b/caffe2/python/operator_test/utility_ops_test.py @@ -7,15 +7,16 @@ from hypothesis import assume, given from caffe2.proto import caffe2_pb2 import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np import random import 
unittest -class TestUtilityOps(hu.HypothesisTestCase): +class TestUtilityOps(serial.SerializedTestCase): - @given(X=hu.tensor(), args=st.booleans(), **hu.gcs) + @serial.given(X=hu.tensor(), args=st.booleans(), **hu.gcs) def test_slice(self, X, args, gc, dc): X = X.astype(dtype=np.float32) dim = random.randint(0, X.ndim - 1) @@ -57,7 +58,7 @@ def slice_ref(x, starts, ends): outputs_with_grads=[0], ) - @given(dtype=st.sampled_from([np.float32, np.int32]), + @serial.given(dtype=st.sampled_from([np.float32, np.int32]), ndims=st.integers(min_value=1, max_value=5), seed=st.integers(min_value=0, max_value=65536), null_axes=st.booleans(), @@ -92,7 +93,7 @@ def transpose_ref(x, axes): self.assertReferenceChecks(gc, op, [X, axes], transpose_ref) - @given(m=st.integers(5, 10), n=st.integers(5, 10), + @serial.given(m=st.integers(5, 10), n=st.integers(5, 10), o=st.integers(5, 10), nans=st.booleans(), **hu.gcs) def test_nan_check(self, m, n, o, nans, gc, dc): other = np.array([1, 2, 3]).astype(np.float32) @@ -143,7 +144,7 @@ def nan_reference(X, Y): except RuntimeError: pass - @given(n=st.integers(4, 5), m=st.integers(6, 7), + @serial.given(n=st.integers(4, 5), m=st.integers(6, 7), d=st.integers(2, 3), **hu.gcs) def test_elementwise_max(self, n, m, d, gc, dc): X = np.random.rand(n, m, d).astype(np.float32) @@ -168,7 +169,7 @@ def max_op(X, Y, Z): ) self.assertDeviceChecks(dc, op, inputs, [0]) - @given(n=st.integers(4, 5), m=st.integers(6, 7), + @serial.given(n=st.integers(4, 5), m=st.integers(6, 7), d=st.integers(2, 3), **hu.gcs) def test_elementwise_max_grad(self, n, m, d, gc, dc): go = np.random.rand(n, m, d).astype(np.float32) @@ -198,7 +199,7 @@ def mx_grad(a): ) self.assertDeviceChecks(dc, op, inputs, [0, 1, 2]) - @given(n=st.integers(4, 5), m=st.integers(6, 7), + @serial.given(n=st.integers(4, 5), m=st.integers(6, 7), d=st.integers(2, 3), **hu.gcs) def test_elementwise_min(self, n, m, d, gc, dc): X = np.random.rand(n, m, d).astype(np.float32) @@ -223,7 +224,7 @@ def min_op(X, Y, Z): ) self.assertDeviceChecks(dc, op, inputs, [0]) - @given(n=st.integers(4, 5), m=st.integers(6, 7), + @serial.given(n=st.integers(4, 5), m=st.integers(6, 7), d=st.integers(2, 3), **hu.gcs) def test_elementwise_min_grad(self, n, m, d, gc, dc): go = np.random.rand(n, m, d).astype(np.float32) @@ -253,7 +254,7 @@ def mx_grad(a): ) self.assertDeviceChecks(dc, op, inputs, [0, 1, 2]) - @given( + @serial.given( inputs=hu.lengths_tensor().flatmap( lambda pair: st.tuples( st.just(pair[0]), @@ -294,7 +295,7 @@ def lengths_gather_op(items, lengths, indices): reference=lengths_gather_op, ) - @given( + @serial.given( inputs=hu.lengths_tensor(), **hu.gcs_cpu_only) def test_lengths_to_ranges(self, inputs, gc, dc): @@ -333,7 +334,7 @@ def lengths_to_ranges_op(lengths): self.assertEqual(shapes[output], list(lengths.shape) + [2]) self.assertEqual(types[output], core.DataType.INT32) - @given(**hu.gcs) + @serial.given(**hu.gcs) def test_size_op(self, gc, dc): X = np.array([[1, 2], [3, 4]]).astype(np.float32) diff --git a/caffe2/python/operator_test/wngrad_test.py b/caffe2/python/operator_test/wngrad_test.py index 7f81dc2429c378..7b4e81cd8056c6 100644 --- a/caffe2/python/operator_test/wngrad_test.py +++ b/caffe2/python/operator_test/wngrad_test.py @@ -14,6 +14,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial logger = logging.getLogger(__name__) @@ -76,8 +77,8 @@ def ref_sparse(param, seq_b, indices, grad, lr): ) -class 
TestWngrad(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=2), +class TestWngrad(serial.SerializedTestCase): + @serial.given(inputs=hu.tensors(n=2), seq_b=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), lr=st.floats(min_value=0.01, max_value=0.99, @@ -174,7 +175,7 @@ def test_sparse_wngrad(self, inputs, seq_b, lr, epsilon, gc, dc): return wngrad_sparse_test_helper(self, inputs, seq_b, lr, epsilon, None, gc, dc) - @given(inputs=hu.tensors(n=1), + @serial.given(inputs=hu.tensors(n=1), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), seq_b=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 9a29f73baef816..fe6157d797f91b 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -30,6 +30,7 @@ #include "caffe2/predictor/predictor.h" #include "caffe2/python/pybind_state_registry.h" #include "caffe2/utils/cpuid.h" +#include "caffe2/utils/proto_convert.h" #include "caffe2/utils/string_utils.h" namespace caffe2 { @@ -1186,11 +1187,17 @@ void addGlobalMethods(py::module& m) { return true; }); m.def("nets", []() { return gWorkspace->Nets(); }); - m.def("run_operator_once", [](const py::bytes& op_def) { + m.def("run_operator_once", [](const py::bytes& op_def, bool legacy_proto=true) { CAFFE_ENFORCE(gWorkspace); OperatorDef def; - CAFFE_ENFORCE( - ParseProtoFromLargeString(op_def.cast(), &def)); + if (legacy_proto) { + CAFFE_ENFORCE(ParseProtoFromLargeString(op_def.cast(), &def)); + } else { + ::torch::NodeProto node; + CAFFE_ENFORCE( + ParseProtoFromLargeString(op_def.cast(), &node)); + NodeProtoToOperatorDef(node, &def); + } py::gil_scoped_release g; CAFFE_ENFORCE(gWorkspace->RunOperatorOnce(def)); return true; @@ -1527,6 +1534,38 @@ void addGlobalMethods(py::module& m) { CAFFE_ENFORCE(blob); return BlobStat::sizeBytes(*blob); }); + m.def("argument_to_attribute_proto", [](py::bytes arg_str) -> py::bytes { + Argument arg; + CAFFE_ENFORCE( + ParseProtoFromLargeString(arg_str.cast(), &arg)); + ::torch::AttributeProto attr; + ArgumentToAttributeProto(arg, &attr); + return attr.SerializeAsString(); + }); + m.def("attribute_proto_to_argument", [](py::bytes attr_str) -> py::bytes { + ::torch::AttributeProto attr; + CAFFE_ENFORCE( + ParseProtoFromLargeString(attr_str.cast(), &attr)); + Argument arg; + AttributeProtoToArgument(attr, &arg); + return arg.SerializeAsString(); + }); + m.def("operator_def_to_node_proto", [](py::bytes op_str) -> py::bytes { + OperatorDef op_def; + CAFFE_ENFORCE( + ParseProtoFromLargeString(op_str.cast(), &op_def)); + ::torch::NodeProto node; + OperatorDefToNodeProto(op_def, &node); + return node.SerializeAsString(); + }); + m.def("node_proto_to_operator_def", [](py::bytes node_str) -> py::bytes { + ::torch::NodeProto node_proto; + CAFFE_ENFORCE( + ParseProtoFromLargeString(node_str.cast(), &node_proto)); + OperatorDef op_def; + NodeProtoToOperatorDef(node_proto, &op_def); + return op_def.SerializeAsString(); + }); m.def("support_onnx_export", [](const std::string& op) -> bool { const OpSchema* schema = caffe2::OpSchemaRegistry::Schema(op); if (!schema) { diff --git a/caffe2/python/serialized_test/README.md b/caffe2/python/serialized_test/README.md index 2885ed290cab8e..a122e7ed309696 100644 --- a/caffe2/python/serialized_test/README.md +++ b/caffe2/python/serialized_test/README.md @@ -6,7 +6,7 @@ Major functionality lives in `serialized_test_util.py` 1. Extend the test case class from `SerializedTestCase` 2. 
Change the `@given` decorator to `@serialized_test_util.given`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. 3. [Optional] Add (or change a call of `unittest.main()` to) `testWithArgs` in `__main__`. This allows you to generate outputs using `python caffe2/python/operator_test/my_test.py -G`. -4. Run your test `python -m pytest caffe2/python/operator_test/my_test.py -G` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one npz file per test function. Use `-O` to change the output directory. +4. Run your test `python -m pytest caffe2/python/operator_test/my_test.py -G` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one zip file per test function. The zip file contains an `inout.npz` file of the inputs, outputs, and meta data (like device type), a `op.pb` file of the operator, and `grad_#.pb` files of the gradients if there are any. Use `-O` to change the output directory. 5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. The comparison is done as long as you have a call to assertReferenceChecks. If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. ##Additional Notes diff --git a/caffe2/python/serialized_test/data/operator_test/rank_loss_operator_test.test_pair_wise_loss_batch.zip b/caffe2/python/serialized_test/data/operator_test/rank_loss_operator_test.test_pair_wise_loss_batch.zip new file mode 100644 index 00000000000000..0b377f6c31763f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/rank_loss_operator_test.test_pair_wise_loss_batch.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/recurrent_network_test.test_mul.zip b/caffe2/python/serialized_test/data/operator_test/recurrent_network_test.test_mul.zip new file mode 100644 index 00000000000000..330663b52eda1e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/recurrent_network_test.test_mul.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_max.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_max.zip new file mode 100644 index 00000000000000..3f709bb1d2db1f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_max.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_mean.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_mean.zip new file mode 100644 index 00000000000000..a291738573f40a Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_mean.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_max.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_max.zip new file mode 100644 index 00000000000000..9f8aed3c93f210 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_max.zip differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_mean.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_mean.zip new file mode 100644 index 00000000000000..85487c40f483cc Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_mean.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_sum.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_sum.zip new file mode 100644 index 00000000000000..ed898d62efeb5e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_sum.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_l2.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_l2.zip new file mode 100644 index 00000000000000..3a3ca69a17610f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_l2.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_max.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_max.zip new file mode 100644 index 00000000000000..934b3a548e75a5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_max.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_mean.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_mean.zip new file mode 100644 index 00000000000000..805775c93328bb Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_mean.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_min.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_min.zip new file mode 100644 index 00000000000000..45e37d688b2151 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_min.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_columnwise_max.zip b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_columnwise_max.zip new file mode 100644 index 00000000000000..3eb10178c78bb8 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_columnwise_max.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_int_sum.zip b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_int_sum.zip new file mode 100644 index 00000000000000..3948ea005d7955 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_int_sum.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sqrsum.zip b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sqrsum.zip new file mode 100644 index 00000000000000..a1d513d641e88d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sqrsum.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sum.zip 
b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sum.zip new file mode 100644 index 00000000000000..bda6b05b3850ad Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sum.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_rowwise_max.zip b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_rowwise_max.zip new file mode 100644 index 00000000000000..bd36e749b0da0a Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_rowwise_max.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/selu_op_test.test_selu_1.zip b/caffe2/python/serialized_test/data/operator_test/selu_op_test.test_selu_1.zip new file mode 100644 index 00000000000000..967d48edc540aa Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/selu_op_test.test_selu_1.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_add_padding.zip b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_add_padding.zip new file mode 100644 index 00000000000000..4f95c338e95243 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_add_padding.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_find_duplicate_elements.zip b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_find_duplicate_elements.zip new file mode 100644 index 00000000000000..6a3a2dec9d7718 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_find_duplicate_elements.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_gather_padding.zip b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_gather_padding.zip new file mode 100644 index 00000000000000..58c0f55039bd52 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_gather_padding.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_remove_data_blocks.zip b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_remove_data_blocks.zip new file mode 100644 index 00000000000000..877e7757de6861 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_remove_data_blocks.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_reverse_packed_segs.zip b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_reverse_packed_segs.zip new file mode 100644 index 00000000000000..ca0d7eafa6b340 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_reverse_packed_segs.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/sinusoid_position_encoding_op_test.test_sinusoid_embedding.zip b/caffe2/python/serialized_test/data/operator_test/sinusoid_position_encoding_op_test.test_sinusoid_embedding.zip new file mode 100644 index 00000000000000..0850604cea11a8 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/sinusoid_position_encoding_op_test.test_sinusoid_embedding.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax.zip 
b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax.zip new file mode 100644 index 00000000000000..571fdb1ae9fb25 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_grad.zip b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_grad.zip new file mode 100644 index 00000000000000..440915c51a78f5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_grad.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_with_loss.zip b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_with_loss.zip new file mode 100644 index 00000000000000..98f8809e4a2b67 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_with_loss.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_spatial_softmax_with_loss.zip b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_spatial_softmax_with_loss.zip new file mode 100644 index 00000000000000..69303ad5b1e88a Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_spatial_softmax_with_loss.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterAssign.zip b/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterAssign.zip new file mode 100644 index 00000000000000..aff23e7502d00c Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterAssign.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterWeightedSum.zip b/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterWeightedSum.zip new file mode 100644 index 00000000000000..6ad5bfd310e6fa Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterWeightedSum.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/spatial_bn_op_test.test_spatialbn_test_mode_3d.zip b/caffe2/python/serialized_test/data/operator_test/spatial_bn_op_test.test_spatialbn_test_mode_3d.zip new file mode 100644 index 00000000000000..1bcde7c346f942 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/spatial_bn_op_test.test_spatialbn_test_mode_3d.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/square_root_divide_op_test.test_square_root_divide.zip b/caffe2/python/serialized_test/data/operator_test/square_root_divide_op_test.test_square_root_divide.zip new file mode 100644 index 00000000000000..a9c2954a7a0b5f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/square_root_divide_op_test.test_square_root_divide.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_ends_with.zip b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_ends_with.zip new file mode 100644 index 00000000000000..6af51439bed6b7 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_ends_with.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix.zip 
b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix.zip new file mode 100644 index 00000000000000..e4019f68dfd0e0 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/inout.npz b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/inout.npz new file mode 100644 index 00000000000000..0dfa5f9790c01f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/inout.npz differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/op.pb b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/op.pb new file mode 100644 index 00000000000000..b1f14dad9aefd8 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/op.pb differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_starts_with.zip b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_starts_with.zip new file mode 100644 index 00000000000000..cd0682f99b3089 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_starts_with.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix.zip b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix.zip new file mode 100644 index 00000000000000..cc60f7242ee693 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/inout.npz b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/inout.npz new file mode 100644 index 00000000000000..0dfa5f9790c01f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/inout.npz differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/op.pb b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/op.pb new file mode 100644 index 00000000000000..d59c5130048038 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/op.pb differ diff --git a/caffe2/python/serialized_test/data/operator_test/thresholded_relu_op_test.test_thresholded_relu_1.zip b/caffe2/python/serialized_test/data/operator_test/thresholded_relu_op_test.test_thresholded_relu_1.zip new file mode 100644 index 00000000000000..0f80f7df889745 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/thresholded_relu_op_test.test_thresholded_relu_1.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/tile_op_test.test_tile.zip b/caffe2/python/serialized_test/data/operator_test/tile_op_test.test_tile.zip new file mode 100644 index 00000000000000..17b064f640667d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/tile_op_test.test_tile.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/top_k_test.test_top_k.zip b/caffe2/python/serialized_test/data/operator_test/top_k_test.test_top_k.zip new file mode 100644 index 00000000000000..592bc05b1ec9e2 Binary 
files /dev/null and b/caffe2/python/serialized_test/data/operator_test/top_k_test.test_top_k.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/transpose_op_test.test_transpose.zip b/caffe2/python/serialized_test/data/operator_test/transpose_op_test.test_transpose.zip new file mode 100644 index 00000000000000..19ef2d4a3c47ad Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/transpose_op_test.test_transpose.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_acos.zip b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_acos.zip new file mode 100644 index 00000000000000..0e93add75df727 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_acos.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_asin.zip b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_asin.zip new file mode 100644 index 00000000000000..0df01759115a60 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_asin.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_atan.zip b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_atan.zip new file mode 100644 index 00000000000000..02dd9a82bea2a2 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_atan.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_tan.zip b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_tan.zip new file mode 100644 index 00000000000000..c0de1850d99a24 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_tan.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/unique_ops_test.test_unique_op.zip b/caffe2/python/serialized_test/data/operator_test/unique_ops_test.test_unique_op.zip new file mode 100644 index 00000000000000..ba2a4cb04578b5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/unique_ops_test.test_unique_op.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample.zip b/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample.zip new file mode 100644 index 00000000000000..66b8ce7f7a686f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample_grad.zip b/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample_grad.zip new file mode 100644 index 00000000000000..2ac20c92650a96 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample_grad.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max.zip new file mode 100644 index 00000000000000..636865e46bccc5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max_grad.zip 
b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max_grad.zip new file mode 100644 index 00000000000000..907f4671100e44 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max_grad.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min.zip new file mode 100644 index 00000000000000..1bc0467bdd36a8 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min_grad.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min_grad.zip new file mode 100644 index 00000000000000..ba8a8e65f994c8 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min_grad.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_gather.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_gather.zip new file mode 100644 index 00000000000000..37dd4135fd7c0e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_gather.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_to_ranges.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_to_ranges.zip new file mode 100644 index 00000000000000..1cf4166c8f2d57 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_to_ranges.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_nan_check.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_nan_check.zip new file mode 100644 index 00000000000000..104da2e4df45db Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_nan_check.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_size_op.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_size_op.zip new file mode 100644 index 00000000000000..97c76bb32df267 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_size_op.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_slice.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_slice.zip new file mode 100644 index 00000000000000..e5088a7b374fa4 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_slice.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_transpose.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_transpose.zip new file mode 100644 index 00000000000000..432d71b032a00e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_transpose.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_sparse_wngrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_sparse_wngrad_empty.zip new file mode 100644 index 00000000000000..54edacc126347f Binary files /dev/null and 
b/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_sparse_wngrad_empty.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_wngrad_dense_base.zip b/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_wngrad_dense_base.zip new file mode 100644 index 00000000000000..92225758a1fa02 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_wngrad_dense_base.zip differ diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index ef02f64dc993b7..a41cc153177639 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -163,8 +163,8 @@ def GetOperatorCost(operator, blobs): return C.get_operator_cost(StringifyProto(operator), blobs) -def RunOperatorOnce(operator): - return C.run_operator_once(StringifyProto(operator)) +def RunOperatorOnce(operator, legacy_proto=True): + return C.run_operator_once(StringifyProto(operator), legacy_proto) def RunOperatorsOnce(operators): diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index 3217198dd8cdf3..122147769e0282 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -202,7 +202,7 @@ class array final { #if defined(__cpp_deduction_guides) && __cpp_deduction_guides >= 201606 template array(_Tp, _Up...) -> - array && ...), _Tp>, 1 + sizeof...(_Up)>; + array::value && ...), _Tp>, 1 + sizeof...(_Up)>; #endif // Array comparisons. diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index ff8c323dd096de..5544f3f7e243af 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -1,4 +1,5 @@ list(APPEND Caffe2_CPU_SRCS + utils/proto_convert.cc utils/proto_wrap.cc utils/proto_utils.cc utils/murmur_hash3.cc diff --git a/caffe2/utils/proto_convert.cc b/caffe2/utils/proto_convert.cc new file mode 100644 index 00000000000000..24984203bcb810 --- /dev/null +++ b/caffe2/utils/proto_convert.cc @@ -0,0 +1,186 @@ +#include "caffe2/utils/proto_convert.h" +#include "caffe2/core/logging.h" + +namespace caffe2 { + +CAFFE2_EXPORT void ArgumentToAttributeProto( + const Argument& arg, + ::torch::AttributeProto* attr) { + CAFFE_ENFORCE(arg.has_name()); + attr->set_name(arg.name()); + if (arg.has_f()) { + attr->set_f(arg.f()); + } else if (arg.has_i()) { + attr->set_i(arg.i()); + } else if (arg.has_s()) { + attr->set_s(arg.s()); + } else if (arg.has_n()) { + // TODO + CAFFE_THROW("NetDef conversion is not implemented yet."); + } else if (arg.floats_size() > 0) { + attr->mutable_floats()->CopyFrom(arg.floats()); + } else if (arg.ints_size() > 0) { + attr->mutable_ints()->CopyFrom(arg.ints()); + } else if (arg.strings_size() > 0) { + attr->mutable_strings()->CopyFrom(arg.strings()); + } else if (arg.nets_size() > 0) { + // TODO + CAFFE_THROW("NetDefs conversion is not implemented yet."); + } +} + +CAFFE2_EXPORT void AttributeProtoToArgument( + const ::torch::AttributeProto& attr, + Argument* arg) { + CAFFE_ENFORCE(attr.has_name()); + arg->set_name(attr.name()); + CAFFE_ENFORCE(attr.has_type()); + const auto type = attr.type(); + if (type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_FLOAT) { + CAFFE_ENFORCE(attr.has_f()); + arg->set_f(attr.f()); + } else if ( + type == + ::torch::AttributeProto_AttributeType::AttributeProto_AttributeType_INT) { + CAFFE_ENFORCE(attr.has_i()); + arg->set_i(attr.i()); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_STRING) { + CAFFE_ENFORCE(attr.has_s()); + arg->set_s(attr.s()); + } else if ( 
+ type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_TENSOR) { + CAFFE_THROW("Caffe2's Argument does not support tensor as attribute."); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_GRAPH) { + // TODO + CAFFE_THROW("GraphProto conversion is not implemented yet."); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_FLOATS) { + arg->mutable_floats()->CopyFrom(attr.floats()); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_INTS) { + arg->mutable_ints()->CopyFrom(attr.ints()); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_STRINGS) { + arg->mutable_strings()->CopyFrom(attr.strings()); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_TENSORS) { + CAFFE_THROW("Caffe2's Argument does not support tensors as attribute."); + } else if ( + type == + ::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_GRAPHS) { + // TODO + CAFFE_THROW("GraphProtos conversion is not implemented yet."); + } else { + CAFFE_THROW("Unknow Attribute type."); + } +} + +CAFFE2_EXPORT void OperatorDefToNodeProto( + const OperatorDef& def, + ::torch::NodeProto* node) { + node->mutable_input()->CopyFrom(def.input()); + node->mutable_output()->CopyFrom(def.output()); + if (def.has_name()) { + node->set_name(def.name()); + } + CAFFE_ENFORCE(def.has_type()); + node->set_op_type(def.type()); + for (int i = 0; i < def.arg_size(); ++i) { + auto attr = node->add_attribute(); + ArgumentToAttributeProto(def.arg(i), attr); + } + if (def.has_device_option()) { + node->mutable_device_option()->CopyFrom(def.device_option()); + } + if (def.has_engine()) { + auto attr = node->add_annotations(); + attr->set_name("engine"); + attr->set_type(::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_STRING); + attr->set_s(def.engine()); + } + if (def.control_input_size() > 0) { + auto attr = node->add_annotations(); + attr->set_name("control_input"); + attr->set_type(::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_STRINGS); + attr->mutable_strings()->CopyFrom(def.control_input()); + } + if (def.has_is_gradient_op()) { + auto attr = node->add_annotations(); + attr->set_name("is_gradient_op"); + attr->set_type(::torch::AttributeProto_AttributeType:: + AttributeProto_AttributeType_INT); + if (def.is_gradient_op()) { + attr->set_i(1); + } else { + attr->set_i(0); + } + } + if (def.has_debug_info()) { + node->set_doc_string(def.debug_info()); + } +} + +CAFFE2_EXPORT void NodeProtoToOperatorDef( + const ::torch::NodeProto& node, + OperatorDef* def) { + def->mutable_input()->CopyFrom(node.input()); + def->mutable_output()->CopyFrom(node.output()); + if (node.has_name()) { + def->set_name(node.name()); + } + + CAFFE_ENFORCE(node.has_op_type()); + def->set_type(node.op_type()); + for (int i = 0; i < node.attribute_size(); ++i) { + auto arg = def->add_arg(); + AttributeProtoToArgument(node.attribute(i), arg); + } + if (node.has_doc_string()) { + def->set_debug_info(node.doc_string()); + } + for (int i = 0; i < node.annotations_size(); ++i) { + const auto& attr = node.annotations(i); + CAFFE_ENFORCE(attr.has_name()); + if (attr.name() == "engine") { + CAFFE_ENFORCE(attr.has_s()); + def->set_engine(attr.s()); + } else if (attr.name() == "control_input") { + def->mutable_control_input()->CopyFrom(attr.strings()); + } else if 
(attr.name() == "is_gradient_op") { + CAFFE_ENFORCE(attr.has_i()); + if (i == 0) { + def->set_is_gradient_op(false); + } else { + def->set_is_gradient_op(true); + } + } + auto arg = def->add_arg(); + AttributeProtoToArgument(node.annotations(i), arg); + } + if (node.has_device_option()) { + def->mutable_device_option()->CopyFrom(node.device_option()); + } +} + +} // namespace caffe2 diff --git a/caffe2/utils/proto_convert.h b/caffe2/utils/proto_convert.h new file mode 100644 index 00000000000000..a9ca9c3ad4fa41 --- /dev/null +++ b/caffe2/utils/proto_convert.h @@ -0,0 +1,25 @@ +#ifndef CAFFE2_UTILS_PROTO_CONVERT_H_ +#define CAFFE2_UTILS_PROTO_CONVERT_H_ + +#include "caffe2/core/common.h" +#include "caffe2/proto/caffe2_pb.h" +#include "caffe2/proto/torch_pb.h" + +namespace caffe2 { + +CAFFE2_API void ArgumentToAttributeProto( + const Argument& arg, + ::torch::AttributeProto* attr); +CAFFE2_API void AttributeProtoToArgument( + const ::torch::AttributeProto& attr, + Argument* arg); +CAFFE2_API void OperatorDefToNodeProto( + const OperatorDef& def, + ::torch::NodeProto* node); +CAFFE2_API void NodeProtoToOperatorDef( + const ::torch::NodeProto& node, + OperatorDef* def); + +} // namespace caffe2 + +#endif // CAFFE2_UTILS_PROTO_CONVERT_H_ diff --git a/cmake/Modules/FindBLAS.cmake b/cmake/Modules/FindBLAS.cmake index c51f17b11facf8..7f1761a7758ff7 100644 --- a/cmake/Modules/FindBLAS.cmake +++ b/cmake/Modules/FindBLAS.cmake @@ -105,6 +105,34 @@ if((NOT BLAS_LIBRARIES) ENDIF(MKL_FOUND) endif() +# Apple BLAS library? +if((NOT BLAS_LIBRARIES) + AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "accelerate"))) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "Accelerate") + if (BLAS_LIBRARIES) + set(BLAS_INFO "accelerate") + set(BLAS_IS_ACCELERATE 1) + endif (BLAS_LIBRARIES) +endif() + +if((NOT BLAS_LIBRARIES) + AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "veclib"))) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "vecLib") + if (BLAS_LIBRARIES) + set(BLAS_INFO "veclib") + endif (BLAS_LIBRARIES) +endif() + if((NOT BLAS_LIBRARIES) AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "open"))) check_fortran_libraries( @@ -183,34 +211,6 @@ if((NOT BLAS_LIBRARIES) endif (BLAS_LIBRARIES) endif() -# Apple BLAS library? -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "accelerate"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "Accelerate") - if (BLAS_LIBRARIES) - set(BLAS_INFO "accelerate") - set(BLAS_IS_ACCELERATE 1) - endif (BLAS_LIBRARIES) -endif() - -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "veclib"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "vecLib") - if (BLAS_LIBRARIES) - set(BLAS_INFO "veclib") - endif (BLAS_LIBRARIES) -endif() - # BLAS in ATLAS library? 
(http://math-atlas.sourceforge.net/) if((NOT BLAS_LIBRARIES) AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "atlas"))) diff --git a/cmake/Modules/FindLAPACK.cmake b/cmake/Modules/FindLAPACK.cmake index 9641c45d196d69..99c8d347f4c405 100644 --- a/cmake/Modules/FindLAPACK.cmake +++ b/cmake/Modules/FindLAPACK.cmake @@ -95,6 +95,28 @@ if(BLAS_FOUND) SET(LAPACK_INFO "mkl") ENDIF() + # Accelerate + IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "accelerate")) + SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + check_function_exists("cheev_" ACCELERATE_LAPACK_WORKS) + if(ACCELERATE_LAPACK_WORKS) + SET(LAPACK_INFO "accelerate") + else() + message(STATUS "Strangely, this Accelerate library does not support Lapack?!") + endif() + endif() + + # vecLib + IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "veclib")) + SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + check_function_exists("cheev_" VECLIB_LAPACK_WORKS) + if(VECLIB_LAPACK_WORKS) + SET(LAPACK_INFO "veclib") + else() + message(STATUS "Strangely, this vecLib library does not support Lapack?!") + endif() + endif() + # OpenBlas IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "open")) SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) @@ -128,28 +150,6 @@ if(BLAS_FOUND) endif() endif() - # Accelerate - IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "accelerate")) - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) - check_function_exists("cheev_" ACCELERATE_LAPACK_WORKS) - if(ACCELERATE_LAPACK_WORKS) - SET(LAPACK_INFO "accelerate") - else() - message(STATUS "Strangely, this Accelerate library does not support Lapack?!") - endif() - endif() - - # vecLib - IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "veclib")) - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) - check_function_exists("cheev_" VECLIB_LAPACK_WORKS) - if(VECLIB_LAPACK_WORKS) - SET(LAPACK_INFO "veclib") - else() - message(STATUS "Strangely, this vecLib library does not support Lapack?!") - endif() - endif() - # Generic LAPACK library? IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "generic")) check_lapack_libraries( diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake index 2a196c26ee0b0a..93f3ad1e2996a9 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake @@ -88,6 +88,9 @@ function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE) RUN_OUTPUT_VARIABLE compute_capabilities) endif() + # Filter unrelated content out of the output. + string(REGEX MATCHALL "[0-9]+\\.[0-9]+" compute_capabilities "${compute_capabilities}") + if(run_result EQUAL 0) string(REPLACE "2.1" "2.1(2.0)" compute_capabilities "${compute_capabilities}") set(CUDA_GPU_DETECT_OUTPUT ${compute_capabilities} diff --git a/docs/source/notes/faq.rst b/docs/source/notes/faq.rst index 83bf434aca3b40..a7319f750bcfb1 100644 --- a/docs/source/notes/faq.rst +++ b/docs/source/notes/faq.rst @@ -99,7 +99,7 @@ My data loader workers return identical random numbers ------------------------------------------------------- You are likely using other libraries to generate random numbers in the dataset. For example, NumPy's RNG is duplicated when worker subprocesses are started via -``fork``. See :class:`torch.utils.data.DataLoader`'s document for how to +``fork``. See :class:`torch.utils.data.DataLoader`'s documentation for how to properly set up random seeds in workers with its :attr:`worker_init_fn` option. .. 
_pack-rnn-unpack-with-data-parallelism: diff --git a/setup.py b/setup.py index 5fbe75833da0fb..381123b2b9ced8 100644 --- a/setup.py +++ b/setup.py @@ -432,6 +432,7 @@ def check_file(f): print("Could not find {}".format(f)) print("Did you run 'git submodule update --init'?") sys.exit(1) + check_file(os.path.join(third_party_path, "gloo", "CMakeLists.txt")) check_file(os.path.join(third_party_path, "pybind11", "CMakeLists.txt")) check_file(os.path.join(third_party_path, 'cpuinfo', 'CMakeLists.txt')) diff --git a/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-backward.expect b/test/expect/TestCudaSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestCudaSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! 
\ No newline at end of file diff --git a/test/expect/TestJit.test_pretty_printer-if_one.expect b/test/expect/TestJit.test_pretty_printer-if_one.expect new file mode 100644 index 00000000000000..3a9254be45048e --- /dev/null +++ b/test/expect/TestJit.test_pretty_printer-if_one.expect @@ -0,0 +1,8 @@ +def script(c2, c1): + t2 = aten::lt(c2, c1) + t3 = prim::TensorToNum(t2) + if t3: + c = c2 + else: + c = c1 + return c diff --git a/test/expect/TestJit.test_pretty_printer-if_test.expect b/test/expect/TestJit.test_pretty_printer-if_test.expect new file mode 100644 index 00000000000000..130c9b0c5a8f41 --- /dev/null +++ b/test/expect/TestJit.test_pretty_printer-if_test.expect @@ -0,0 +1,8 @@ +def script(c2, c1): + t2 = aten::lt(c2, c1) + t3 = prim::TensorToNum(t2) + if t3: + c = c1 + else: + c = c2 + return c diff --git a/test/expect/TestJit.test_pretty_printer-loop_use_test.expect b/test/expect/TestJit.test_pretty_printer-loop_use_test.expect new file mode 100644 index 00000000000000..4e35ad2150ef58 --- /dev/null +++ b/test/expect/TestJit.test_pretty_printer-loop_use_test.expect @@ -0,0 +1,16 @@ +def script(y1): + x = aten::add(y1, 1, 1) + z1 = aten::add(x, 5, 1) + t9 = aten::lt(y1, 8) + t10 = prim::TensorToNum(t9) + y = y1 + z = z1 + t11 = t10 + while t11: + y2 = aten::add(y, 1, 1) + t17 = aten::lt(y2, 8) + t18 = prim::TensorToNum(t17) + t11 = t18 + y = y2 + z = x + return (x, z) diff --git a/test/expect/TestJit.test_pretty_printer-while_if_test.expect b/test/expect/TestJit.test_pretty_printer-while_if_test.expect new file mode 100644 index 00000000000000..c830784510e6f3 --- /dev/null +++ b/test/expect/TestJit.test_pretty_printer-while_if_test.expect @@ -0,0 +1,24 @@ +def script(a1, b1): + t5 = aten::lt(a1, 10) + t6 = prim::TensorToNum(t5) + a = a1 + b = b1 + c = 0 + t7 = t6 + while t7: + a2 = aten::add(a, 1, 1) + b2 = aten::add(b, 1, 1) + t15 = aten::gt(a2, b2) + t16 = prim::TensorToNum(t15) + if t16: + c4 = 2 + else: + c4 = 3 + t21 = aten::lt(a2, 10) + t22 = prim::TensorToNum(t21) + t7 = t22 + a = a2 + b = b2 + c = c4 + t27 = aten::add(a, 1, 1) + return t27 diff --git a/test/expect/TestJit.test_pretty_printer-while_test.expect b/test/expect/TestJit.test_pretty_printer-while_test.expect new file mode 100644 index 00000000000000..487087ad565646 --- /dev/null +++ b/test/expect/TestJit.test_pretty_printer-while_test.expect @@ -0,0 +1,15 @@ +def script(a1, i1): + t4 = aten::lt(i1, 3) + t5 = prim::TensorToNum(t4) + a = a1 + i = i1 + t6 = t5 + while t6: + a2 = aten::mul(a, a) + i2 = aten::add(i, 1, 1) + t13 = aten::lt(i2, 3) + t14 = prim::TensorToNum(t13) + t6 = t14 + a = a2 + i = i2 + return a diff --git a/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-backward.expect b/test/expect/TestSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. 
Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-uncoalesced.expect b/test/expect/TestSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect b/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect deleted file mode 100644 index fab1614da93d4a..00000000000000 --- a/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect +++ /dev/null @@ -1 +0,0 @@ -backend of indices (CUDA) must match backend of values (CPU) \ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect b/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect deleted file mode 100644 index 77b0b500f3b692..00000000000000 --- a/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'other' to be a CPU tensor, but got a CUDA tensor \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestUncoalescedSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestUncoalescedSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! 
\ No newline at end of file diff --git a/test/onnx/expect/TestOperators.test_embedding_bags.expect b/test/onnx/expect/TestOperators.test_embedding_bags.expect new file mode 100644 index 00000000000000..13faaa6b4add3b --- /dev/null +++ b/test/onnx/expect/TestOperators.test_embedding_bags.expect @@ -0,0 +1,104 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.4" +graph { + node { + input: "2" + input: "0" + input: "1" + output: "3" + output: "4" + output: "5" + output: "6" + op_type: "ATen" + attribute { + name: "mode" + i: 1 + type: INT + } + attribute { + name: "operator" + s: "embedding_bag" + type: STRING + } + attribute { + name: "scale_grad_by_freq" + i: 0 + type: INT + } + attribute { + name: "sparse" + i: 0 + type: INT + } + } + name: "torch-jit-export" + initializer { + dims: 10 + dims: 8 + data_type: FLOAT + name: "2" + raw_data: "\264\314\344\275\017A\376\276\313\374&>J\266a\277s\306\\=\212\032+?\211[t\275\344[\357\276Dk\\\276OKb?\234\'B\277A\334\274\2767N\257\276\320s\263\277\371+\244>:\314\202\277K\200L??\001\275\275\236u4\2774\032\315\277\214\004\224>Z\320\372>\267B\305\276\346G6\277N\265.\276\343\316\272\277t\364a>\201)|>p\223\251\277Qm2?\346\275)\277\354\235\233?\027X\277\277\253\206a?\354\335\226\277L\032o\277\251J\021\277\311\360\215\276\312\274\013\300\252\320\273>\220\"p?\267\020\000\222\233\314?\334\360?\275|t\303\277\214\351\000\300\3065\302\2775\206\306>X\251\227\277x\2160?U^\251?d\221\350?\237F.?\rp9?9X\004=/c\324\277SL\360\277\'\274\332\356\226\275\211\035\241>*\271\204\277>\025W>\036K\035?\036\233\200=\035\313\250\276\017\003\346\277\374p_?\313WD?!\006\351\275\232\\q\277\230\007A?" + } + input { + name: "0" + type { + tensor_type { + elem_type: INT64 + shape { + dim { + dim_value: 4 + } + } + } + } + } + input { + name: "1" + type { + tensor_type { + elem_type: INT64 + shape { + dim { + dim_value: 1 + } + } + } + } + } + input { + name: "2" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 10 + } + dim { + dim_value: 8 + } + } + } + } + } + output { + name: "3" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 8 + } + } + } + } + } +} +opset_import { + version: 9 +} diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index b996e49f7de853..2dfdd409a15ce7 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -426,7 +426,7 @@ def test_repeat_dim_overflow(self): def test_norm(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) - self.assertONNX(lambda x: x.norm(dim=2), (x)) + self.assertONNX(lambda x: x.norm(p=2, dim=2), (x)) def test_upsample(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) @@ -440,6 +440,12 @@ def test_batchnorm_noaffine(self): x = Variable(torch.randn(128, 128, 1, 1), requires_grad=True) self.assertONNX(nn.BatchNorm2d(128, affine=False), x) + def test_embedding_bags(self): + emb_bag = nn.EmbeddingBag(10, 8) + input = Variable(torch.LongTensor([1, 2, 3, 4])) + offset = Variable(torch.LongTensor([0])) + self.assertONNX(emb_bag, (input, offset)) + def test_symbolic_override(self): """Lifted from fast-neural-style: custom implementation of instance norm to be mapped to ONNX operator""" diff --git a/test/run_test.py b/test/run_test.py index 12b5a4bee9ae0b..ec2bbde115509c 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -27,7 +27,6 @@ 'distributions', 'indexing', 'jit', - 'legacy_nn', 'multiprocessing', 'nccl', 'nn', diff --git a/test/test_autograd.py b/test/test_autograd.py index 
8f0b2a99787084..16286b89cc451f 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1006,8 +1006,6 @@ def test_requires_grad_factory(self): def test_grad_assignment(self): x = torch.randn(5, 5) - a = torch.randn(2, 2) # size mismatch - b = Variable(torch.randn(5, 5).long()) # type mismatch with self.assertRaises(RuntimeError): x.grad = torch.randn(2, 2) @@ -1020,6 +1018,8 @@ def test_grad_assignment(self): raise unittest.SkipTest("CUDA not available") with self.assertRaises(RuntimeError): x.grad = Variable(torch.randn(5, 5).cuda()) + x = x.cuda().half() + x.grad = torch.zeros_like(x) # would raise an error unless sparse type is properly handled if torch.cuda.device_count() < 2: raise unittest.SkipTest("At least 2 CUDA devices needed") @@ -2995,16 +2995,28 @@ class dont_convert(tuple): ('zero_', (), NO_ARGS, 'scalar'), ('logsumexp', (S, S), (1,)), ('logsumexp', (), (0,), 'scalar'), - ('norm', (S, S), (2,)), + ('norm', (S, S), (), 'default'), + ('norm', (S, S), (2,), '2'), ('norm', (S, S), (0,), '0'), ('norm', (S, S), (0.5,), '0_5'), ('norm', (S, S), (1,), '1'), ('norm', (S, S), (3,), '3'), ('norm', (S, S), (inf,), 'inf'), + ('norm', (S, S), ('fro',), 'fro_default'), + ('norm', (S, S), ('fro', [0, 1],), 'fro'), + ('norm', (S, S), ('nuc',), 'nuc'), ('norm', (S, S), (-1,), 'neg_1'), + ('norm', (S, S), (-2,), 'neg_2'), ('norm', (S, S), (-0.5,), 'neg_0_5'), ('norm', (S, S), (-1.5,), 'neg_1_5'), - ('norm', torch.rand(S, S, S) + 5e-2, (1.5,), '1_5'), + ('norm', (S, S), (-2, 1,), 'neg_2_2_dim', [1]), + ('norm', (S, S), (-1, 1,), 'neg_1_2_dim', [1]), + ('norm', (S, S), (0, 1,), '0_2_dim', [1]), + ('norm', (S, S), (1, 1,), '1_2_dim', [1]), + ('norm', (S, S), (2, 1,), '2_2_dim', [1]), + ('norm', (S, S), (3, 1,), '3_2_dim', [1]), + ('norm', (S, S), (inf, 1,), 'inf_2_dim'), + ('norm', torch.rand(S, S, S) + 5e-2, (1.5,), '1_5_default'), ('norm', (S, S, S), (2, 1), '2_dim', [1]), ('norm', (S, S, S), (3, 1), '3_dim', [1]), ('norm', torch.rand(S, S, S) + 5e-2, (1.5, 1), '1_5_dim', [1]), diff --git a/test/test_c10d.py b/test/test_c10d.py index 0df1e3c749c6e6..aeb83501802862 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -1,6 +1,7 @@ import copy import math import multiprocessing +import os import sys import tempfile import time @@ -146,6 +147,77 @@ def test_unknown_handler(self): c10d.rendezvous('invalid://') +class RendezvousEnvTest(TestCase): + def test_common_errors(self): + vars = { + "WORLD_SIZE": "2", + "RANK": "0", + "MASTER_ADDR": "127.0.0.1", + "MASTER_PORT": common.find_free_port(), + } + + class Env(object): + def __init__(self, vars): + self.vars = vars + + def __enter__(self): + for key, value in self.vars.items(): + os.environ[key] = str(value) + + def __exit__(self, type, value, traceback): + for key in self.vars.keys(): + del os.environ[key] + + def without(d, key): + d = d.copy() + d.pop(key) + return d + + with Env(without(vars, 'WORLD_SIZE')): + with self.assertRaisesRegex(ValueError, 'WORLD_SIZE expected'): + gen = c10d.rendezvous('env://') + next(gen) + with Env(without(vars, 'RANK')): + with self.assertRaisesRegex(ValueError, 'RANK expected'): + gen = c10d.rendezvous('env://') + next(gen) + with Env(without(vars, 'MASTER_ADDR')): + with self.assertRaisesRegex(ValueError, 'MASTER_ADDR expected'): + gen = c10d.rendezvous('env://') + next(gen) + with Env(without(vars, 'MASTER_PORT')): + with self.assertRaisesRegex(ValueError, 'MASTER_PORT expected'): + gen = c10d.rendezvous('env://') + next(gen) + + def test_nominal(self): + os.environ['WORLD_SIZE'] = '2' + 
os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = str(common.find_free_port()) + + # First rank + os.environ['RANK'] = '0' + gen0 = c10d.rendezvous('env://') + store0, rank0, size0 = next(gen0) + self.assertEqual(0, rank0) + self.assertEqual(2, size0) + + # Second rank + os.environ['RANK'] = '1' + gen1 = c10d.rendezvous('env://') + store1, rank1, size1 = next(gen1) + self.assertEqual(1, rank1) + self.assertEqual(2, size1) + + # Set value on both stores + store0.set("key0", "value0") + store1.set("key1", "value1") + + # Cross check with get + self.assertEqual(b"value0", store1.get("key0")) + self.assertEqual(b"value1", store0.get("key1")) + + class RendezvousFileTest(TestCase): def test_common_errors(self): with self.assertRaisesRegex(ValueError, 'path missing'): diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index f24571e6aad899..3702205e4c4491 100755 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -1,6 +1,7 @@ import os -import unittest +import shutil import sys +import unittest import torch import torch.utils.cpp_extension @@ -23,6 +24,12 @@ class TestCppExtension(common.TestCase): + def setUp(self): + if sys.platform != 'win32': + default_build_root = torch.utils.cpp_extension.get_default_build_root() + if os.path.exists(default_build_root): + shutil.rmtree(default_build_root) + def test_extension_function(self): x = torch.randn(4, 4) y = torch.randn(4, 4) @@ -315,6 +322,25 @@ def test_half_support(self): result = module.half_test(x) self.assertEqual(result[0], 123) + def test_reload_jit_extension(self): + def compile(code): + return torch.utils.cpp_extension.load_inline( + name='reloaded_jit_extension', + cpp_sources=code, + functions='f', + verbose=True) + + module = compile('int f() { return 123; }') + self.assertEqual(module.f(), 123) + + module = compile('int f() { return 456; }') + self.assertEqual(module.f(), 456) + module = compile('int f() { return 456; }') + self.assertEqual(module.f(), 456) + + module = compile('int f() { return 789; }') + self.assertEqual(module.f(), 789) + if __name__ == '__main__': common.run_tests() diff --git a/test/test_cuda.py b/test/test_cuda.py index 45ca47aae600a6..a295874f3d9b6b 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1178,8 +1178,8 @@ def test_cat_empty(self): TestTorch._test_cat_empty(self, use_cuda=True) def test_bernoulli(self): - x = torch.tensor([0, 1], dtype=torch.float32, device='cuda') - self.assertEqual(x.bernoulli().tolist(), [0, 1]) + TestTorch._test_bernoulli(self, torch.double, 'cuda') + TestTorch._test_bernoulli(self, torch.half, 'cuda') def test_cat_bad_input_sizes(self): x = torch.randn(2, 1).cuda() diff --git a/test/test_distributed.py b/test/test_distributed.py index cc135c2ebec7a3..88eb4f063be5c9 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -947,18 +947,18 @@ def _test_barrier_helper(self, group, group_id, rank): self._barrier() - @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") + @unittest.skipIf(BACKEND == "nccl", "NCCL does not support barrier") def test_barrier(self): group, group_id, rank = self._init_global_test() self._test_barrier_helper(group, group_id, rank) @skip_if_small_worldsize - @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") + @unittest.skipIf(BACKEND == "nccl", "NCCL does not support barrier") def test_barrier_group(self): group, group_id, rank = self._init_group_test() self._test_barrier_helper(group, group_id, rank) - @unittest.skipIf(BACKEND != "mpi", "Only MPI 
supports barrier") + @unittest.skipIf(BACKEND == "nccl", "NCCL does not support barrier") def test_barrier_full_group(self): group, group_id, rank = self._init_full_group_test() self._test_barrier_helper(group, group_id, rank) diff --git a/test/test_indexing.py b/test/test_indexing.py index afe9e6d60c653c..9ac13d1b17c3ae 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -274,12 +274,14 @@ def test_invalid_index(self): self.assertRaisesRegex(TypeError, 'slice indices', lambda: x["0":"1"]) def test_zero_dim_index(self): - # We temporarily support indexing a zero-dim tensor as if it were - # a one-dim tensor to better maintain backwards compatibility. x = torch.tensor(10) - with warnings.catch_warnings(record=True) as w: - self.assertEqual(x, x[0]) - self.assertEqual(len(w), 1) + self.assertEqual(x, x.item()) + + def runner(): + print(x[0]) + return x[0] + + self.assertRaisesRegex(IndexError, 'invalid index', runner) # The tests below are from NumPy test_indexing.py with some modifications to diff --git a/test/test_jit.py b/test/test_jit.py index 97202239aeceee..aa04fcee02d6b7 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -366,7 +366,10 @@ def allSum(vs): return ge def assertAllFused(self, graph): - self.assertTrue(all(node.kind() in {'prim::Constant', 'prim::FusionGroup'} for node in graph.nodes())) + if [n.kind() for n in graph.nodes()] == ['prim::DifferentiableGraph']: + graph = next(graph.nodes()).g('Subgraph') + self.assertTrue(all(node.kind() in {'prim::Constant', 'prim::FusionGroup'} for node in graph.nodes()), + 'got {}'.format(graph)) self.assertTrue([node.kind() for node in graph.nodes()].count('prim::FusionGroup') == 1) def assertExportImport(self, trace, inputs): @@ -1898,6 +1901,59 @@ def random_bar(x): x = torch.rand(3, 4) self.assertEqual(random_bar(x), (x + 1)[0:1]) + def test_pretty_printer(self): + @torch.jit.script + def if_test(a, b): + # FIXME: use 0 instead of a. 
+ # c = 0 + c = a + if bool(a < b): + c = b + else: + c = a + return c + + @torch.jit.script + def if_one(a, b): + c = b + if bool(a < b): + c = a + return c + + @torch.jit.script + def while_test(a, i): + while bool(i < 3): + a *= a + i += 1 + return a + + @torch.jit.script + def while_if_test(a, b): + c = 0 + while bool(a < 10): + a = a + 1 + b = b + 1 + if bool(a > b): + c = 2 + else: + c = 3 + return a + 1 + + @torch.jit.script + def loop_use_test(y): + x = y + 1 + z = x + 5 + while bool(y < 8): + y += 1 + z = x + return x, z + + self.assertExpected(if_test.graph.pretty_print(), "if_test") + self.assertExpected(if_one.graph.pretty_print(), "if_one") + self.assertExpected(while_test.graph.pretty_print(), "while_test") + self.assertExpected(while_if_test.graph.pretty_print(), "while_if_test") + self.assertExpected(loop_use_test.graph.pretty_print(), "loop_use_test") + class TestBatched(TestCase): # generate random examples and create an batchtensor with them @@ -2638,6 +2694,23 @@ def func(a, b): b = torch.rand(1, requires_grad=True) self.checkScript(func, (a, b), optimize=True) + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + @skipIfRocm + def test_clamp_fusion(self): + def func(a, b): + return torch.clamp(a + b, min=0, max=2) + + a = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True) + b = torch.randn(4, 4, dtype=torch.float, device='cuda') + + s = self.checkScript(func, (a, b)) + self.assertAllFused(s.graph_for(a, b)) + + c = s(a, b) + c.sum().backward() + self.assertAllFused(backward_graph(s)) + def test_mul(self): def func(a, b): return a * b @@ -3409,7 +3482,7 @@ def test_fuser_multiple_blocks(this, that, theother, meme): self.assertEqual(cu.test_fuser_multiple_blocks(*inputs), outputs) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") - @unittest.skip("this test is flaky, see #11360") + @enable_cpu_fuser def test_scalar_fusion(self): def fn(x, y): return x + y.type_as(x) @@ -7575,6 +7648,12 @@ def forward(self, x, y): 'test_var_dim_1d', 'test_var_dim_1d_neg0', 'test_var_dim_neg0', + 'test_norm_inf', + 'test_norm_inf_2_dim', + 'test_norm_fro', + 'test_norm_fro_default', + 'test_norm_nuc', + 'test_renorm_norm_inf', 'test_matrix_power_n=-1', # involves inverse 'test_matrix_power_n=-3', # involves inverse # skipped nn functional tests diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py deleted file mode 100644 index f40dae6a32072f..00000000000000 --- a/test/test_legacy_nn.py +++ /dev/null @@ -1,1376 +0,0 @@ -import math -import random -import unittest -from copy import deepcopy - -import torch -from torch._six import container_abcs -import torch.legacy.nn as nn -from common import to_gpu, freeze_rng_state, run_tests, skipIfRocm, TEST_WITH_ROCM -from common_nn import NNTestCase, ModuleTest, CriterionTest, iter_tensors, \ - module_tests, criterion_tests, PRECISION -from torch.autograd.gradcheck import get_numerical_jacobian -from torch.autograd import Variable - - -class OldModuleTest(ModuleTest): - - def __init__(self, *args, **kwargs): - super(OldModuleTest, self).__init__(*args, **kwargs) - self.check_inplace = kwargs.get('check_inplace', False) - # Never check gradgrad for legacy NN - self.check_gradgrad = False - - def _do_test(self, test_case, module, input): - # TODO: check update parameters - # TODO: test IO - module.training() - with torch.no_grad(): - test_case.check_jacobian(module, input, self.jacobian_input) - module.evaluate() - with torch.no_grad(): - 
test_case.check_jacobian(module, input, self.jacobian_input) - - # Test .type() - module.float().double().forward(input) - - # Test .clearState() - module.clearState() - - # test if module can be printed - module.__repr__() - - if self.check_inplace: - input2 = deepcopy(input) - module_ip = self.constructor(*self.constructor_args, inplace=True) - with freeze_rng_state(): - output = module.forward(input) - test_case.assertEqual(input, input2) - with freeze_rng_state(): - output2 = module_ip.forward(input2) - if not torch.equal(output, input): - test_case.assertNotEqual(input, input2) - test_case.assertEqual(output, input2) - -# TODO: hessian tests -tests = [ - OldModuleTest(nn.Add, - constructor_args=(torch.Size([5, 4]),), - input_size=(3, 5, 4), - desc='3D'), - OldModuleTest(nn.Add, - constructor_args=(1, True), - input_size=(3, 1, 4), - desc='scalar'), - OldModuleTest(nn.AddConstant, - constructor_args=(3.5,), - input_size=(3, 5, 4), - reference_fn=lambda i, _: i + 3.5, - check_inplace=True, - ), - OldModuleTest(nn.BatchNormalization, - constructor_args=(10,), - input_size=(4, 10), - desc='affine', - test_cuda=(not TEST_WITH_ROCM)), - OldModuleTest(nn.BatchNormalization, - constructor_args=(10, 1e-3, 0.3, False), - input_size=(4, 10), - desc='not_affine', - test_cuda=(not TEST_WITH_ROCM)), - OldModuleTest(nn.SpatialBatchNormalization, - constructor_args=(3,), - input_size=(2, 3, 6, 6), - ), - OldModuleTest(nn.SpatialBatchNormalization, - constructor_args=(3, 1e-3, 0.8), - input_size=(2, 3, 6, 6), - desc='momentum', - ), - OldModuleTest(nn.SpatialBatchNormalization, - constructor_args=(3, 1e-3, 0.8, False), - input_size=(2, 3, 6, 6), - desc='no_affine'), - OldModuleTest(nn.VolumetricBatchNormalization, - constructor_args=(3,), - input_size=(2, 3, 4, 4, 4), - ), - OldModuleTest(nn.VolumetricBatchNormalization, - constructor_args=(3, 1e-3, 0.7), - input_size=(2, 3, 4, 4, 4), - desc='momentum', - ), - OldModuleTest(nn.VolumetricBatchNormalization, - constructor_args=(3, 1e-3, 0.7, False), - input_size=(2, 3, 4, 4, 4), - desc='no_affine'), - OldModuleTest(nn.CMul, - constructor_args=(5, 6), - input_size=(10, 5, 6), - desc='3D', - ), - OldModuleTest(nn.CMul, - constructor_args=(50, 4), - input_size=(1, 50, 4), - desc='3D_single_example', - ), - OldModuleTest(nn.CMul, - constructor_args=(1, 5), - input_fn=lambda: torch.randn(10, 3, 5)[:, 1], - desc='3D_noncontiguous', - ), - OldModuleTest(nn.Exp, - input_size=(2, 3, 4), - reference_fn=lambda i, _: i.exp(), - ), - OldModuleTest(nn.Log, - input_fn=lambda: torch.rand(2, 3, 2) + 0.1, - reference_fn=lambda i, _: i.log(), - ), - OldModuleTest(nn.Clamp, - constructor_args=(-2., 5.), - input_fn=lambda: torch.randn(3, 2, 50) * 6, - reference_fn=lambda i, _: i.clamp(-2, 5)), - OldModuleTest(nn.Abs, - input_size=(3, 20, 5), - reference_fn=lambda i, _: i.abs(), - ), - OldModuleTest(nn.Bilinear, - constructor_args=(2, 3, 10), - input_size=[(4, 2), (4, 3)], - ), - OldModuleTest(nn.Bilinear, - constructor_args=(5, 4, 2), - input_size=[(2, 5), (2, 4)], - desc='small_output', - ), - OldModuleTest(nn.Euclidean, - constructor_args=(5, 7), - input_size=(10, 5), - ), - OldModuleTest(nn.WeightedEuclidean, - constructor_args=(5, 7), - input_size=(10, 5), - ), - OldModuleTest(nn.Cosine, - constructor_args=(5, 7), - input_size=(10, 5), - ), - OldModuleTest(nn.CAddTable, - input_size=[(5, 7), (5, 7)], - ), - OldModuleTest(nn.CSubTable, - input_size=[(5, 7), (5, 7)], - ), - OldModuleTest(nn.CDivTable, - input_fn=lambda: [torch.randn(1, 7), torch.rand(1, 7) + 0.1], - ), - 
OldModuleTest(nn.CMulTable, - input_size=[(5, 7), (5, 7)], - ), - OldModuleTest(nn.Square, - input_size=(10, 2, 4), - reference_fn=lambda i, _: i.mul(i)), - OldModuleTest(nn.Sqrt, - input_fn=lambda: torch.rand(10, 2, 4) + 0.01, - reference_fn=lambda i, _: i.sqrt()), - OldModuleTest(nn.Squeeze, - input_size=(2, 1, 1, 4, 5), - reference_fn=lambda i, _: i.squeeze()), - OldModuleTest(nn.Squeeze, - constructor_args=(1,), - input_size=(2, 1, 1, 4, 5), - reference_fn=lambda i, _: i.squeeze(1), - desc='dim'), - OldModuleTest(nn.Unsqueeze, - constructor_args=(1,), - input_size=(2, 4, 5), - reference_fn=lambda i, _: i.view(2, 1, 4, 5)), - OldModuleTest(nn.Unsqueeze, - constructor_args=(0,), - input_size=(2, 4, 5), - reference_fn=lambda i, _: i.view(1, 2, 4, 5), - desc='fist_dim'), - OldModuleTest(nn.Unsqueeze, - constructor_args=(3,), - input_size=(2, 4, 5), - reference_fn=lambda i, _: i.view(2, 4, 5, 1), - desc='last_dim'), - OldModuleTest(nn.View, - constructor_args=(-1, 2, 20), - input_size=(2, 2, 4, 5), - reference_fn=lambda i, _: i.view(-1, 2, 20), - desc='infer_batch'), - OldModuleTest(nn.View, - constructor_args=(2, 2, 2, 5), - input_size=(2, 4, 5), - reference_fn=lambda i, _: i.view(2, 2, 2, 5), - desc='split_dim'), - OldModuleTest(nn.View, - constructor_args=(2, -1, 2, 5), - input_size=(2, 4, 5), - reference_fn=lambda i, _: i.view(2, -1, 2, 5), - desc='infer_middle'), - OldModuleTest(nn.Sum, - constructor_args=(1,), - input_size=(2, 4, 5), - reference_fn=lambda i, _: i.sum(1, keepdim=False), - ), - OldModuleTest(nn.Sum, - constructor_args=(1, True), - input_size=(2, 4, 5), - reference_fn=lambda i, _: i.sum(1, keepdim=False).div(i.size(1)), - desc='sizeAverage', - ), - OldModuleTest(nn.Mean, - constructor_args=(1,), - input_size=(2, 4, 5), - reference_fn=lambda i, _: torch.mean(i, 1, keepdim=False), - ), - OldModuleTest(lambda: nn.Sequential().add(nn.GradientReversal()).add(nn.GradientReversal()), - input_size=(4, 3, 2, 2), - fullname='GradientReversal', - ), - OldModuleTest(nn.Identity, - input_size=(4, 3, 2, 4), - reference_fn=lambda i, _: i), - OldModuleTest(nn.DotProduct, - input_size=[(10, 4), (10, 4)], - reference_fn=lambda i, _: torch.Tensor(list( - a.dot(b) for a, b in zip(i[0], i[1]))), - ), - OldModuleTest(nn.CosineDistance, - input_size=[(10, 4), (10, 4)], - reference_fn=lambda i, _: torch.Tensor(list( - a.dot(b) / (a.norm(2) * b.norm(2)) for a, b in zip(i[0], i[1]))), - ), - OldModuleTest(nn.JoinTable, - constructor_args=(0,), - input_size=[(10, 4), (10, 4)], - reference_fn=lambda i, _: torch.cat(i, 0), - desc='first_dim'), - OldModuleTest(nn.JoinTable, - constructor_args=(2,), - input_size=[(2, 4, 2), (2, 4, 2)], - reference_fn=lambda i, _: torch.cat(i, 2), - desc='positive_dim_index'), - OldModuleTest(nn.JoinTable, - constructor_args=(-1,), - input_size=[(2, 4, 2, 4), (2, 4, 2, 4)], - reference_fn=lambda i, _: torch.cat(i, 3), - desc='negative_dim_index'), - OldModuleTest(nn.MM, - input_size=[(4, 5, 3), (4, 3, 2)], - reference_fn=lambda i, _: torch.bmm(*i)), - OldModuleTest(nn.MV, - input_size=[(4, 5, 3), (4, 3)], - reference_fn=lambda i, _: torch.bmm(i[0], i[1].view(i[1].size(0), i[1].size(1), 1)).squeeze()), - OldModuleTest(nn.Max, - input_size=(4, 5, 3), - reference_fn=lambda i, _: torch.max(i, 0, False)[0]), - OldModuleTest(nn.Max, - constructor_args=(1,), - input_size=(4, 5, 3), - reference_fn=lambda i, _: torch.max(i, 1, False)[0], - desc='with_dimension'), - OldModuleTest(nn.Min, - input_size=(4, 5, 3), - reference_fn=lambda i, _: torch.min(i, 0, False)[0]), - 
OldModuleTest(nn.Min, - constructor_args=(1,), - input_size=(4, 5, 3), - reference_fn=lambda i, _: torch.min(i, 1, False)[0], - desc='with_dimension'), - OldModuleTest(nn.MixtureTable, - input_size=[(5, 3), (5, 3, 6)], - ), - OldModuleTest(nn.LookupTable, - constructor_args=(4, 3), - input_fn=lambda: torch.randperm(2).repeat(1, 2), - jacobian_input=False, - test_cuda=(not TEST_WITH_ROCM)), - OldModuleTest(nn.Mul, - input_size=(2, 3, 4, 2), - reference_fn=lambda i, p: i * p[0][0], - ), - OldModuleTest(nn.MulConstant, - constructor_args=(4,), - input_size=(2, 3, 4, 2), - reference_fn=lambda i, _: i * 4, - check_inplace=True, - ), - OldModuleTest(nn.Narrow, - constructor_args=(0, 0), - input_size=(2, 3, 4, 2), - reference_fn=lambda i, _: i.narrow(0, 0, 1)), - OldModuleTest(nn.Narrow, - constructor_args=(1, 1, 2), - input_size=(2, 3, 4, 2), - reference_fn=lambda i, _: i.narrow(1, 1, 2), - desc='length'), - OldModuleTest(nn.Transpose, - constructor_args=((1, 2), (1, 3)), - input_size=(2, 3, 4, 5), - reference_fn=lambda i, _: i.transpose(1, 2).transpose(1, 3)), - OldModuleTest(nn.Transpose, - constructor_args=((1, 2),), - input_size=(2, 3, 4, 5), - reference_fn=lambda i, _: i.transpose(1, 2), - desc='single_arg'), - # TODO: this seems to be very slow - OldModuleTest(nn.Replicate, - constructor_args=(2, 1), - input_size=(10, 3, 4, 5), - reference_fn=lambda i, _: i.view(10, 1, 3, 4, 5).expand(10, 2, 3, 4, 5), - ), - OldModuleTest(nn.Padding, - constructor_args=(0, 2, -10), - input_size=(2, 3, 4, 5)), - OldModuleTest(nn.Padding, - constructor_args=(0, 2, -10, 1), - input_size=(2, 3, 4, 5), - desc='index'), - OldModuleTest(nn.Padding, - constructor_args=(0, -2, -10, 1), - input_size=(2, 3, 4, 5), - desc='negative_pad'), - OldModuleTest(nn.PartialLinear, - constructor_args=(5, 6), - input_size=(4, 5), - test_cuda=(not TEST_WITH_ROCM)), - OldModuleTest(lambda: nn.PartialLinear(5, 6).setPartition(torch.Tensor((2, 4))), - input_size=(4, 5), - fullname='PartialLinear_setPartition', - test_cuda=(not TEST_WITH_ROCM)), - OldModuleTest(nn.Power, - constructor_args=(2,), - input_size=(2, 3, 4, 5), - ), - OldModuleTest(nn.Power, - constructor_args=(1.5,), - input_fn=lambda: torch.rand(3, 4, 5), - desc='fractional', - ), - OldModuleTest(nn.Reshape, - constructor_args=(4, 5), - input_size=(3, 4 * 5), - desc='add_dim'), - OldModuleTest(nn.Reshape, - constructor_args=(4 * 5,), - input_size=(3, 4, 5), - desc='squash_dim'), - OldModuleTest(nn.Select, - constructor_args=(1, 2), - input_size=(3, 4, 5), - reference_fn=lambda i, _: i.select(1, 2)), - OldModuleTest(nn.SelectTable, - constructor_args=(1,), - input_size=[(1,), (2,), (3,), (4,)], - reference_fn=lambda i, _: i[1]), - OldModuleTest(nn.SpatialAveragePooling, - constructor_args=(2, 2), - input_size=(2, 3, 6, 6)), - OldModuleTest(nn.SpatialAveragePooling, - constructor_args=(2, 2, 2, 2), - input_size=(2, 3, 6, 6), - desc='stride'), - OldModuleTest(nn.SpatialAveragePooling, - constructor_args=(2, 2, 2, 2, 1, 1), - input_size=(2, 3, 6, 6), - desc='stride_pad'), - OldModuleTest(nn.SpatialAdaptiveMaxPooling, - constructor_args=(4, 4), - input_size=(2, 3, 8, 8), - reference_fn=lambda i, _: nn.SpatialMaxPooling(2, 2).forward(i)), - OldModuleTest(nn.SpatialAdaptiveMaxPooling, - constructor_args=(4, 4), - input_size=(2, 3, 7, 11), - desc='irregular'), - OldModuleTest(nn.SpatialConvolution, - constructor_args=(3, 4, 3, 3), - input_size=(2, 3, 6, 6)), - OldModuleTest(nn.SpatialConvolution, - constructor_args=(3, 4, 3, 3, 2, 2), - input_size=(2, 3, 6, 6), - 
desc='strided'), - OldModuleTest(nn.SpatialConvolution, - constructor_args=(3, 4, 3, 3, 2, 2, 1, 1), - input_size=(2, 3, 6, 6), - desc='padding'), - OldModuleTest(nn.SpatialConvolutionLocal, - constructor_args=(3, 2, 4, 4, 2, 2), - input_size=(1, 3, 4, 4)), - OldModuleTest(nn.SpatialConvolutionLocal, - constructor_args=(3, 2, 6, 6, 2, 2, 2, 2), - input_size=(2, 3, 6, 6), - desc='stride'), - OldModuleTest(nn.SpatialConvolutionLocal, - constructor_args=(3, 2, 6, 6, 2, 2, 2, 2, 1, 1), - input_size=(2, 3, 6, 6), - desc='stride_pad'), - OldModuleTest(nn.SpatialDivisiveNormalization, - constructor_args=(3,), - input_size=(2, 3, 8, 8), - ), - OldModuleTest(nn.SpatialContrastiveNormalization, - constructor_args=(3,), - input_size=(2, 3, 8, 8), - ), - OldModuleTest(nn.SpatialDilatedConvolution, - constructor_args=(3, 2, 3, 3, 2, 2, 1, 1, 2, 2), - input_size=(2, 3, 8, 8)), - OldModuleTest(nn.SpatialDilatedConvolution, - constructor_args=(3, 2, 3, 3, 2, 2, 1, 1, 2, 2), - input_size=(2, 3, 8, 8), - desc='stride_pad'), - OldModuleTest(nn.SpatialMaxPooling, - constructor_args=(3, 3, 2, 2, 1, 1), - input_size=(1, 3, 7, 7)), - OldModuleTest(nn.SpatialReflectionPadding, - constructor_args=(1, 2, 3, 4), - input_size=(2, 3, 8, 8)), - OldModuleTest(nn.SpatialReplicationPadding, - constructor_args=(1, 2, 3, 4), - input_size=(2, 3, 4, 4)), - OldModuleTest(nn.SpatialZeroPadding, - constructor_args=(1, 2, 3, 4), - input_size=(2, 3, 4, 4)), - OldModuleTest(nn.SpatialConvolutionMap, - constructor_args=(nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3), - input_size=(3, 5, 5), - desc='oneToOne'), - OldModuleTest(nn.SpatialConvolutionMap, - constructor_args=(nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3, 2, 2), - input_size=(3, 5, 5), - desc='oneToOne_stride'), - OldModuleTest(nn.SpatialConvolutionMap, - constructor_args=(nn.SpatialConvolutionMap.maps.full(3, 4), 3, 3), - input_size=(3, 5, 5), - desc='full'), - OldModuleTest(nn.SpatialFullConvolutionMap, - constructor_args=(nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3), - input_size=(3, 5, 5), - desc='oneToOne'), - OldModuleTest(nn.SpatialFullConvolutionMap, - constructor_args=(nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3, 2, 2), - input_size=(3, 5, 5), - desc='oneToOne_stride'), - OldModuleTest(nn.SpatialFullConvolutionMap, - constructor_args=(nn.SpatialConvolutionMap.maps.full(3, 4), 3, 3), - input_size=(3, 5, 5), - desc='full'), - # TODO: test CUDA - OldModuleTest(lambda: nn.SpatialFractionalMaxPooling(2, 2, 0.5, 0.5).fixPoolingRegions(), - input_size=(1, 3, 5, 5), - fullname='SpatialFractionalMaxPooling_ratio', - test_cuda=False), - OldModuleTest(lambda: nn.SpatialFractionalMaxPooling(2, 2, 4, 4).fixPoolingRegions(), - input_size=(1, 3, 7, 7), - fullname='SpatialFractionalMaxPooling_size', - test_cuda=False), - OldModuleTest(nn.SpatialFullConvolution, - constructor_args=(3, 4, 3, 3, 2, 2, 1, 1, 1, 1), - input_size=(1, 3, 7, 7)), - OldModuleTest(nn.SpatialLPPooling, - constructor_args=(3, 2, 2, 2, 2, 2), - input_size=(1, 3, 7, 7), - ), - OldModuleTest(nn.SpatialSubSampling, - constructor_args=(3, 3, 3, 2, 2), - input_size=(1, 3, 7, 7)), - OldModuleTest(nn.SpatialSubtractiveNormalization, - constructor_args=(3,), - input_size=(1, 3, 7, 7), - ), - OldModuleTest(nn.SpatialSubtractiveNormalization, - constructor_args=(3, torch.rand(3)), - input_size=(1, 3, 7, 7), - desc='kernel'), - OldModuleTest(nn.SpatialUpSamplingNearest, - constructor_args=(2,), - input_size=(1, 3, 4, 4)), - - OldModuleTest(nn.TemporalConvolution, - constructor_args=(4, 5, 3), - 
input_size=(2, 10, 4)), - OldModuleTest(nn.TemporalConvolution, - constructor_args=(4, 5, 3, 2), - input_size=(2, 10, 4), - desc='stride'), - # TODO: this runs in non-batch mode only - OldModuleTest(nn.TemporalSubSampling, - constructor_args=(4, 3), - input_size=(10, 4)), - OldModuleTest(nn.TemporalSubSampling, - constructor_args=(4, 3, 2), - input_size=(10, 4), - desc='stride'), - - OldModuleTest(nn.VolumetricAveragePooling, - constructor_args=(2, 2, 2), - input_size=(2, 3, 4, 4, 4)), - OldModuleTest(nn.VolumetricAveragePooling, - constructor_args=(2, 2, 2, 2, 2, 2), - input_size=(2, 3, 5, 5, 5), - desc='stride'), - OldModuleTest(nn.VolumetricAveragePooling, - constructor_args=(2, 2, 2, 2, 2, 2, 1, 1, 1), - input_size=(2, 3, 5, 5, 5), - desc='stride_pad'), - OldModuleTest(nn.VolumetricConvolution, - constructor_args=(3, 4, 2, 2, 2), - input_size=(2, 3, 3, 3, 3)), - OldModuleTest(nn.VolumetricConvolution, - constructor_args=(3, 4, 2, 2, 2, 2, 2, 2), - input_size=(2, 3, 5, 5, 5), - desc='stride'), - OldModuleTest(nn.VolumetricConvolution, - constructor_args=(3, 4, 2, 2, 2, 2, 2, 2, 1, 1, 1), - input_size=(2, 3, 5, 5, 5), - desc='stride_padding'), - OldModuleTest(nn.VolumetricFullConvolution, - constructor_args=(2, 3, 2, 2, 2), - input_size=(1, 2, 4, 4, 4)), - OldModuleTest(nn.VolumetricMaxPooling, - constructor_args=(2, 2, 2), - input_fn=lambda: (torch.randn(2, 3, 5, 5, 5) * 1000)), - OldModuleTest(nn.VolumetricMaxPooling, - constructor_args=(2, 2, 2, 2, 2, 2), - input_fn=lambda: (torch.randn(2, 3, 5, 5, 5) * 1000), - desc='stride'), - OldModuleTest(nn.VolumetricMaxPooling, - constructor_args=(2, 2, 2, 2, 2, 2, 1, 1, 1), - input_fn=lambda: (torch.randn(2, 3, 5, 5, 5) * 1000), - desc='stride_padding'), - OldModuleTest(nn.VolumetricReplicationPadding, - constructor_args=(1, 2, 3, 4, 5, 6), - input_size=(2, 3, 5, 5, 5)), - - CriterionTest(nn.L1Cost, - input_size=(2, 3, 4, 5), - target=None), - CriterionTest(nn.L1HingeEmbeddingCriterion, - input_size=[(2, 3, 4, 5), (2, 3, 4, 5)], - target=1), - CriterionTest(nn.L1HingeEmbeddingCriterion, - constructor_args=(2,), - input_size=[(2, 3, 4, 5), (2, 3, 4, 5)], - target=1, - desc='margin'), - CriterionTest(nn.WeightedMSECriterion, - constructor_args_fn=lambda: (torch.rand(3, 4, 5),), - input_size=(2, 3, 4, 5), - target_size=(2, 3, 4, 5), - ), - CriterionTest(nn.MarginCriterion, - input_size=(5, 10), - target_fn=lambda: torch.randn(5, 10).sign()), - CriterionTest(nn.ClassSimplexCriterion, - constructor_args=(30,), - input_fn=lambda: torch.randn(5, 30).mul(10).renorm(2, 0, 1), - target_fn=lambda: torch.rand(5).mul(30).floor().long(), - desc='margin'), -] -# TODO: FlattenTable gradient -# TODO: NarrowTable gradient -# TODO: CriterionTable -# TODO: MultiCriterion -# TODO: SplitTable - -for p in (1, 2, 1.5): - tests.append( - OldModuleTest(nn.Normalize, - constructor_args=(p,), - input_size=(4, 5), - # Eh, we need to use p as a default, so it's passed by value - reference_fn=lambda i, _, p=p: i.div(i.norm(p, 1, True).expand_as(i)), - desc=str(p), - ), - ) -for p in range(1, 4 + 1): - tests.append( - OldModuleTest(nn.PairwiseDistance, - constructor_args=(p,), - input_size=[(4, 10), (4, 10)], - desc=str(p), - ) - ) - - -def build_spatial_unpooling_net(): - pool = nn.SpatialMaxPooling(2, 2, 2, 2) - unpool = nn.SpatialMaxUnpooling(pool) - return nn.Sequential().add(pool).add(unpool) - -tests.append( - OldModuleTest(build_spatial_unpooling_net, - input_size=(1, 3, 10, 10), - desc='SpatialMaxUnpooling') -) - - -def build_volumetric_unpooling_net(): - pool = 
nn.VolumetricMaxPooling(2, 2, 2, 2) - unpool = nn.VolumetricMaxUnpooling(pool) - return nn.Sequential().add(pool).add(unpool) - -tests.append( - OldModuleTest(build_volumetric_unpooling_net, - input_size=(1, 3, 10, 10), - desc='VolumetricMaxUnpooling') -) - - -def prepare_tests(): - def add_test(test): - test_name = test.get_name() - cuda_test_name = test_name + '_cuda' - if hasattr(TestNN, test_name): - raise RuntimeError('Found two tests with the same name: ' + test_name) - if hasattr(TestNN, cuda_test_name): - raise RuntimeError('Found two tests with the same name: ' + cuda_test_name) - setattr(TestNN, test_name, lambda self, test=test: test(self)) - setattr(TestNN, cuda_test_name, lambda self, test=test: test.test_cuda(self)) - name_remap = { - 'Conv2d': 'SpatialConvolution', - 'MaxPool2d': 'SpatialMaxPooling', - 'AvgPool2d': 'SpatialAveragePooling', - 'Softmax': 'SoftMax', - 'Softmax2d': 'SpatialSoftMax', - 'LogSoftmax': 'LogSoftMax', - 'BatchNorm1d': 'BatchNormalization', - 'BatchNorm2d': 'SpatialBatchNormalization', - 'BatchNorm3d': 'VolumetricBatchNormalization', - 'Hardtanh': 'HardTanh', - 'Hardshrink': 'HardShrink', - 'Softplus': 'SoftPlus', - 'Softshrink': 'SoftShrink', - 'Softsign': 'SoftSign', - 'Softmin': 'SoftMin', - 'Tanhshrink': 'TanhShrink', - 'CrossMapLRN2d': 'SpatialCrossMapLRN', - 'L1Loss': 'AbsCriterion', - 'NLLLoss': 'ClassNLLCriterion', - 'NLLLoss2d': 'SpatialClassNLLCriterion', - 'KLDivLoss': 'DistKLDivCriterion', - } - for test in tests: - name = test.get_name() - if ((name == "test_Max" or name == "test_Min" or name == "test_Max_with_dimension" or - name == "test_Min_with_dimension") and TEST_WITH_ROCM): - continue - add_test(test) - for test_params in module_tests: - test_params = deepcopy(test_params) - name = test_params.pop('module_name') - name = name_remap.get(name, name) - # hardshrink is deprecated in nn - if name == "HardShrink": - continue - - test_params['constructor'] = getattr(nn, name) - test = OldModuleTest(**test_params) - add_test(test) - for test_params in criterion_tests: - test_params = deepcopy(test_params) - name = test_params.pop('module_name') - name = name_remap.get(name, name.replace('Loss', 'Criterion')) - # hardshrink is deprecated in nn - if name == "HardShrink": - continue - - # nn.NLLLoss2d is deprecated, but there is a NLLLoss test for 2d - if name == 'ClassNLLCriterion' and 'desc' in test_params.keys() and '2d' in test_params['desc']: - name = 'SpatialClassNLLCriterion' - - test_params['constructor'] = getattr(nn, name) - - # If legacy constructor args are specified, use them instead - legacy_args = test_params.pop('legacy_constructor_args', None) - if legacy_args is not None: - test_params['constructor_args'] = legacy_args - - test = CriterionTest(**test_params) - add_test(test) - - -def require_grad(input): - if isinstance(input, torch.Tensor): - input = input.detach() - input.requires_grad = True - return input - elif isinstance(input, container_abcs.Iterable): - return type(input)(require_grad(e) for e in input) - return input - - -class TestNN(NNTestCase): - _do_cuda_memory_leak_check = True - - def _numerical_jacobian(self, module, input, jacobian_input=True, jacobian_parameters=True): - def fw(input): - out = self._forward(module, input) - if isinstance(out, Variable): - return out.data - return out - - res = tuple() - if jacobian_input: - input = require_grad(input) - res += get_numerical_jacobian(fw, input, eps=1e-6), - if jacobian_parameters: - params, _ = self._get_parameters(module) - jacobians = [] - for p in params: 
- p = p.detach() - p.requires_grad = True - jacobians.append(get_numerical_jacobian(fw, input, p, eps=1e-6)) - res += torch.cat(jacobians, 0), - return res - - def _forward(self, module, input): - with freeze_rng_state(): - with torch.no_grad(): - return module.forward(input) - - def _backward(self, module, input, output, grad_output, create_graph=False): - if isinstance(input, Variable): - input = input.data - - return module.backward(input, grad_output) - - def _forward_criterion(self, criterion, input, target, extra_args=None): - if extra_args is None: - extra_args = tuple() - with torch.no_grad(): - return criterion.forward(input, target, *extra_args) - - def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): - if extra_args is None: - extra_args = tuple() - # Ignore gradOutput. It's used for non-legacy tests. - with torch.no_grad(): - return criterion.backward(input, target, *extra_args) - - def _zero_grad_parameters(self, module): - return module.zeroGradParameters() - - def _get_parameters(self, module): - return module.parameters() or ([], []) - - def test_Dropout(self): - p = 0.2 - input = torch.Tensor(1000).fill_(1 - p) - - module = nn.Dropout(p) - output = module.forward(input) - self.assertLess(abs(output.mean() - (1 - p)), 0.05) - gradInput = module.backward(input, input) - self.assertLess(abs(gradInput.mean() - (1 - p)), 0.05) - - module = nn.Dropout(p, True) - output = module.forward(input.clone()) - self.assertLess(abs(output.mean() - (1 - p)), 0.05) - gradInput = module.backward(input.clone(), input.clone()) - self.assertLess(abs(gradInput.mean() - (1 - p)), 0.05) - - # Check that these don't raise errors - module.__repr__() - str(module) - - def test_SpatialDropout(self): - p = 0.2 - b = random.randint(1, 5) - w = random.randint(1, 5) - h = random.randint(1, 5) - nfeats = 1000 - input = torch.Tensor(b, nfeats, w, h).fill_(1) - module = nn.SpatialDropout(p) - module.training() - output = module.forward(input) - self.assertLess(abs(output.mean() - (1 - p)), 0.05) - gradInput = module.backward(input, input) - self.assertLess(abs(gradInput.mean() - (1 - p)), 0.05) - - # Check that these don't raise errors - module.__repr__() - str(module) - - def test_VolumetricDropout(self): - p = 0.2 - bsz = random.randint(1, 5) - t = random.randint(1, 5) - w = random.randint(1, 5) - h = random.randint(1, 5) - nfeats = 1000 - input = torch.Tensor(bsz, nfeats, t, w, h).fill_(1) - module = nn.VolumetricDropout(p) - module.training() - output = module.forward(input) - self.assertLess(abs(output.mean() - (1 - p)), 0.05) - gradInput = module.backward(input, input) - self.assertLess(abs(gradInput.mean() - (1 - p)), 0.05) - - # Check that these don't raise errors - module.__repr__() - str(module) - - def test_ReLU_reference(self): - input = torch.randn(10, 20) - module = nn.ReLU() - output = module.forward(input) - self.assertTrue(output[input.ge(0)].eq(input[input.gt(0)]).all()) - self.assertTrue(output[input.lt(0)].eq(0).all()) - - def test_ReLU6_reference(self): - input = torch.randn(10, 20).mul(10) - module = nn.ReLU6() - output = module.forward(input) - self.assertTrue(output[input.ge(6)].eq(6).all()) - self.assertTrue(output[input.lt(0)].eq(0).all()) - - def test_Copy(self): - input = torch.randn(3, 4).double() - c = nn.Copy(torch.DoubleTensor, torch.FloatTensor) - output = c.forward(input) - self.assertIsInstance(output, torch.FloatTensor) - self.assertEqual(output, input.float(), 1e-6) - gradInput = c.backward(input, output.fill_(1)) - 
self.assertIsInstance(gradInput, torch.DoubleTensor) - self.assertEqual(gradInput, output.double(), 1e-6) - c.dontCast = True - c.double() - self.assertIsInstance(output, torch.FloatTensor) - - # Check that these don't raise errors - c.__repr__() - str(c) - - def test_FlattenTable(self): - input = [ - torch.rand(1), - [ - torch.rand(2), - [ - torch.rand(3) - ], - ], - torch.rand(4) - ] - gradOutput = [ - torch.rand(1), - torch.rand(2), - torch.rand(3), - torch.rand(4) - ] - - m = nn.FlattenTable() - output = m.forward(input) - self.assertEqual(len(output), 4) - self.assertEqual(output[0], input[0]) - self.assertEqual(output[1], input[1][0]) - self.assertEqual(output[2], input[1][1][0]) - self.assertEqual(output[3], input[2]) - - gradInput = m.backward(input, gradOutput) - self.assertEqual(gradOutput[0], gradInput[0]) - self.assertEqual(gradOutput[1], gradInput[1][0]) - self.assertEqual(gradOutput[2], gradInput[1][1][0]) - self.assertEqual(gradOutput[3], gradInput[2]) - - # Check that these don't raise errors - m.__repr__() - str(m) - - # More uglyness: FlattenTable doesn't rebuild the table every updateOutput - # call, so we need to make sure that modifications to the input are - # detected correctly (and that the table is correctly rebuilt. - # CASE 1: Nothing changes so the output table shouldn't be redefined - old_input_map = m.input_map - old_output = m.output - m.forward(input) - self.assertEqual(old_input_map, m.input_map) - self.assertEqual(old_output, m.output) - - # CASE 2: An element is added to the input table - old_input_map = m.input_map - old_output = m.output - input[1].append(torch.rand(5)) - m.forward(input) - self.assertNotEqual(old_input_map, m.input_map) - self.assertNotEqual(old_output, m.output) - - # CASE 3: An element is removed from the input table - old_input_map = m.input_map - old_output = m.output - input.pop() - m.forward(input) - self.assertNotEqual(old_input_map, m.input_map) - self.assertNotEqual(old_output, m.output) - - def test_Concat(self): - input = torch.randn(4, 2) - num_modules = random.randint(2, 5) - linears = [nn.Linear(2, 5) for i in range(num_modules)] - - m = nn.Concat(0) - for l in linears: - m.add(l) - l.zeroGradParameters() - l.weight.fill_(1) - l.bias.fill_(0) - - # Check that these don't raise errors - m.__repr__() - str(m) - - output = m.forward(input) - output2 = input.sum(1, True).expand(4, 5).repeat(num_modules, 1) - self.assertEqual(output2, output) - - gradInput = m.backward(input, torch.ones(output2.size())) - gradInput2 = torch.ones(4, 2).fill_(num_modules * 5) - self.assertEqual(gradInput, gradInput2) - - gradWeight = input.sum(0, keepdim=True).expand(5, 2) - for l in linears: - self.assertEqual(gradWeight, l.gradWeight) - - def test_Parallel(self): - input = torch.randn(3, 4, 5) - m = nn.Parallel(0, 2) - m.add(nn.View(4, 5, 1)) - m.add(nn.View(4, 5, 1)) - m.add(nn.View(4, 5, 1)) - - # Check that these don't raise errors - m.__repr__() - str(m) - - output = m.forward(input) - output2 = input.transpose(0, 2).transpose(0, 1) - self.assertEqual(output2, output) - - gradInput = m.backward(input, output2) - self.assertEqual(gradInput, input) - - def test_ParallelTable(self): - input = torch.randn(3, 4, 5) - p = nn.ParallelTable() - p.add(nn.View(4, 5, 1)) - p.add(nn.View(4, 5, 1)) - p.add(nn.View(4, 5, 1)) - m = nn.Sequential() - m.add(nn.SplitTable(0)) - m.add(p) - m.add(nn.JoinTable(2)) - - # Check that these don't raise errors - p.__repr__() - str(p) - - output = m.forward(input) - output2 = input.transpose(0, 2).transpose(0, 1) - 
self.assertEqual(output2, output) - - gradInput = m.backward(input, output2) - self.assertEqual(gradInput, input) - - def test_ConcatTable(self): - input = [ - torch.randn(3, 4).float(), torch.randn(3, 4).float(), [torch.randn(3, 4).float()] - ] - _gradOutput = [ - torch.randn(3, 3, 4).float(), torch.randn(3, 3, 4).float(), torch.randn(3, 3, 4).float() - ] - gradOutput = [ - [_gradOutput[0][0], _gradOutput[1][0], [_gradOutput[2][0]]], - [_gradOutput[0][1], _gradOutput[1][1], [_gradOutput[2][1]]], - [_gradOutput[0][2], _gradOutput[1][2], [_gradOutput[2][2]]] - ] - module = nn.ConcatTable() - module.add(nn.Identity()) - module.add(nn.Identity()) - module.add(nn.Identity()) - module.float() - - # Check that these don't raise errors - module.__repr__() - str(module) - - output = module.forward(input) - output2 = [input, input, input] - self.assertEqual(output2, output) - gradInput = module.backward(input, gradOutput) - gradInput2 = [_gradOutput[0].sum(0, keepdim=False), _gradOutput[1].sum( - 0, keepdim=False), [_gradOutput[2].sum(0, keepdim=False)]] - self.assertTrue(isinstance(gradInput, list)) - self.assertFalse(isinstance(gradInput[0], list)) - self.assertFalse(isinstance(gradInput[1], list)) - self.assertTrue(isinstance(gradInput[2], list)) - self.assertEqual(len(gradInput), 3) - self.assertEqual(len(gradInput[2]), 1) - for t1, t2 in zip(iter_tensors(gradInput), iter_tensors(gradInput2)): - self.assertEqual(t1, t2) - - # test outputs for variable length inputs - test = nn.ConcatTable() - test.add(nn.Identity()) - test.add(nn.Identity()) - - x = [torch.randn(5), torch.randn(5)] - y = [torch.randn(5)] - - o1 = len(test.forward(x)) - go1 = len(test.backward(x, [x, x])) - o2 = len(test.forward(y)) - go2 = len(test.backward(y, [y, y])) - self.assertEqual(o1, 2) - self.assertEqual(go1, 2) - self.assertEqual(o2, 2) - self.assertEqual(go2, 1) - - def test_DepthConcat(self): - outputSize = [5, 6, 7, 8] - input = torch.randn(2, 3, 12, 12) - gradOutput = torch.randn(2, sum(outputSize), 12, 12) - concat = nn.DepthConcat(1) - concat.add(nn.SpatialConvolution(3, outputSize[0], 1, 1, 1, 1)) # > 2, 5, 12, 12 - concat.add(nn.SpatialConvolution(3, outputSize[1], 3, 3, 1, 1)) # > 2, 6, 10, 10 - concat.add(nn.SpatialConvolution(3, outputSize[2], 4, 4, 1, 1)) # > 2, 7, 9, 9 - concat.add(nn.SpatialConvolution(3, outputSize[3], 5, 5, 1, 1)) # > 2, 8, 8, 8 - concat.zeroGradParameters() - # forward/backward - outputConcat = concat.forward(input) - gradInputConcat = concat.backward(input, gradOutput) - # the spatial dims are the largest, the nFilters is the sum - output = torch.Tensor(2, sum(outputSize), 12, 12).zero_() # zero for padding - narrows = ((slice(None), slice(0, 5), slice(None), slice(None)), - (slice(None), slice(5, 11), slice(1, 11), slice(1, 11)), - (slice(None), slice(11, 18), slice(1, 10), slice(1, 10)), - (slice(None), slice(18, 26), slice(2, 10), slice(2, 10))) - gradInput = input.clone().zero_() - for i in range(4): - conv = concat.get(i) - gradWeight = conv.gradWeight.clone() - conv.zeroGradParameters() - output[narrows[i]].copy_(conv.forward(input)) - gradInput.add_(conv.backward(input, gradOutput[narrows[i]])) - self.assertEqual(gradWeight, conv.gradWeight) - - self.assertEqual(output, outputConcat) - self.assertEqual(gradInput, gradInputConcat) - - # Check that these don't raise errors - concat.__repr__() - str(concat) - - def test_Contiguous(self): - input = torch.randn(10, 10, 10) - noncontig = input[:, 4] - module = nn.Contiguous() - assert not noncontig.is_contiguous() - output = 
module.forward(noncontig) - self.assertEqual(output, noncontig) - self.assertTrue(output.is_contiguous()) - - # Check that these don't raise errors - module.__repr__() - str(module) - - def test_Index(self): - net = nn.Index(0) - - # test 1D - input = [torch.Tensor((10, 20, 30)), torch.LongTensor((0, 1, 1, 2))] - output = net.forward(input) - self.assertEqual(output, torch.Tensor((10, 20, 20, 30))) - - gradOutput = torch.Tensor((1, 1, 1, 3)) - gradInput = net.backward(input, gradOutput) - self.assertEqual(gradInput[0], torch.Tensor((1, 2, 3))) - - # test 2D - input = [torch.Tensor(((10, 20), (30, 40))), torch.LongTensor((0, 0))] - output = net.forward(input) - self.assertEqual(output, torch.Tensor(((10, 20), (10, 20)))) - - gradOutput = torch.Tensor(((1, 2), (1, 2))) - gradInput = net.backward(input, gradOutput) - self.assertEqual(gradInput[0], torch.Tensor(((2, 4), (0, 0)))) - - # Check that these don't raise errors - net.__repr__() - str(net) - - def test_L1Penalty(self): - weight = 1 - m = nn.L1Penalty(weight, False, False) - - input = torch.rand(2, 10).add_(-0.5) - input[0][0] = 0 - - m.forward(input) - grad = m.backward(input, torch.ones(input.size())) - - self.assertEqual(input.abs().sum() * weight, m.loss) - - true_grad = (input.gt(0).type_as(grad) + - input.lt(0).type_as(grad).mul_(-1)).mul_(weight) - self.assertEqual(true_grad, grad) - - # Check that these don't raise errors - m.__repr__() - str(m) - - def test_MaskedSelect(self): - input = torch.randn(4, 5) - mask = torch.ByteTensor(4, 5).bernoulli_() - module = nn.MaskedSelect() - out = module.forward([input, mask]) - self.assertEqual(input.masked_select(mask), out) - - gradOut = torch.Tensor((20, 80)) - input = torch.Tensor(((10, 20), (30, 40))) - inTarget = torch.Tensor(((20, 0), (0, 80))) - mask = torch.ByteTensor(((1, 0), (0, 1))) - module = nn.MaskedSelect() - module.forward([input, mask]) - gradIn = module.backward([input, mask], gradOut) - self.assertEqual(inTarget, gradIn[0]) - - # Check that these don't raise errors - module.__repr__() - str(module) - - def test_MultiCriterion(self): - input = torch.rand(2, 10) - target = torch.LongTensor((1, 8)) - nll = nn.ClassNLLCriterion() - nll2 = nn.CrossEntropyCriterion() - mc = nn.MultiCriterion().add(nll, 0.5).add(nll2) - - output = mc.forward(input, target) - output2 = nll.forward(input, target) / 2 + nll2.forward(input, target) - - self.assertEqual(output, output2) - gradInput = mc.backward(input, target) - gradInput2 = nll.backward(input, target).clone().div(2).add(nll2.backward(input, target)) - self.assertEqual(gradInput, gradInput2) - - # test type - mc.float() - gradInput = gradInput.clone() - input3 = input.float() - target3 = target - output3 = mc.forward(input3, target3) - gradInput3 = mc.backward(input3, target3) - self.assertEqual(output, output3) - self.assertEqual(gradInput.float(), gradInput3) - - # Check that these don't raise errors - mc.__repr__() - str(mc) - - # test table input - # TODO: enable when Criterion.clone is ready - # mc.double() - # input = [torch.randn(2, 10), [torch.randn(2, 10), torch.randn(2, 10)]] - # target = [torch.IntTensor((1, 8)), [torch.IntTensor((5, 6)), torch.IntTensor((4, 3))]] - # pnllc = nn.ParallelCriterion().add(nll).add(nn.ParallelCriterion().add(nll.clone()).add(nll.clone())) - # pnllc2 = nn.ParallelCriterion().add(nll2).add(nn.ParallelCriterion().add(nll2.clone()).add(nll2.clone())) - # mc = nn.MultiCriterion().add(pnllc, 0.5).add(pnllc2) - # output = mc.forward(input, target) - # output2 = pnllc.forward(input, target)/2 + 
pnllc2.forward(input, target) - # self.assertEqual(output, output2) - # gradInput = mc.backward(input, target) - # gradInput2 = pnllc.clone().backward(input, target) - # gradInput2b = pnllc2.backward(input, target) - # gradInput2[0].div(2).add(gradInput2b[0]) - # gradInput2[1][0].div(2).add(gradInput2b[1][0]) - # gradInput2[1][1].div(2).add(gradInput2b[1][1]) - # self.assertEqual(gradInput[1], gradInput2[0]) - # self.assertEqual(gradInput[1][9], gradInput2[1][0]) - # self.assertEqual(gradInput[1][1], gradInput2[1][1]) - - def test_ParallelCriterion(self): - input = [torch.rand(2, 10), torch.randn(2, 10)] - target = [torch.LongTensor((1, 8)), torch.randn(2, 10)] - nll = nn.ClassNLLCriterion() - mse = nn.MSECriterion() - pc = nn.ParallelCriterion().add(nll, 0.5).add(mse) - output = pc.forward(input, target) - output2 = nll.forward(input[0], target[0]) / 2 + mse.forward(input[1], target[1]) - self.assertEqual(output, output2) - gradInput2 = [nll.backward(input[0], target[0]).clone().div(2), mse.backward(input[1], target[1])] - gradInput = pc.backward(input, target) - self.assertEqual(gradInput[0], gradInput2[0]) - self.assertEqual(gradInput[1], gradInput2[1]) - - # test type - pc.float() - gradInput[0], gradInput[1] = gradInput[0].clone(), gradInput[1].clone() - input3 = [input[0].float(), input[1].float()] - target3 = [target[0], target[1].float()] - output3 = pc.forward(input3, target3) - gradInput3 = pc.backward(input3, target3) - self.assertEqual(output, output3) - self.assertEqual(gradInput[0].float(), gradInput3[0]) - self.assertEqual(gradInput[1].float(), gradInput3[1]) - - # test repeatTarget - input = [torch.rand(2, 10), torch.randn(2, 10)] - target = torch.randn(2, 10) - mse = nn.MSECriterion() - pc = nn.ParallelCriterion(True).add(mse, 0.5).add(nn.MSECriterion()) - output = pc.forward(input, target) - output2 = mse.forward(input[0], target) / 2 + mse.forward(input[1], target) - self.assertEqual(output, output2) - gradInput = pc.backward(input, target) - gradInput2 = [mse.backward(input[0], target).clone().div(2), mse.backward(input[1], target)] - self.assertEqual(gradInput[0], gradInput2[0]) - self.assertEqual(gradInput[1], gradInput2[1]) - - # table input - input = [torch.randn(2, 10), [torch.rand(2, 10), torch.randn(2, 10)]] - target = [torch.LongTensor((2, 5)), [torch.LongTensor((1, 8)), torch.randn(2, 10)]] - nll2 = nn.ClassNLLCriterion() - nll = nn.ClassNLLCriterion() - mse = nn.MSECriterion() - pc = nn.ParallelCriterion().add(nll, 0.5).add(mse) - pc2 = nn.ParallelCriterion().add(nll2, 0.4).add(pc) - output = pc2.forward(input, target) - output2 = (nll2.forward(input[0], target[0]) * 0.4 + - nll.forward(input[1][0], target[1][0]) / 2 + - mse.forward(input[1][1], target[1][1])) - self.assertEqual(output, output2) - gradInput2 = [ - nll2.backward(input[0], target[0]).clone().mul(0.4), - [nll.backward(input[1][1], target[1][0]).clone().div(2), mse.backward(input[1][1], target[1][1])] - ] - gradInput = pc2.backward(input, target) - self.assertEqual(gradInput[0], gradInput2[0]) - self.assertEqual(gradInput[1][0], gradInput2[1][0]) - self.assertEqual(gradInput[1][1], gradInput2[1][1]) - - # Check that these don't raise errors - pc.__repr__() - str(pc) - - def test_NarrowTable(self): - input = [torch.Tensor(i) for i in range(1, 6)] - - module = nn.NarrowTable(1) - output = module.forward(input) - self.assertEqual(output, input[1:2]) - - module = nn.NarrowTable(2, 3) - output = module.forward(input) - self.assertEqual(output, input[2:5]) - - # Check that these don't raise errors - 
module.__repr__() - str(module) - - def test_accUpdateGradParameters(self): - module = nn.LookupTable(5, 3) - module.weight.fill_(2) - input = torch.LongTensor([1, 3]) - output = module.updateOutput(input) - module.backwardUpdate(input, output, 0.1) - self.assertEqual(module.weight[0, 0], 2) - self.assertEqual(module.weight[3, 0], 1.8) - - def _build_net(self): - return (nn.Sequential() - .add(nn.Concat(0) - .add(nn.Linear(2, 5)) - .add(nn.Linear(2, 5))) - .add(nn.ReLU()) - .add(nn.Linear(10, 20))) - - def test_parameters(self): - net = self._build_net() - concat = net.modules[0] - param, grad = net.parameters() - - self.assertEqual(len(param), 6) - self.assertEqual(len(grad), 6) - - self.assertObjectIn(concat.modules[0].weight, param) - self.assertObjectIn(concat.modules[0].bias, param) - self.assertObjectIn(concat.modules[1].weight, param) - self.assertObjectIn(concat.modules[1].bias, param) - self.assertObjectIn(net.modules[2].weight, param) - self.assertObjectIn(net.modules[2].bias, param) - - self.assertObjectIn(concat.modules[0].gradWeight, grad) - self.assertObjectIn(concat.modules[0].gradBias, grad) - self.assertObjectIn(concat.modules[1].gradWeight, grad) - self.assertObjectIn(concat.modules[1].gradBias, grad) - self.assertObjectIn(net.modules[2].gradWeight, grad) - self.assertObjectIn(net.modules[2].gradBias, grad) - - def test_flattenParameters(self): - net = self._build_net() - param, grad_param = net.flattenParameters() - self.assertEqual(param.dim(), 1) - self.assertEqual(param.size(0), 250) - self.assertEqual(grad_param.dim(), 1) - self.assertEqual(grad_param.size(0), 250) - - def test_findModules(self): - net = self._build_net() - modules, containers = net.findModules(nn.Linear) - self.assertEqual(len(modules), 3) - self.assertEqual(len(modules), len(containers)) - self.assertObjectIn(net.modules[0].modules[0], modules) - self.assertObjectIn(net.modules[0].modules[1], modules) - self.assertObjectIn(net.modules[2], modules) - self.assertObjectIn(net.modules[0], containers) - self.assertEqual(containers.count(net.modules[0]), 2) - self.assertObjectIn(net, containers) - for m, c in zip(modules, containers): - self.assertObjectIn(m, c.modules) - - def test_apply(self): - net = self._build_net() - seen_modules = set() - - def callback(module): - self.assertNotIn(module, seen_modules) - seen_modules.add(module) - net.apply(callback) - self.assertEqual(len(seen_modules), 6) - - def test_listModules(self): - net = self._build_net() - module_list = list() - - def callback(module): - module_list.append(module) - net.apply(callback) - self.assertEqual(module_list, net.listModules()) - - def test_replace(self): - ref_net = self._build_net() - net = self._build_net() - - def callback(module): - if isinstance(module, nn.ReLU): - return nn.Tanh() - return module - net.replace(callback) - - for module, reference in zip(net.listModules(), ref_net.listModules()): - if isinstance(reference, nn.ReLU): - self.assertIsInstance(module, nn.Tanh) - else: - self.assertIsInstance(module, type(reference)) - - -prepare_tests() - - -if __name__ == '__main__': - run_tests() diff --git a/test/test_nn.py b/test/test_nn.py index be72aeb1886f8a..69f2eb99b37b94 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -22,7 +22,6 @@ import torch.nn.parallel as dp import torch.nn.init as init import torch.nn.utils.rnn as rnn_utils -import torch.legacy.nn as legacy from torch.nn.utils import clip_grad_norm_, clip_grad_value_ from torch.nn.utils import parameters_to_vector, vector_to_parameters from torch.autograd 
import Variable, gradcheck @@ -4187,6 +4186,44 @@ def get_inputs(input_shape, hidden_shape, mode): hidden_shape = update_shape(correct_hidden_shape, 0, bad_size) test(input_shape, hidden_shape, mode) + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm + def test_rnn_check_device(self): + input_size = 3 + hidden_size = 5 + num_layers = 2 + batch_size = 4 + seq_len = 6 + num_directions = 1 + + correct_input_shape = (seq_len, batch_size, input_size) + correct_hidden_shape = (num_layers * num_directions, batch_size, hidden_size) + rnn_modes = ['RNN', 'GRU', 'LSTM'] + + for mode in rnn_modes: + model = getattr(nn, mode)(input_size, hidden_size, num_layers) + input = torch.randn(correct_input_shape) + hidden = torch.randn(correct_hidden_shape) + + # input and weights are not at the same device + with self.assertRaisesRegex(RuntimeError, + "Input and parameter tensors are not at the same device"): + model(input.to('cuda:0')) + + # input and hiddens are not at the same device + with self.assertRaisesRegex(RuntimeError, + r"Expected object of backend CPU but got backend CUDA for argument"): + if mode is 'LSTM': + model(input, (hidden.to('cuda:0'), hidden.to('cuda:0'))) + else: + model(input, (hidden.to('cuda:0'))) + + # hidden tensors are not at the same CUDA device + if mode is 'LSTM': + with self.assertRaisesRegex(RuntimeError, + "Input and hidden tensors are not at the same device"): + model(input.to('cuda:0'), (hidden.to('cuda:0'), hidden.to('cuda:1'))) + def test_rnn_initial_hidden_state(self): rnn_modes = ['RNN', 'GRU', 'LSTM'] for mode in rnn_modes: @@ -5768,42 +5805,6 @@ def test_linear_broadcasting(self): expected = m(inp.view(6, 5)).view(2, 3, 8) self.assertEqual(expected, m(inp)) - def test_bilinear(self): - module = nn.Bilinear(10, 10, 8) - module_legacy = legacy.Bilinear(10, 10, 8) - - module_legacy.weight.copy_(module.weight.data) - module_legacy.bias.copy_(module.bias.data) - - input1 = torch.randn(4, 10) - input2 = torch.randn(4, 10) - - output = module(Variable(input1), Variable(input2)) - output_legacy = module_legacy.forward([input1, input2]) - - self.assertEqual(output.data, output_legacy) - - input1_1 = torch.tensor(input1, requires_grad=True) - input2_1 = torch.tensor(input2, requires_grad=True) - - module.zero_grad() - module_legacy.zeroGradParameters() - - output = module(input1_1, input2_1) - grad_output = torch.randn(*output.size()) - gi1_legacy, gi2_legacy = module_legacy.backward([input1, input2], grad_output) - output.backward(grad_output) - gi1 = input1_1.grad.data.clone() - gi2 = input2_1.grad.data.clone() - - self.assertEqual(gi1, gi1_legacy) - self.assertEqual(gi2, gi2_legacy) - self.assertEqual(module.weight.grad.data, module_legacy.gradWeight) - self.assertEqual(module.bias.grad.data, module_legacy.gradBias) - - _assertGradAndGradgradChecks(self, lambda x1, x2: F.bilinear(x1, x2, module.weight, module.bias), - (input1_1, input2_1)) - def test_bilinear_no_bias(self): module = nn.Bilinear(10, 10, 8) module_no_bias = nn.Bilinear(10, 10, 8, False) diff --git a/test/test_optim.py b/test/test_optim.py index 9e46959d7901fb..0bccb4f9b74dc7 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -5,7 +5,6 @@ import torch from torch._six import inf import torch.optim as optim -import torch.legacy.optim as old_optim import torch.nn.functional as F from torch.optim import SGD from torch.autograd import Variable @@ -24,44 +23,7 @@ def drosenbrock(tensor): return torch.DoubleTensor((-400 * x * (y - x ** 2) - 2 * (1 - x), 200 * (y - x ** 2))) -def 
wrap_old_fn(old_fn, **config): - def wrapper(closure, params, state): - return old_fn(closure, params, config, state) - return wrapper - - class TestOptim(TestCase): - def _test_rosenbrock(self, constructor, old_fn): - params_t = torch.Tensor([1.5, 1.5]) - state = {} - - params = Variable(torch.Tensor([1.5, 1.5]), requires_grad=True) - optimizer = constructor([params]) - - solution = torch.Tensor([1, 1]) - initial_dist = params.data.dist(solution) - - def eval(): - optimizer.zero_grad() - loss = rosenbrock(params) - loss.backward() - # loss.backward() will give **slightly** different - # gradients, than drosenbtock, because of a different ordering - # of floating point operations. In most cases it doesn't matter, - # but some optimizers are so sensitive that they can temporarily - # diverge up to 1e-4, just to converge again. This makes the - # comparison more stable. - params.grad.data.copy_(drosenbrock(params.data)) - return loss - - for i in range(2000): - optimizer.step(eval) - old_fn(lambda _: (rosenbrock(params_t), drosenbrock(params_t)), - params_t, state) - self.assertEqual(params.data, params_t) - - self.assertLessEqual(params.data.dist(solution), initial_dist) - def _test_rosenbrock_sparse(self, constructor, sparse_only=False): params_t = torch.Tensor([1.5, 1.5]) @@ -237,16 +199,6 @@ def _build_params_dict_single(self, weight, bias, **kwargs): return [dict(params=bias, **kwargs)] def test_sgd(self): - self._test_rosenbrock( - lambda params: optim.SGD(params, lr=1e-3), - wrap_old_fn(old_optim.sgd, learningRate=1e-3) - ) - self._test_rosenbrock( - lambda params: optim.SGD(params, lr=1e-3, momentum=0.9, - dampening=0, weight_decay=1e-4), - wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9, - dampening=0, weightDecay=1e-4) - ) self._test_basic_cases( lambda weight, bias: optim.SGD([weight, bias], lr=1e-3) ) @@ -273,14 +225,6 @@ def test_sgd_sparse(self): ) def test_adam(self): - self._test_rosenbrock( - lambda params: optim.Adam(params, lr=1e-2), - wrap_old_fn(old_optim.adam, learningRate=1e-2) - ) - self._test_rosenbrock( - lambda params: optim.Adam(params, lr=1e-2, weight_decay=1e-2), - wrap_old_fn(old_optim.adam, learningRate=1e-2, weightDecay=1e-2) - ) self._test_basic_cases( lambda weight, bias: optim.Adam([weight, bias], lr=1e-3) ) @@ -310,18 +254,6 @@ def test_sparse_adam(self): optim.SparseAdam(None, lr=1e-2, betas=(1.0, 0.0)) def test_adadelta(self): - self._test_rosenbrock( - lambda params: optim.Adadelta(params), - wrap_old_fn(old_optim.adadelta) - ) - self._test_rosenbrock( - lambda params: optim.Adadelta(params, rho=0.95), - wrap_old_fn(old_optim.adadelta, rho=0.95) - ) - self._test_rosenbrock( - lambda params: optim.Adadelta(params, weight_decay=1e-2), - wrap_old_fn(old_optim.adadelta, weightDecay=1e-2) - ) self._test_basic_cases( lambda weight, bias: optim.Adadelta([weight, bias]) ) @@ -333,18 +265,6 @@ def test_adadelta(self): optim.Adadelta(None, lr=1e-2, rho=1.1) def test_adagrad(self): - self._test_rosenbrock( - lambda params: optim.Adagrad(params, lr=1e-1), - wrap_old_fn(old_optim.adagrad, learningRate=1e-1) - ) - self._test_rosenbrock( - lambda params: optim.Adagrad(params, lr=1e-1, lr_decay=1e-3), - wrap_old_fn(old_optim.adagrad, learningRate=1e-1, learningRateDecay=1e-3) - ) - self._test_rosenbrock( - lambda params: optim.Adagrad(params, lr=1e-1, weight_decay=1e-2), - wrap_old_fn(old_optim.adagrad, learningRate=1e-1, weightDecay=1e-2) - ) self._test_basic_cases( lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1) ) @@ -367,18 +287,6 @@ def 
test_adagrad_sparse(self): @skipIfRocm def test_adamax(self): - self._test_rosenbrock( - lambda params: optim.Adamax(params, lr=1e-1), - wrap_old_fn(old_optim.adamax, learningRate=1e-1) - ) - self._test_rosenbrock( - lambda params: optim.Adamax(params, lr=1e-1, weight_decay=1e-2), - wrap_old_fn(old_optim.adamax, learningRate=1e-1, weightDecay=1e-2) - ) - self._test_rosenbrock( - lambda params: optim.Adamax(params, lr=1e-1, betas=(0.95, 0.998)), - wrap_old_fn(old_optim.adamax, learningRate=1e-1, beta1=0.95, beta2=0.998) - ) self._test_basic_cases( lambda weight, bias: optim.Adamax([weight, bias], lr=1e-1) ) @@ -391,18 +299,6 @@ def test_adamax(self): optim.Adamax(None, lr=1e-2, betas=(0.0, 1.0)) def test_rmsprop(self): - self._test_rosenbrock( - lambda params: optim.RMSprop(params, lr=1e-2), - wrap_old_fn(old_optim.rmsprop, learningRate=1e-2) - ) - self._test_rosenbrock( - lambda params: optim.RMSprop(params, lr=1e-2, weight_decay=1e-2), - wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, weightDecay=1e-2) - ) - self._test_rosenbrock( - lambda params: optim.RMSprop(params, lr=1e-2, alpha=0.95), - wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, alpha=0.95) - ) self._test_basic_cases( lambda weight, bias: optim.RMSprop([weight, bias], lr=1e-2) ) @@ -415,18 +311,6 @@ def test_rmsprop(self): optim.RMSprop(None, lr=1e-2, momentum=-1.0) def test_asgd(self): - self._test_rosenbrock( - lambda params: optim.ASGD(params, lr=1e-3), - wrap_old_fn(old_optim.asgd, eta0=1e-3) - ) - self._test_rosenbrock( - lambda params: optim.ASGD(params, lr=1e-3, alpha=0.8), - wrap_old_fn(old_optim.asgd, eta0=1e-3, alpha=0.8) - ) - self._test_rosenbrock( - lambda params: optim.ASGD(params, lr=1e-3, t0=1e3), - wrap_old_fn(old_optim.asgd, eta0=1e-3, t0=1e3) - ) self._test_basic_cases( lambda weight, bias: optim.ASGD([weight, bias], lr=1e-3, t0=100) ) @@ -440,18 +324,6 @@ def test_asgd(self): @skipIfRocm def test_rprop(self): - self._test_rosenbrock( - lambda params: optim.Rprop(params, lr=1e-3), - wrap_old_fn(old_optim.rprop, stepsize=1e-3) - ) - self._test_rosenbrock( - lambda params: optim.Rprop(params, lr=1e-3, etas=(0.6, 1.1)), - wrap_old_fn(old_optim.rprop, stepsize=1e-3, etaminus=0.6, etaplus=1.1) - ) - self._test_rosenbrock( - lambda params: optim.Rprop(params, lr=1e-3, step_sizes=(1e-4, 3)), - wrap_old_fn(old_optim.rprop, stepsize=1e-3, stepsizemin=1e-4, stepsizemax=3) - ) self._test_basic_cases( lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3) ) @@ -464,14 +336,6 @@ def test_rprop(self): optim.Rprop(None, lr=1e-2, etas=(1.0, 0.5)) def test_lbfgs(self): - self._test_rosenbrock( - lambda params: optim.LBFGS(params), - wrap_old_fn(old_optim.lbfgs) - ) - self._test_rosenbrock( - lambda params: optim.LBFGS(params, lr=5e-2, max_iter=5), - wrap_old_fn(old_optim.lbfgs, learningRate=5e-2, maxIter=5) - ) self._test_basic_cases( lambda weight, bias: optim.LBFGS([weight, bias]), ignore_multidevice=True diff --git a/test/test_sparse.py b/test/test_sparse.py index 10622cd6798a85..2151f4c95d91ff 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -44,7 +44,7 @@ def setUp(self): self.SparseTensor = torch.sparse.DoubleTensor super(TestSparse, self).setUp() - def _gen_sparse(self, d, nnz, with_size): + def _gen_sparse(self, sparse_dims, nnz, with_size): # TODO: Consider implementing this in the CUDA case by directly # performing the operations on the GPU. 
You won't be able to # use torch.rand/torch.randn in this case because they are @@ -54,28 +54,30 @@ def _gen_sparse(self, d, nnz, with_size): # If you do this, be sure to update assert_uncoalesced too if isinstance(with_size, Number): - with_size = [with_size] * d + with_size = [with_size] * sparse_dims if self.is_uncoalesced: # We want to generate a tensor with a lot of uncoalesced # entries to stress test whether or not we handle this # (subtle) case correctly - v_size = [nnz * 2] + list(with_size[d:]) + v_size = [nnz * 2] + list(with_size[sparse_dims:]) v = torch.randn(*v_size) - r = torch.rand(d, nnz) + r = torch.rand(sparse_dims, nnz) # Repeat the indexes, so every position shows up twice - i = torch.cat([r, r], dim=1) * \ - torch.Tensor(with_size[:d]).repeat(nnz * 2, 1).transpose(0, 1) + i = torch.cat([r, r], dim=1) + if nnz > 0: + i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz * 2, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) self.assert_uncoalesced(x) else: - # Generate a sparse tensor with d sparse dimensions; the - # rest the dimensions with_size[d:] are dense. - v_size = [nnz] + list(with_size[d:]) + # Generate a sparse tensor with sparse_dims sparse dimensions; the + # rest the dimensions with_size[sparse_dims:] are dense. + v_size = [nnz] + list(with_size[sparse_dims:]) v = torch.randn(*v_size) - i = torch.rand(d, nnz) * \ - torch.Tensor(with_size[:d]).repeat(nnz, 1).transpose(0, 1) + i = torch.rand(sparse_dims, nnz) + if nnz > 0: + i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) @@ -90,15 +92,13 @@ def assert_uncoalesced(self, x): correctness of the uncoalesced tensor generation algorithm. """ assert not x.is_coalesced() - # Strategy: construct a new sparse tensor with the raw value - # field overwritten to a tensor of ones, coalesce it, and then - # check if any value entries are > 1 (which indicates that the - # original was uncoalesced.) 
- i = x._indices().clone() - v = x._values().clone().fill_(1) - y = torch.sparse.DoubleTensor(i, v, x.size()) - z = self.safeCoalesce(y) - assert (z._values() > 1).sum() > 0 + existing_indices = set() + for i in range(x._nnz()): + index = str(x._indices()[:, i]) + if index in existing_indices: + return True + else: + existing_indices.add(index) def randn(self, *args, **kwargs): """ @@ -164,18 +164,20 @@ def test_print(self): @skipIfRocm def test_basic(self): - x, i, v = self._gen_sparse(3, 10, 100) - - self.assertEqual(i, x._indices()) - self.assertEqual(v, x._values()) - - x, i, v = self._gen_sparse(3, 10, [100, 100, 100]) - self.assertEqual(i, x._indices()) - self.assertEqual(v, x._values()) - self.assertEqual(x.ndimension(), 3) - self.assertEqual(self.safeCoalesce(x)._nnz(), 10) - for i in range(3): - self.assertEqual(x.size(i), 100) + def test_shape(sparse_dims, nnz, with_size): + if isinstance(with_size, Number): + with_size = [with_size] * sparse_dims + x, i, v = self._gen_sparse(sparse_dims, nnz, with_size) + self.assertEqual(i, x._indices()) + self.assertEqual(v, x._values()) + self.assertEqual(x.ndimension(), len(with_size)) + self.assertEqual(self.safeCoalesce(x)._nnz(), nnz) + self.assertEqual(list(x.size()), with_size) + + test_shape(3, 10, 100) + test_shape(3, 10, [100, 100, 100]) + test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) # Make sure that coalesce handles duplicate indices correctly i = self.IndexTensor([[9, 0, 0, 0, 8, 1, 1, 1, 2, 7, 2, 2, 3, 4, 6, 9]]) @@ -213,6 +215,13 @@ def test_ctor_size_checks(self): @skipIfRocm def test_to_dense(self): + def test_tensor(x, res): + x.to_dense() # Tests triple to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -234,12 +243,17 @@ def test_to_dense(self): [0, 0, 0, 0, 0], [0, 0, 0, 0, 4]], ]) + test_tensor(x, res) - x.to_dense() # Tests double to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) + res = self.ValueTensor(3, 4, 5, 0) + test_tensor(x, res) @skipIfRocm def test_shared(self): @@ -251,8 +265,21 @@ def test_shared(self): i[0][0] = 0 self.assertEqual(self.ValueTensor([6, 0, 0]), self.safeToDense(x)) + i = self.IndexTensor([[2]]) + v = self.ValueTensor(1, 0) + x = self.SparseTensor(i, v, torch.Size([3, 0])) + i[0][0] = 0 + self.assertEqual(self.ValueTensor(3, 0), self.safeToDense(x)) + @skipIfRocm def test_to_dense_hybrid(self): + def test_tensor(x, res): + x.to_dense() # Tests double to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -273,15 +300,24 @@ def test_to_dense_hybrid(self): [0, 0], [4, 5]], ]) + test_tensor(x, res) - x.to_dense() # Tests double to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + ]) + v = self.ValueTensor(4, 2, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 2, 0])) + res = self.ValueTensor(3, 4, 2, 0) + test_tensor(x, res) @skipIfRocm def test_contig(self): + 
def test_tensor(x, exp_i, exp_v): + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -293,9 +329,7 @@ def test_contig(self): [31, 92, 65, 50, 34, 62, 22, 56, 74, 89], ]) exp_v = self.ValueTensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7]) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + test_tensor(x, exp_i, exp_v) i = self.IndexTensor([ [2, 0, 2, 1], @@ -310,10 +344,22 @@ def test_contig(self): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([2, 1, 3, 4]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [2, 0, 2, 1], + [0, 0, 3, 0], + [1, 0, 4, 0], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) + exp_i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + exp_v = self.ValueTensor(4, 0) + test_tensor(x, exp_i, exp_v) # Duplicate indices i = self.IndexTensor([ @@ -329,13 +375,30 @@ def test_contig(self): [0, 4], ]) exp_v = self.ValueTensor([6, 4]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [0, 0, 2, 0], + [0, 0, 3, 0], + [0, 0, 4, 0], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) + exp_i = self.IndexTensor([ + [0, 2], + [0, 3], + [0, 4], + ]) + exp_v = self.ValueTensor(2, 0) + test_tensor(x, exp_i, exp_v) @skipIfRocm def test_contig_hybrid(self): + def test_tensor(x, exp_i, exp_v): + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -353,9 +416,7 @@ def test_contig_hybrid(self): [2, 3], [1, 2], [6, 7], [4, 5], [10, 11], [3, 4], [5, 6], [9, 10], [8, 9], [7, 8], ]) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + test_tensor(x, exp_i, exp_v) i = self.IndexTensor([ [2, 0, 2, 1], @@ -370,10 +431,22 @@ def test_contig_hybrid(self): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([[2, 2, 2], [1, 1, 1], [3, 3, 3], [4, 4, 4]]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [2, 0, 2, 1], + [0, 0, 3, 0], + [1, 0, 4, 0], + ]) + v = self.ValueTensor(4, 3, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) + exp_i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + exp_v = self.ValueTensor(4, 3, 0) + test_tensor(x, exp_i, exp_v) # Duplicate indices i = self.IndexTensor([ @@ -389,51 +462,79 @@ def test_contig_hybrid(self): [0, 4], ]) exp_v = self.ValueTensor([[6, 4, 5], [4, 3, 4]]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [0, 0, 2, 0], + [0, 0, 3, 0], + [0, 0, 4, 0], + ]) + v = self.ValueTensor(4, 3, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) + exp_i = self.IndexTensor([ + [0, 2], + [0, 3], + [0, 4], + ]) + exp_v = self.ValueTensor(2, 3, 0) + test_tensor(x, exp_i, exp_v) @skipIfRocm def test_clone(self): - x, _, _ = self._gen_sparse(4, 20, 5) - if 
self.is_uncoalesced: - self.assertFalse(x.is_coalesced()) + def test_shape(sparse_dims, nnz, with_size): + x = self._gen_sparse(sparse_dims, nnz, with_size)[0] + if self.is_uncoalesced: + self.assertFalse(x.is_coalesced()) + y = x.clone() + self.assertFalse(y.is_coalesced()) + x = x.coalesce() + self.assertTrue(x.is_coalesced()) y = x.clone() - self.assertFalse(y.is_coalesced()) - x = x.coalesce() - self.assertTrue(x.is_coalesced()) - y = x.clone() - self.assertTrue(y.is_coalesced()) + self.assertTrue(y.is_coalesced()) + + test_shape(4, 20, 5) + test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) @cuda_only def test_cuda_empty(self): + def test_tensor(x): + y = x.cuda(0) + self.assertEqual(x._sparseDims(), y._sparseDims()) + self.assertEqual(x._denseDims(), y._denseDims()) + x = y.cpu() + self.assertEqual(y._sparseDims(), x._sparseDims()) + self.assertEqual(y._denseDims(), x._denseDims()) + x = torch.sparse.FloatTensor(2, 3, 4) - y = x.cuda(0) - self.assertEqual(x._sparseDims(), y._sparseDims()) - self.assertEqual(x._denseDims(), y._denseDims()) - x = y.cpu() - self.assertEqual(y._sparseDims(), x._sparseDims()) - self.assertEqual(y._denseDims(), x._denseDims()) + test_tensor(x) + + x = torch.sparse.FloatTensor(2, 3, 4, 0) + test_tensor(x) @skipIfRocm def test_transpose(self): - x = self._gen_sparse(4, 20, 5)[0] - y = self.safeToDense(x) + def test_shape(sparse_dims, nnz, with_size): + x = self._gen_sparse(sparse_dims, nnz, with_size)[0] + y = self.safeToDense(x) - for i, j in itertools.combinations(range(4), 2): - x = x.transpose_(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + for i, j in itertools.combinations(range(4), 2): + x = x.transpose_(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) - x = x.transpose(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + x = x.transpose(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) + + test_shape(4, 20, 5) + test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) @cpu_only def test_coalesce_transpose_mm(self): - def test_shape(di, dj, dk): - x, _, _ = self._gen_sparse(2, 20, [dj, di]) + def test_shape(di, dj, dk, nnz): + x, _, _ = self._gen_sparse(2, nnz, [dj, di]) y = torch.randn(dj, dk) x_coalesced = x.coalesce() @@ -446,43 +547,58 @@ def test_shape(di, dj, dk): expected = torch.mm(self.safeToDense(x_coalesced_t), y) self.assertEqual(res, expected) - test_shape(10, 20, 30) + test_shape(10, 20, 30, 20) + test_shape(0, 20, 30, 0) + test_shape(10, 0, 30, 0) + test_shape(10, 20, 0, 0) + test_shape(10, 20, 0, 20) def test_t_empty(self): - x = self.SparseTensor(2, 3) - x.t_() - self.assertEqual(torch.Size([3, 2]), x.size()) - self.assertEqual(0, x._indices().numel()) - self.assertEqual(0, x._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) + def test_in_place(x): + shape_original = x.shape + x.t_() + self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), x.size()) + self.assertEqual(0, x._indices().numel()) + self.assertEqual(0, x._values().numel()) + self.assertEqual(x._sparseDims(), 2) + self.assertEqual(x._denseDims(), 0) + + def test_not_in_place(x): + shape_original = x.shape + y = x.t() + self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), y.size()) + self.assertEqual(0, y._indices().numel()) + self.assertEqual(0, y._values().numel()) + self.assertEqual(x._sparseDims(), 2) + 
self.assertEqual(x._denseDims(), 0) x = self.SparseTensor(2, 3) - y = x.t() - self.assertEqual(torch.Size([3, 2]), y.size()) - self.assertEqual(0, y._indices().numel()) - self.assertEqual(0, y._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) + test_in_place(x) + test_not_in_place(x) + + x = self.SparseTensor(2, 0) + test_in_place(x) + test_not_in_place(x) @skipIfRocm def test_add_zeros(self): - def test_shape(sparse_dims, sizes): - x, _, _ = self._gen_sparse(sparse_dims, 20, sizes) + def test_shape(sparse_dims, nnz, sizes): + x, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) zeros = torch.zeros(sizes, layout=torch.sparse_coo).to(x.device) r1 = zeros + x r2 = x + zeros self.assertEqual(r1, x) self.assertEqual(r2, x) - test_shape(1, [1]) - test_shape(4, [3, 17, 19, 5]) - test_shape(2, [3, 17, 19, 5]) + test_shape(1, 20, [1]) + test_shape(4, 20, [3, 17, 19, 5]) + test_shape(2, 20, [3, 17, 19, 5]) + test_shape(2, 20, [3, 17, 19, 0]) @cpu_only def test_mm(self): - def test_shape(di, dj, dk): - x, _, _ = self._gen_sparse(2, 20, [di, dj]) + def test_shape(di, dj, dk, nnz): + x, _, _ = self._gen_sparse(2, nnz, [di, dj]) t = torch.randn(di, dk) y = torch.randn(dj, dk) alpha = random.random() @@ -500,15 +616,19 @@ def test_shape(di, dj, dk): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(10, 100, 100) - test_shape(100, 1000, 200) - test_shape(64, 10000, 300) + test_shape(10, 100, 100, 20) + test_shape(100, 1000, 200, 20) + test_shape(64, 10000, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(10, 0, 100, 0) + test_shape(10, 100, 0, 0) + test_shape(10, 100, 0, 20) @cpu_only def test_saddmm(self): - def test_shape(di, dj, dk): - x = self._gen_sparse(2, 20, [di, dj])[0] - t = self._gen_sparse(2, 20, [di, dk])[0] + def test_shape(di, dj, dk, nnz): + x = self._gen_sparse(2, nnz, [di, dj])[0] + t = self._gen_sparse(2, nnz, [di, dk])[0] y = torch.randn(dj, dk) alpha = random.random() beta = random.random() @@ -525,43 +645,52 @@ def test_shape(di, dj, dk): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(self.safeToDense(res), expected) - test_shape(7, 5, 3) - test_shape(1000, 100, 100) - test_shape(3000, 64, 300) + test_shape(7, 5, 3, 20) + test_shape(1000, 100, 100, 20) + test_shape(3000, 64, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(1000, 0, 100, 0) + test_shape(1000, 100, 0, 0) @skipIfRocm def test_dsmm(self): - def test_shape(di, dj, dk): - x = self._gen_sparse(2, 20, [di, dj])[0] + def test_shape(di, dj, dk, nnz): + x = self._gen_sparse(2, nnz, [di, dj])[0] y = self.randn(dj, dk) res = torch.dsmm(x, y) expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(7, 5, 3) - test_shape(1000, 100, 100) - test_shape(3000, 64, 300) + test_shape(7, 5, 3, 20) + test_shape(1000, 100, 100, 20) + test_shape(3000, 64, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(1000, 0, 100, 0) + test_shape(1000, 100, 0, 0) + test_shape(1000, 100, 0, 20) @skipIfRocm def test_hsmm(self): - def test_shape(di, dj, dk): - x = self._gen_sparse(2, 20, [di, dj])[0] + def test_shape(di, dj, dk, nnz): + x = self._gen_sparse(2, nnz, [di, dj])[0] y = self.randn(dj, dk) res = torch.hsmm(x, y) - # TODO: use self.safeToDense(), but this triggers - # https://github.com/pytorch/pytorch/issues/3170 - expected = torch.mm(x.to_dense(), y) + expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res.to_dense(), expected) - test_shape(7, 5, 3) - test_shape(1000, 100, 100) - test_shape(3000, 64, 
300) + test_shape(7, 5, 3, 20) + test_shape(1000, 100, 100, 20) + test_shape(3000, 64, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(1000, 0, 100, 0) + test_shape(1000, 100, 0, 0) + test_shape(1000, 100, 0, 20) - def _test_spadd_shape(self, shape_i, shape_v=None): + def _test_spadd_shape(self, nnz, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x, _, _ = self._gen_sparse(len(shape_i), 10, shape) + x, _, _ = self._gen_sparse(len(shape_i), nnz, shape) y = self.randn(*shape) r = random.random() @@ -583,7 +712,7 @@ def _test_spadd_shape(self, shape_i, shape_v=None): self.assertEqual(res, expected) - x, i, v = self._gen_sparse(len(shape_i), 10, shape) + x, i, v = self._gen_sparse(len(shape_i), nnz, shape) nnz = i.size(1) # Non contiguous sparse indices tensor @@ -606,28 +735,40 @@ def _test_spadd_shape(self, shape_i, shape_v=None): @skipIfRocm def test_spadd(self): - self._test_spadd_shape([5, 6]) - self._test_spadd_shape([10, 10, 10]) - self._test_spadd_shape([50, 30, 20]) - self._test_spadd_shape([5, 5, 5, 5, 5, 5]) + self._test_spadd_shape(10, [5, 6]) + self._test_spadd_shape(10, [10, 10, 10]) + self._test_spadd_shape(10, [50, 30, 20]) + self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5]) + self._test_spadd_shape(0, [0, 30, 20]) + self._test_spadd_shape(0, [50, 0, 20]) + self._test_spadd_shape(0, [50, 30, 0]) @skipIfRocm def test_spadd_hybrid(self): - self._test_spadd_shape([5, 6], [2, 3]) - self._test_spadd_shape([10, 10, 10], [3]) - self._test_spadd_shape([50, 30, 20], [2]) - self._test_spadd_shape([5, 5, 5, 5, 5, 5], [2]) + self._test_spadd_shape(10, [5, 6], [2, 3]) + self._test_spadd_shape(10, [10, 10, 10], [3]) + self._test_spadd_shape(10, [50, 30, 20], [2]) + self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5], [2]) + self._test_spadd_shape(0, [0, 30, 20], [2, 0]) + self._test_spadd_shape(0, [50, 0, 20], [2, 0]) + self._test_spadd_shape(0, [50, 30, 0], [2, 0]) + self._test_spadd_shape(10, [50, 30, 20], [2, 0]) @skipIfRocm def test_norm(self): - x, _, _ = self._gen_sparse(3, 10, 100) - y = x.coalesce() - self.assertEqual(x.norm(), y._values().norm()) + def test_shape(sparse_dims, nnz, with_size): + x, _, _ = self._gen_sparse(sparse_dims, nnz, with_size) + y = x.coalesce() + self.assertEqual(x.norm(), y._values().norm()) - def _test_basic_ops_shape(self, shape_i, shape_v=None): + test_shape(3, 10, 100) + test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) + + def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) - x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) + x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) + x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) y1 = x1 + x2 y2 = x1.clone() @@ -689,30 +830,49 @@ def _test_basic_ops_shape(self, shape_i, shape_v=None): @skipIfRocm def test_basic_ops(self): - self._test_basic_ops_shape([5, 6]) - self._test_basic_ops_shape([10, 10, 10]) - self._test_basic_ops_shape([50, 30, 20]) - self._test_basic_ops_shape([5, 5, 5, 5, 5, 5]) + self._test_basic_ops_shape(9, 12, [5, 6]) + self._test_basic_ops_shape(9, 12, [10, 10, 10]) + self._test_basic_ops_shape(9, 12, [50, 30, 20]) + self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5]) + self._test_basic_ops_shape(0, 12, [10, 10, 10]) + self._test_basic_ops_shape(9, 0, [10, 10, 10]) + self._test_basic_ops_shape(0, 0, [10, 10, 10]) + self._test_basic_ops_shape(0, 0, [10, 10, 0]) @skipIfRocm def test_basic_ops_hybrid(self): - 
self._test_basic_ops_shape([5, 6], [2, 3]) - self._test_basic_ops_shape([10, 10, 10], [3]) - self._test_basic_ops_shape([50, 30, 20], [2]) - self._test_basic_ops_shape([5, 5, 5, 5, 5, 5], [2]) + self._test_basic_ops_shape(9, 12, [5, 6], [2, 3]) + self._test_basic_ops_shape(9, 12, [10, 10, 10], [3]) + self._test_basic_ops_shape(9, 12, [50, 30, 20], [2]) + self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) + self._test_basic_ops_shape(0, 12, [10, 10, 10], [2]) + self._test_basic_ops_shape(9, 0, [10, 10, 10], [2]) + self._test_basic_ops_shape(0, 0, [10, 10, 10], [2]) + self._test_basic_ops_shape(9, 12, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(0, 12, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(9, 0, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(0, 0, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(0, 0, [10, 10, 0], [2, 0]) @skipIfRocm def test_add_dense_sparse_mismatch(self): - x = torch.zeros([3, 4], dtype=self.value_dtype, device=self.device) - sparse_y = self.SparseTensor(torch.zeros(1, 4, dtype=torch.int64, device=self.device), - torch.randn(4, 4, 4, dtype=self.value_dtype, device=self.device), - torch.Size([3, 4, 4])) - self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) - - def _test_sparse_mask_shape(self, shape_i, shape_v=None): + def test_shape(dense_size, sparse_dims_shape, dense_dims_shape, sparse_size): + x = torch.zeros(dense_size, dtype=self.value_dtype, device=self.device) + sparse_y = self.SparseTensor(torch.zeros(sparse_dims_shape, dtype=torch.int64, device=self.device), + torch.randn(dense_dims_shape, dtype=self.value_dtype, device=self.device), + torch.Size(sparse_size)) + with self.assertRaisesRegex( + RuntimeError, + "add: expected 'self' and 'other' to have same size"): + x + sparse_y + + test_shape([3, 4], [1, 4], [4, 4, 4], [3, 4, 4]) + test_shape([3, 4, 0], [1, 4], [4, 4, 4, 0], [3, 4, 4, 0]) + + def _test_sparse_mask_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) - x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) + x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) + x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) y1 = x1 + x2 y2 = x1.clone() @@ -740,87 +900,30 @@ def _test_sparse_mask_fixed(self): expected = self.SparseTensor(i, exp_v, torch.Size([5, 4])) self.assertEqual(res, expected) + i = self.IndexTensor([ + [1, 3, 0, 4], + [2, 1, 2, 3], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([5, 4, 0])).coalesce() + dense = self.ValueTensor(5, 4, 0) + exp_v = self.ValueTensor(4, 0) + res = dense.sparse_mask(x) + expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 0])) + self.assertEqual(res, expected) + @skipIfRocm def test_sparse_mask(self): self._test_sparse_mask_fixed() - self._test_sparse_mask_shape([5, 6]) - self._test_sparse_mask_shape([10, 10, 10]) - self._test_sparse_mask_shape([50, 30, 20]) - self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5]) - - def _test_zeros(self, shape, out_shape_i, out_shape_v=None): - out_shape = out_shape_i + (out_shape_v or []) - for nnz in [9, 12]: - out, _, _ = self._gen_sparse(len(out_shape_i), nnz, out_shape) - torch.zeros(*shape, out=out) - self.assertEqual(tuple(out.size()), tuple(shape)) - self.assertTrue(out._indices().numel() == out._values().numel() == 0) - self.assertEqual(out._nnz(), 0) - self.assertEqual(out._sparseDims(), len(shape)) - self.assertEqual(out._denseDims(), 0) - - @skipIfRocm - def test_log1p(self): - if self.is_cuda: - input 
= torch.cuda.sparse.DoubleTensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0).cuda(), - torch.FloatTensor([3, 4, 5]).cuda(), - torch.Size([3])) - else: - input = torch.sparse.DoubleTensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([3, 4, 5]), - torch.Size([3])) - - expected_output = torch.tensor([3., 4., 5.]).log1p_() - self.assertEqual(expected_output, input.log1p().to_dense()) - self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) - - # test in-place op on uncoalesced input - self.assertExpectedRaises(RuntimeError, lambda: input.log1p_(), subname="uncoalesced") - - input.requires_grad_() - self.assertTrue(input.requires_grad) - - # test autograd - x = input.clone() - y = input.log1p() - self.assertExpectedRaises(RuntimeError, lambda: y.backward(x), subname="backward") - - # test uncoalesced input - input_uncoalesced = torch.sparse.DoubleTensor( - torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([2, 3, 4, 1, 1, 1]), - torch.Size([3])) - self.assertEqual(expected_output, input_uncoalesced.log1p().to_dense()) - self.assertEqual(expected_output, input_uncoalesced.coalesce().log1p_().to_dense()) - - def test_zeros(self): - i_shapes = [2, 3, 4] - v_shapes = [3, 4, 5, 6] - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros([2, 3, 4], i_shapes[:i_dim], v_shapes[:v_dim]) - - def _test_zeros_like(self, template_shape_i, template_shape_v=None): - template_shape_v = template_shape_v or [] - template_shape = template_shape_i + template_shape_v - for nnz in [9, 12]: - t, _, _ = self._gen_sparse(len(template_shape_i), nnz, template_shape) - res = torch.zeros_like(t) - self.assertEqual(tuple(res.size()), tuple(template_shape)) - self.assertTrue(res._indices().numel() == res._values().numel() == 0) - self.assertEqual(res._nnz(), 0) - self.assertEqual(res._sparseDims(), len(template_shape_i)) - self.assertEqual(res._denseDims(), len(template_shape_v)) - - def test_zeros_like(self): - i_shapes = [2, 3, 4] - v_shapes = [3, 4, 5, 6] - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros_like(i_shapes[:i_dim], v_shapes[:v_dim]) + self._test_sparse_mask_shape(9, 12, [5, 6]) + self._test_sparse_mask_shape(9, 12, [10, 10, 10]) + self._test_sparse_mask_shape(9, 12, [50, 30, 20]) + self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5]) + self._test_sparse_mask_shape(0, 12, [10, 10, 10]) + self._test_sparse_mask_shape(9, 0, [10, 10, 10]) + self._test_sparse_mask_shape(0, 0, [10, 10, 10]) + self._test_sparse_mask_shape(0, 0, [10, 10, 0]) def _test_sparse_mask_hybrid_fixed(self): i = self.IndexTensor([ @@ -844,110 +947,130 @@ def _test_sparse_mask_hybrid_fixed(self): expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2])) self.assertEqual(res, expected) - @skipIfRocm - def test_sparse_variable_methods(self): - # TODO: delete when tensor/variable are merged - from torch.autograd import Variable - i = self.IndexTensor([[0, 1, 1], [2, 0, 2]]) - v = self.ValueTensor([3, 4, 5]) - sparse_mat = self.SparseTensor(i, v, torch.Size([2, 3])) - sparse_var = Variable(sparse_mat) - - to_test_one_arg = { - 'zeros_like': lambda x: torch.zeros_like(x), - 'transpose': lambda x: x.transpose(0, 1), - 'transpose_': lambda x: x.transpose_(0, 1), - 't': lambda x: x.t(), - 't_': lambda x: x.t_(), - 'div': lambda x: x.div(2), - 'div_': lambda x: x.div_(2), - 'pow': lambda x: x.pow(2), - '_nnz': lambda x: x._nnz(), - 'is_coalesced': lambda x: 
x.is_coalesced(), - 'coalesce': lambda x: x.coalesce(), - 'to_dense': lambda x: x.to_dense(), - '_sparseDims': lambda x: x._sparseDims(), - '_denseDims': lambda x: x._denseDims(), - 'norm': lambda x: x.norm(), - 'log1p': lambda x: x.log1p(), - } - - for test_name, test_fn in to_test_one_arg.items(): - var1 = sparse_var.clone() - tensor1 = sparse_mat.clone() - - out_var = test_fn(var1) - out_tensor = test_fn(tensor1) - - if isinstance(out_tensor, int) or isinstance(out_tensor, bool): - if not isinstance(out_var, int) and not isinstance(out_var, bool): - check_var = out_var.data[0] - else: - check_var = out_var - self.assertEqual(out_var, out_tensor) - continue - - # Assume output is variable / tensor - self.assertEqual(test_fn(var1).data, test_fn(tensor1), - test_name) - - i = self.IndexTensor([[0, 0, 1], [1, 2, 1]]) - v = self.ValueTensor([3, 3, 4]) - sparse_mat2 = self.SparseTensor(i, v, torch.Size([2, 3])) - sparse_var2 = Variable(sparse_mat2) - - to_test_two_arg = { - 'sub': lambda x, y: x.sub(y), - 'sub_': lambda x, y: x.sub_(y), - 'mul': lambda x, y: x.mul(y), - 'mul_': lambda x, y: x.mul_(y), - } - - for test_name, test_fn in to_test_two_arg.items(): - var1 = sparse_var.clone() - var2 = sparse_var2.clone() - tensor1 = sparse_mat.clone() - tensor2 = sparse_mat2.clone() - self.assertEqual(test_fn(var1, var2).data, - test_fn(tensor1, tensor2), test_name) - - to_test_mixed = [ - # test name, lambda expression, should_run_when_cuda - ('sspaddmm', lambda sp, de: sp.sspaddmm(sp, de), False), - ('sspaddmm_b', lambda sp, de: sp.sspaddmm(2, sp, de), False), - ('sspaddmm_b_a', lambda sp, de: sp.sspaddmm(3, 2, sp, de), False), - ('addmm', lambda sp, de: de.addmm(sp, de), True), - # TODO: This looks like a typo - ('addmm_', lambda sp, de: de.addmm(sp, de), True), - ('mm', lambda sp, de: torch.mm(sp, de), True), - ('mm_out', lambda sp, de: torch.mm(sp, de, out=de), True), - ] - - i = self.IndexTensor([[0, 0, 1, 2, 2], [1, 2, 1, 0, 1]]) - v = self.ValueTensor([3, 3, 4, 1, 2]) - sparse_mat = self.SparseTensor(i, v, torch.Size([3, 3])) - sparse_var = Variable(sparse_mat) - dense_mat = sparse_mat.to_dense().random_(0, 5) - dense_var = Variable(dense_mat) - - for test_name, test_fn, test_cuda in to_test_mixed: - if sparse_var.is_cuda and not test_cuda: - continue - sp_var = sparse_var.clone() - de_var = dense_var.clone() - sp_mat = sparse_mat.clone() - de_mat = dense_mat.clone() - self.assertEqual(test_fn(sp_var, de_var).data, - test_fn(sp_mat, de_mat), test_name) + i = self.IndexTensor([ + [1, 3, 0, 4], + [2, 1, 2, 3], + ]) + v = self.ValueTensor(4, 2, 0) + x = self.SparseTensor(i, v, torch.Size([5, 4, 2, 0])).coalesce() + dense = self.ValueTensor(5, 4, 2, 0) + res = dense.sparse_mask(x) + exp_v = self.ValueTensor(4, 2, 0) + expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2, 0])) + self.assertEqual(res, expected) @skipIfRocm def test_sparse_mask_hybrid(self): self._test_sparse_mask_hybrid_fixed() - self._test_sparse_mask_shape([5, 6], [2, 3]) - self._test_sparse_mask_shape([10, 10, 10], [3]) - self._test_sparse_mask_shape([50, 30, 20], [2]) - self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5], [2]) + self._test_sparse_mask_shape(9, 12, [5, 6], [2, 3]) + self._test_sparse_mask_shape(9, 12, [10, 10, 10], [3]) + self._test_sparse_mask_shape(9, 12, [50, 30, 20], [2]) + self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) + self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2]) + self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2]) + self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2]) + 
self._test_sparse_mask_shape(9, 12, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(0, 0, [10, 10, 0], [2, 0]) + + def _test_zeros(self, nnzs, shape, out_shape_i, out_shape_v=None): + out_shape = out_shape_i + (out_shape_v or []) + for nnz in nnzs: + out, _, _ = self._gen_sparse(len(out_shape_i), nnz, out_shape) + torch.zeros(*shape, out=out) + self.assertEqual(tuple(out.size()), tuple(shape)) + self.assertTrue(out._indices().numel() == out._values().numel() == 0) + self.assertEqual(out._nnz(), 0) + self.assertEqual(out._sparseDims(), len(shape)) + self.assertEqual(out._denseDims(), 0) + + def test_zeros(self): + def test_shape(i_shapes, v_shapes, shape, nnzs): + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros(nnzs, shape, i_shapes[:i_dim], v_shapes[:v_dim]) + test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 4], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 4], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 4], [9, 12]) + test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 0], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 0], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 0], [9, 12]) + + def _test_zeros_like(self, nnzs, template_shape_i, template_shape_v=None): + template_shape_v = template_shape_v or [] + template_shape = template_shape_i + template_shape_v + for nnz in nnzs: + t, _, _ = self._gen_sparse(len(template_shape_i), nnz, template_shape) + res = torch.zeros_like(t) + self.assertEqual(tuple(res.size()), tuple(template_shape)) + self.assertTrue(res._indices().numel() == res._values().numel() == 0) + self.assertEqual(res._nnz(), 0) + self.assertEqual(res._sparseDims(), len(template_shape_i)) + self.assertEqual(res._denseDims(), len(template_shape_v)) + + def test_zeros_like(self): + def test_shape(i_shapes, v_shapes, nnzs): + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros_like(nnzs, i_shapes[:i_dim], v_shapes[:v_dim]) + test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) + test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) + + def _test_log1p_tensor(self, input, dense_tensor): + expected_output = torch.tensor(dense_tensor).log1p_() + self.assertEqual(expected_output, input.log1p().to_dense()) + self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) + + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported yet"): + input.log1p_() + + input.requires_grad_() + self.assertTrue(input.requires_grad) + + # test autograd + x = input.clone() + y = input.log1p() + with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"): + y.backward(x) + + @skipIfRocm + def test_log1p(self): + input = torch.sparse_coo_tensor( + torch.LongTensor([[0], [1], [2]]).transpose(1, 0).clone().detach(), + torch.FloatTensor([3, 4, 5]), + torch.Size([3]), + device=self.device) + self._test_log1p_tensor(input, [3., 4., 5.]) + + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0).clone().detach(), + torch.FloatTensor([2, 3, 4, 1, 1, 
1]), + torch.Size([3]), + device=self.device) + self._test_log1p_tensor(input_uncoalesced, [3., 4., 5.]) + + input = torch.sparse_coo_tensor( + torch.zeros([2, 0]), + torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + torch.Size([0, 0, 5, 5, 5, 5, 5, 5, 0]), + device=self.device) + self._test_log1p_tensor(input, torch.zeros([0, 0, 5, 5, 5, 5, 5, 5, 0])) + + input = torch.sparse_coo_tensor( + torch.zeros([1, 5]), + torch.zeros([5, 6, 0]), + torch.Size([5, 6, 0]), + device=self.device) + self._test_log1p_tensor(input, torch.zeros([5, 6, 0])) @skipIfRocm def test_sparse_add_coalesce(self): @@ -959,30 +1082,55 @@ def test_sparse_add_coalesce(self): self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) + i = self.IndexTensor([[1, 2, 1]]) + v = self.ValueTensor(3, 0) + x = self.SparseTensor(i, v, torch.Size([3, 0])) + y = self.SparseTensor(i, v, torch.Size([3, 0])) + z = x + y + + self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) + @cuda_only def test_storage_not_null(self): x = torch.cuda.sparse.FloatTensor(2) self.assertNotEqual(x.get_device(), -1) + x = torch.cuda.sparse.FloatTensor(2, 0) + self.assertNotEqual(x.get_device(), -1) + @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @skipIfRocm def test_same_gpu(self): + def check_device(x, device_id): + self.assertEqual(x.get_device(), device_id) + self.assertEqual(x._values().get_device(), device_id) + self.assertEqual(x._indices().get_device(), device_id) + i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(1) x = self.SparseTensor(i, v, torch.Size([3]), device=1) - self.assertEqual(x.get_device(), 1) - self.assertEqual(x._values().get_device(), 1) - self.assertEqual(x._indices().get_device(), 1) + check_device(x, 1) + + i = self.IndexTensor([[2]]).cuda(1) + v = self.ValueTensor(1, 0).cuda(1) + x = self.SparseTensor(i, v, torch.Size([3, 0]), device=1) + check_device(x, 1) x = self.SparseTensor(3, device=1) - self.assertEqual(x.get_device(), 1) - self.assertEqual(x._values().get_device(), 1) - self.assertEqual(x._indices().get_device(), 1) + check_device(x, 1) + x = self.SparseTensor(3, 0, device=1) + check_device(x, 1) + + i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(0) self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3]))) + i = self.IndexTensor([[2]]).cuda(1) + v = self.ValueTensor(1, 0).cuda(0) + self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3, 0]))) + def _test_new_device(self, size, device): with torch.cuda.device(device): x = torch.cuda.sparse.DoubleTensor(*size) @@ -997,6 +1145,7 @@ def test_new_device_single_gpu(self): self._test_new_device((), 0) self._test_new_device((30, 20), 0) self._test_new_device((30, 20, 10), 0) + self._test_new_device((30, 20, 10, 0), 0) @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @@ -1004,49 +1153,67 @@ def test_new_device_multi_gpu(self): self._test_new_device((), 1) self._test_new_device((30, 20), 1) self._test_new_device((30, 20, 10), 1) + self._test_new_device((30, 20, 10, 0), 1) @skipIfRocm def test_new(self): - x, indices, values = self._gen_sparse(3, 10, 100) - if not x.is_cuda: - # CUDA sparse tensors currently requires the size to be - # specified if nDimV > 0 - self.assertEqual(x.new(indices, values), x) - self.assertEqual(x.new(indices, values, x.size()), x) + def test_shape(sparse_dims, nnz, with_size): + x, indices, values = self._gen_sparse(sparse_dims, nnz, with_size) + if not x.is_cuda: + # CUDA sparse tensors 
currently requires the size to be + # specified if nDimV > 0 + self.assertEqual(x.new(indices, values), x) + self.assertEqual(x.new(indices, values, x.size()), x) + + test_shape(3, 10, 100) + test_shape(3, 0, [100, 100, 0]) @cpu_only # not really, but we only really want to run this once @skipIfRocm def test_factory(self): - default_size = torch.Size([1, 3]) - size = torch.Size([3, 3]) - for include_size in [True, False]: - for use_tensor_idx in [True, False]: - for use_tensor_val in [True, False]: - for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): - # have to include size with cuda sparse tensors - include_size = include_size or use_cuda - dtype = torch.float64 - long_dtype = torch.int64 - device = torch.device('cpu') if not use_cuda else torch.device(torch.cuda.device_count() - 1) - indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) - values = torch.tensor([1.], dtype=dtype) if use_tensor_val else 1. - if include_size: - sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, - device=device, requires_grad=True) - else: - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, - device=device, requires_grad=True) - self.assertEqual(indices, sparse_tensor._indices()) - self.assertEqual(values, sparse_tensor._values()) - self.assertEqual(size if include_size else default_size, sparse_tensor.size()) - self.assertEqual(dtype, sparse_tensor.dtype) - if use_cuda: - self.assertEqual(device, sparse_tensor._values().device) - self.assertEqual(True, sparse_tensor.requires_grad) + for test_empty_tensor in [True, False]: + if test_empty_tensor: + default_size = torch.Size([1, 3, 0]) + size = torch.Size([3, 3, 0]) + else: + default_size = torch.Size([1, 3]) + size = torch.Size([3, 3]) + for include_size in [True, False]: + for use_tensor_idx in [True, False]: + for use_tensor_val in [True, False]: + for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): + # have to include size with cuda sparse tensors + include_size = include_size or use_cuda + dtype = torch.float64 + long_dtype = torch.int64 + device = torch.device('cpu') if not use_cuda else \ + torch.device(torch.cuda.device_count() - 1) + indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) + if test_empty_tensor: + values = self.ValueTensor(1, 0) + else: + if use_tensor_val: + values = torch.tensor([1.], dtype=dtype) + else: + values = 1. 
+ if include_size: + sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, + device=device, requires_grad=True) + else: + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, + device=device, requires_grad=True) + self.assertEqual(indices, sparse_tensor._indices()) + self.assertEqual(values, sparse_tensor._values()) + self.assertEqual(size if include_size else default_size, sparse_tensor.size()) + self.assertEqual(dtype, sparse_tensor.dtype) + if use_cuda: + self.assertEqual(device, sparse_tensor._values().device) + self.assertEqual(True, sparse_tensor.requires_grad) @skipIfRocm def test_factory_size_check(self): - indices = self.IndexTensor([[1, 2], [0, 2]]) + indices = self.IndexTensor([[1, 2], + [0, 2]]) values = self.ValueTensor([.5, .5]) sizes = torch.Size([2, 3]) with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): @@ -1056,12 +1223,34 @@ def test_factory_size_check(self): with self.assertRaisesRegex(RuntimeError, "found negative index"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[1, 2], [0, 2]]) + indices = self.IndexTensor([[1, 2], + [0, 2]]) + values = self.ValueTensor(2, 1, 0) + sizes = torch.Size([2, 3, 1, 0]) + with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): + torch.sparse_coo_tensor(indices, values, sizes) + + indices = self.IndexTensor([[1, 2], + [0, 2]]) + values = self.ValueTensor(2, 2, 2) + sizes = torch.Size([0, 0, 2, 2]) + with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): + torch.sparse_coo_tensor(indices, values, sizes) + + indices = self.IndexTensor([[1, 2], + [0, 2]]) values = self.ValueTensor([[1, 1, 1], [1, 1, 1]]) sizes = torch.Size([3, 3, 2]) with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) + indices = self.IndexTensor([[1, 2], + [0, 2]]) + values = self.ValueTensor(2, 1, 0) + sizes = torch.Size([3, 3, 2, 0]) + with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): + torch.sparse_coo_tensor(indices, values, sizes) + def test_factory_default(self): tensor = self.SparseTensor() expected_indices = self.IndexTensor(1, 0) @@ -1095,25 +1284,31 @@ def test_factory_nnz(self): with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): torch.sparse_coo_tensor(indices, values, sizes) - def _test_factory_tensor_shape(self, i_shape, v_shape, size, expected_size): - device = 'cuda' if self.is_cuda else 'cpu' - if size: - t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), torch.Size(size), device=device) - else: - t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) - expected_indices = torch.empty(i_shape, device=device) - expected_values = torch.empty(v_shape, device=device) - expected_size = torch.Size(expected_size) - self.assertEqual(t._indices(), expected_indices) - self.assertEqual(t._values(), expected_values) - self.assertEqual(t.size(), expected_size) + indices = self.IndexTensor([[0]]) # (sparseDims, nnz): (1, 1) + values = self.ValueTensor(2, 0) # (nnz, ...): (2, 0) + sizes = torch.Size([2, 0]) + with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): + torch.sparse_coo_tensor(indices, values, sizes) def test_factory_nnz_zero(self): - self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) - self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) - 
self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) - self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) - self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) + def test_shape(i_shape, v_shape, size, expected_size): + device = 'cuda' if self.is_cuda else 'cpu' + if size: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), torch.Size(size), device=device) + else: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) + expected_indices = torch.empty(i_shape, device=device) + expected_values = torch.empty(v_shape, device=device) + expected_size = torch.Size(expected_size) + self.assertEqual(t._indices(), expected_indices) + self.assertEqual(t._values(), expected_values) + self.assertEqual(t.size(), expected_size) + + test_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) + test_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) + test_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) + test_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) + test_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) @skipIfRocm def test_factory_dense_dims(self): @@ -1123,6 +1318,12 @@ def test_factory_dense_dims(self): with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) + indices = self.IndexTensor([[0]]) + values = self.ValueTensor(1, 2, 3, 0) + sizes = torch.Size([1, 3, 4, 0]) + with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): + torch.sparse_coo_tensor(indices, values, sizes) + @cpu_only def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float32)) @@ -1132,6 +1333,13 @@ def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1])) self.assertEqual(torch.int64, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.FloatTensor(1, 0)) + self.assertEqual(torch.float32, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.DoubleTensor(1, 0)) + self.assertEqual(torch.float64, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.LongTensor(1, 0)) + self.assertEqual(torch.int64, t.dtype) + @cuda_only @skipIfRocm def test_factory_device_type_inference(self): @@ -1140,41 +1348,66 @@ def test_factory_device_type_inference(self): for indices_device in ['cuda', 'cpu']: for values_device in ['cuda', 'cpu']: for sparse_device in ['cuda', 'cpu', None]: - t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), - torch.tensor([1.], device=values_device), - (1, 3), device=sparse_device) - should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') - self.assertEqual(should_be_cuda, t.is_cuda) + for test_empty_tensor in [True, False]: + if test_empty_tensor: + t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), + self.ValueTensor(1, 0).to(values_device), + (1, 3, 0), device=sparse_device) + else: + t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), + torch.tensor([1.], device=values_device), + (1, 3), device=sparse_device) + should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') + self.assertEqual(should_be_cuda, t.is_cuda) @cpu_only def test_factory_copy(self): + def test_tensor(indices, values, indices_equal, 
values_equal): + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + if indices_equal: + self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + else: + self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + if values_equal: + self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + else: + self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + # both correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float64) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, True, True) + + indices = torch.tensor(([0], [2]), dtype=torch.int64) + values = torch.DoubleTensor(1, 0) + test_tensor(indices, values, True, True) # only indices correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float32) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, True, False) + + indices = torch.tensor(([0], [2]), dtype=torch.int64) + values = torch.FloatTensor(1, 0) + test_tensor(indices, values, True, True) # An empty tensor's data_ptr is always equal to 0 # only values correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float64) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, False, True) + + indices = torch.tensor(([0], [2]), dtype=torch.int32) + values = torch.DoubleTensor(1, 0) + test_tensor(indices, values, False, True) # neither correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float32) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, False, False) + + indices = torch.tensor(([0], [2]), dtype=torch.int32) + values = torch.FloatTensor(1, 0) + test_tensor(indices, values, False, True) # An empty tensor's data_ptr is always equal to 0 @cpu_only # just run once, we test both cpu and cuda @skipIfRocm @@ -1226,9 +1459,15 @@ def test_is_sparse(self): x = torch.randn(3, 3) self.assertFalse(x.is_sparse) + x = torch.randn(3, 3, 0) + self.assertFalse(x.is_sparse) + x = self.SparseTensor() self.assertTrue(x.is_sparse) + x = self.SparseTensor(1, 0) + self.assertTrue(x.is_sparse) + @skipIfRocm def test_resize_as(self): def do_test(t): @@ -1239,6 +1478,8 @@ def do_test(t): self.assertEqual(t, t + y) do_test(self.SparseTensor()) + do_test(self.SparseTensor(3, 0)) + do_test(self.SparseTensor(3, 3)) @skipIfRocm def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @@ -1265,10 +1506,13 @@ def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @skipIfRocm def test_resize(self): - # 1. Increase the size of some dense dimensions [Supported] + # 1. 
Expand the size of some dense dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 4], [2, 2, 4]) + self._test_resize_shape([1, 1], [1, 2, 0], [2, 2, 0], + [1, 1], [1, 2, 4], [2, 2, 4]) + # 2. Expand the size of some sparse dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3], [4, 2, 3]) @@ -1277,11 +1521,18 @@ def test_resize(self): self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], [2, 0], [0, 2, 4, 5], [1, 1, 2, 4, 5]) + self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], + [2, 0], [0, 2, 4, 0], [1, 1, 2, 4, 0]) + # 4. Add dims to dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3, 4], [2, 2, 3, 4]) + with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): + self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], + [1, 1], [1, 2, 3, 0], [2, 2, 3, 0]) + # 5. Remove dims from dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], @@ -1302,6 +1553,11 @@ def test_resize(self): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 2], [2, 2, 2]) + with self.assertRaisesRegex(RuntimeError, "shrinking the size of dense dimensions"): + self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], + [1, 1], [1, 2, 0], [2, 2, 0]) + + @skipIfRocm def test_is_nonzero(self): self.assertTrue(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0],), 0., (1,)).is_nonzero()) @@ -1309,6 +1565,8 @@ def test_is_nonzero(self): self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (0., 0.), (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (-1., 1.), (1,)).is_nonzero()) self.assertTrue(torch.sparse_coo_tensor(torch.zeros(0, 1), 12.3, []).is_nonzero()) # scalar sparse tensor + with self.assertRaisesRegex(RuntimeError, "bool value of Tensor with no values is ambiguous"): + torch.sparse_coo_tensor(([0, 1],), self.ValueTensor(2, 0), (4, 0)).is_nonzero() class TestUncoalescedSparse(TestSparse): @@ -1339,11 +1597,26 @@ class TestSparseOneOff(TestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm def test_cuda_from_cpu(self): - self.assertExpectedRaises( - RuntimeError, - lambda: torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), - torch.randn(4, 4, 4), - [3, 4, 4])) + with self.assertRaisesRegex( + RuntimeError, + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4), + [3, 4, 4]) + + with self.assertRaisesRegex( + RuntimeError, + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4, 0), + [3, 4, 4, 0]) + + with self.assertRaisesRegex( + RuntimeError, + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), + torch.randn(0, 4, 4, 0), + [0, 4, 4, 0]) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm @@ -1352,7 +1625,22 @@ def test_cuda_sparse_cpu_dense_add(self): sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4).cuda(), [3, 4, 4]) - self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) + with 
self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): + x + sparse_y + + x = torch.zeros(3, 4, 4, 0) + sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4, 0).cuda(), + [3, 4, 4, 0]) + with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): + x + sparse_y + + x = torch.zeros(0, 4, 4, 0) + sparse_y = torch.cuda.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), + torch.randn(0, 4, 4, 0).cuda(), + [0, 4, 4, 0]) + with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): + x + sparse_y if __name__ == '__main__': diff --git a/test/test_torch.py b/test/test_torch.py index a01a93f580865c..5d8fe640c20af1 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -842,6 +842,7 @@ def _test_norm(self, device): res = x.norm(p).item() expected = np.linalg.norm(xn, p) self.assertEqual(res, expected, "full reduction failed for {}-norm".format(p)) + # one dimension x = torch.randn(5, 5, device=device) xn = x.cpu().numpy() @@ -851,6 +852,13 @@ def _test_norm(self, device): self.assertEqual(res.shape, expected.shape) self.assertTrue(np.allclose(res, expected), "dim reduction failed for {}-norm".format(p)) + # matrix norm + for p in ['fro', 'nuc']: + res = x.norm(p).cpu().numpy() + expected = np.linalg.norm(xn, p) + self.assertEqual(res.shape, expected.shape) + self.assertTrue(np.allclose(res, expected), "dim reduction failed for {}-norm".format(p)) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_norm(self): self._test_norm(self, device='cpu') @@ -7378,22 +7386,51 @@ def test_norm_fastpaths(self): expected = torch.pow(x.pow(3).abs().sum(1), 1.0 / 3.0) self.assertEqual(result, expected) - def test_bernoulli(self): - t = torch.ByteTensor(10, 10) + @staticmethod + def _test_bernoulli(self, p_dtype, device): + for trivial_p in ([0, 1], [1, 0, 1, 1, 0, 1]): + x = torch.tensor(trivial_p, dtype=p_dtype, device=device) + self.assertEqual(x.bernoulli().tolist(), trivial_p) def isBinary(t): - return torch.ne(t, 0).mul_(torch.ne(t, 1)).sum() == 0 + return torch.ne(t, 0).mul_(torch.ne(t, 1)).sum().item() == 0 - p = 0.5 - t.bernoulli_(p) + p = torch.rand(5, 5, dtype=p_dtype, device=device) + self.assertTrue(isBinary(p.bernoulli())) + + p = torch.rand(5, dtype=p_dtype, device=device).expand(5, 5) + self.assertTrue(isBinary(p.bernoulli())) + + p = torch.rand(5, 5, dtype=p_dtype, device=device) + torch.bernoulli(torch.rand_like(p), out=p) + self.assertTrue(isBinary(p)) + + p = torch.rand(5, dtype=p_dtype, device=device).expand(5, 5) + torch.bernoulli(torch.rand_like(p), out=p) + self.assertTrue(isBinary(p)) + + # test that it works with integral tensors + t = torch.empty(10, 10, dtype=torch.uint8, device=device) + + t.fill_(2) + t.bernoulli_(0.5) self.assertTrue(isBinary(t)) - p = torch.rand(10, 10) + p = torch.rand(10, dtype=p_dtype, device=device).expand(10, 10) + t.fill_(2) t.bernoulli_(p) self.assertTrue(isBinary(t)) - q = torch.rand(5, 5) - self.assertTrue(isBinary(q.bernoulli())) + t.fill_(2) + torch.bernoulli(torch.rand_like(t, dtype=p_dtype), out=t) + self.assertTrue(isBinary(t)) + + t.fill_(2) + t.bernoulli_(torch.rand_like(t, dtype=p_dtype)) + self.assertTrue(isBinary(t)) + + def test_bernoulli(self): + self._test_bernoulli(self, torch.double, 'cpu') def test_normal(self): q = torch.Tensor(100, 100) diff --git a/test/test_utils.py b/test/test_utils.py index af93e3652e63be..971e8a4f05f8e0 100644 
--- a/test/test_utils.py +++ b/test/test_utils.py @@ -441,98 +441,6 @@ def test_gpu(self): lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5)) -class TestLuaReader(TestCase): - - @staticmethod - def _module_test(name, test): - def do_test(self): - module = test['module'] - input = test['input'] - grad_output = test['grad_output'] - if hasattr(self, '_transform_' + name): - input = getattr(self, '_transform_' + name)(input) - output = module.forward(input) - module.zeroGradParameters() - grad_input = module.backward(input, grad_output) - self.assertEqual(output, test['output']) - self.assertEqual(grad_input, test['grad_input']) - if module.parameters() is not None: - params, d_params = module.parameters() - self.assertEqual(params, test['params']) - self.assertEqual(d_params, test['d_params']) - else: - self.assertFalse('params' in test and test['params']) - self.assertFalse('params' in test and test['d_params']) - return do_test - - @staticmethod - def _criterion_test(name, test): - def do_test(self): - module = test['module'] - input = test['input'] - if name == 'L1Cost': - target = None - else: - target = test['target'] - if hasattr(self, '_transform_' + name): - input, target = getattr(self, '_transform_' + name)(input, target) - - output = module.forward(input, target) - grad_input = module.backward(input, target) - self.assertEqual(output, test['loss']) - self.assertEqual(grad_input, test['grad_input']) - return do_test - - @classmethod - def init(cls): - try: - path = download_file('https://download.pytorch.org/test_data/legacy_modules.t7') - except unittest.SkipTest: - return - long_size = 8 if sys.platform == 'win32' else None - tests = load_lua(path, long_size=long_size) - for name, test in tests['modules'].items(): - if name == "HardShrink": - continue - test_name = 'test_' + name.replace('nn.', '') - setattr(cls, test_name, cls._module_test(name, test)) - for name, test in tests['criterions'].items(): - if name == "HardShrink": - continue - test_name = 'test_' + name.replace('nn.', '') - setattr(cls, test_name, cls._criterion_test(name, test)) - - def _transform_Index(self, input): - return [input[0], input[1].sub(1)] - - def _transform_LookupTable(self, input): - return input.sub(1) - - def _transform_MultiLabelMarginCriterion(self, input, target): - return input, target.sub(1) - - def _transform_ClassNLLCriterion(self, input, target): - return input, target.sub(1) - - def _transform_SpatialClassNLLCriterion(self, input, target): - return input, target.sub(1) - - def _transform_ClassSimplexCriterion(self, input, target): - return input, target.sub(1) - - def _transform_CrossEntropyCriterion(self, input, target): - return input, target.sub(1) - - def _transform_ParallelCriterion(self, input, target): - return input, [target[0].sub(1), target[1]] - - def _transform_MultiCriterion(self, input, target): - return input, target.sub(1) - - def _transform_MultiMarginCriterion(self, input, target): - return input, target.sub(1) - - @unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') class TestBottleneck(TestCase): def _run(self, command): @@ -700,6 +608,4 @@ def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): if __name__ == '__main__': - from torch.utils.serialization import load_lua - TestLuaReader.init() run_tests() diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index caec9575ef9cca..e7d89246179cf8 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -143,7 
+143,14 @@ batch1: grad.bmm(batch2.transpose(1, 2)) * alpha batch2: batch1.transpose(1, 2).bmm(grad) * alpha -- name: bernoulli(Tensor self, double p, Generator generator) +- name: bernoulli(Tensor self, Generator generator) + self: zeros_like(grad) + +- name: bernoulli_(Tensor self, Tensor p, Generator generator) + self: zeros_like(grad) + p: zeros_like(p) + +- name: bernoulli_(Tensor self, double p, Generator generator) self: zeros_like(grad) - name: bmm(Tensor self, Tensor mat2) diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 249ba042bb7056..81856c62ad07d9 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -396,7 +396,7 @@ def append_actuals_formals(actual, formal): if not has_tensor_options: # add type, device formals and corresponding actuals. - # The type actual isthe ATen type mapped from (ScalarType, Layout, Device) + # The type actual is the ATen type mapped from (ScalarType, Layout, Device) # The device actual is the corresponding AutoGPU index for the Device. formal_args.append(parsed_type_args[1]) formal_args.append(device_type) diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index acc5bed4a98bfb..123ba5f303e097 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -178,7 +178,7 @@ goto:eof -DBUILD_SHARED_LIBS="%BUILD_SHARED_LIBS%" ^ -DBUILD_PYTHON=%BUILD_PYTHON% ^ -DBUILD_BINARY=%BUILD_BINARY% ^ - -DBUILD_TEST=OFF ^ + -DBUILD_TEST=%BUILD_TEST% ^ -DINSTALL_TEST=%INSTALL_TEST% ^ -DBUILD_CAFFE2_OPS=%BUILD_CAFFE2_OPS% ^ -DONNX_NAMESPACE=%ONNX_NAMESPACE% ^ diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 58814d21ffa301..749d438707c9ed 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -175,6 +175,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/passes/requires_grad_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_undef.cpp ${TORCH_SRC_DIR}/csrc/jit/fusers/interface.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/pretty_print.cpp ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 151b0dee218876..d55db5ddfee614 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -434,16 +434,35 @@ def add_docstr_all(method, docstr): add_docstr_all('bernoulli', r""" -bernoulli() -> Tensor +bernoulli(*, generator=None) -> Tensor + +Returns a result tensor where each :math:`\texttt{result[i]}` is independently +sampled from :math:`\text{Bernoulli}(\texttt{self[i]})`. :attr:`self` must have +floating point ``dtype``, and the result will have the same ``dtype``. See :func:`torch.bernoulli` """) add_docstr_all('bernoulli_', r""" -bernoulli_() -> Tensor +.. function:: bernoulli_(p=0.5, *, generator=None) -> Tensor + + Fills each location of :attr:`self` with an independent sample from + :math:`\text{Bernoulli}(\texttt{p})`. :attr:`self` can have integral + ``dtype``. + +.. function:: bernoulli_(p_tensor, *, generator=None) -> Tensor + + :attr:`p_tensor` should be a tensor containing probabilities to be used for + drawing the binary random number. + + The :math:`\text{i}^{th}` element of :attr:`self` tensor will be set to a + value sampled from :math:`\text{Bernoulli}(\texttt{p\_tensor[i]})`. + + :attr:`self` can have integral ``dtype``, but :attr`p_tensor` must have + floating point ``dtype``. 
-In-place version of :meth:`~Tensor.bernoulli` +See also :meth:`~Tensor.bernoulli` and :func:`torch.bernoulli` """) add_docstr_all('bincount', diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 9abb9f1bbf76d3..c43334aa438943 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -584,7 +584,7 @@ def parse_kwargs(desc): add_docstr(torch.bernoulli, r""" -bernoulli(input, out=None) -> Tensor +bernoulli(input, *, generator=None, out=None) -> Tensor Draws binary random numbers (0 or 1) from a Bernoulli distribution. @@ -594,14 +594,17 @@ def parse_kwargs(desc): :math:`0 \leq \text{input}_i \leq 1`. The :math:`\text{i}^{th}` element of the output tensor will draw a -value `1` according to the :math:`\text{i}^{th}` probability value given +value :math:`1` according to the :math:`\text{i}^{th}` probability value given in :attr:`input`. .. math:: \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i}) The returned :attr:`out` tensor only has values 0 or 1 and is of the same -shape as :attr:`input` +shape as :attr:`input`. + +:attr:`out` can have integral ``dtype``, but :attr`input` must have floating +point ``dtype``. Args: input (Tensor): the input tensor of probability values for the Bernoulli distribution @@ -3059,60 +3062,6 @@ def parse_kwargs(desc): [ 3, 3]]) """) -add_docstr(torch.norm, - r""" -.. function:: norm(input, p=2) -> Tensor - -Returns the p-norm of the :attr:`input` tensor. - -.. math:: - ||x||_{p} = \sqrt[p]{x_{1}^{p} + x_{2}^{p} + \ldots + x_{N}^{p}} - -Args: - input (Tensor): the input tensor - p (float, optional): the exponent value in the norm formulation -Example:: - - >>> a = torch.randn(1, 3) - >>> a - tensor([[-0.5192, -1.0782, -1.0448]]) - >>> torch.norm(a, 3) - tensor(1.3633) - -.. function:: norm(input, p, dim, keepdim=False, out=None) -> Tensor - -Returns the p-norm of each row of the :attr:`input` tensor in the given -dimension :attr:`dim`. - -If :attr:`keepdim` is ``True``, the output tensor is of the same size as -:attr:`input` except in the dimension :attr:`dim` where it is of size 1. -Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting -in the output tensor having 1 fewer dimension than :attr:`input`. - -Args: - input (Tensor): the input tensor - p (float): the exponent value in the norm formulation - dim (int): the dimension to reduce - keepdim (bool): whether the output tensor has :attr:`dim` retained or not - out (Tensor, optional): the output tensor - -Example:: - - >>> a = torch.randn(4, 2) - >>> a - tensor([[ 2.1983, 0.4141], - [ 0.8734, 1.9710], - [-0.7778, 0.7938], - [-0.1342, 0.7347]]) - >>> torch.norm(a, 2, 1) - tensor([ 2.2369, 2.1558, 1.1113, 0.7469]) - >>> torch.norm(a, 0, 1, True) - tensor([[ 2.], - [ 2.], - [ 2.], - [ 2.]]) -""") - add_docstr(torch.normal, r""" .. 
function:: normal(mean, std, out=None) -> Tensor diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 26dc9daf4a7350..76191ea9688bcf 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -23,7 +23,7 @@ def make_jacobian(input, num_out): if not input.requires_grad: return None return torch.zeros(input.nelement(), num_out, dtype=input.dtype) - elif isinstance(input, container_abcs.Iterable): + elif isinstance(input, container_abcs.Iterable) and not isinstance(input, str): jacobians = list(filter( lambda x: x is not None, (make_jacobian(elem, num_out) for elem in input))) if not jacobians: @@ -37,7 +37,7 @@ def iter_tensors(x, only_requiring_grad=False): if isinstance(x, torch.Tensor): if x.requires_grad or not only_requiring_grad: yield x - elif isinstance(x, container_abcs.Iterable): + elif isinstance(x, container_abcs.Iterable) and not isinstance(x, str): for elem in x: for result in iter_tensors(elem, only_requiring_grad): yield result diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 97a0fa8e97d85f..574eddf1812bc2 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -428,22 +428,9 @@ def __repr__(self): ################################################################################ # Utilities -def demangle(name): - """Demangle a C++ identifier using c++filt""" - try: - with open(os.devnull, 'w') as devnull: - is_win = sys.platform == 'win32' - filt_cmd = ['undname', name] if is_win else ['c++filt', '-n', name] - orig_name = subprocess.check_output(filt_cmd, stderr=devnull).rstrip().decode("ascii") - orig_name = re.search('is :- \"(.*)"', orig_name).group(1) if is_win else orig_name - return orig_name - except (subprocess.CalledProcessError, AttributeError, FileNotFoundError, OSError): - return name - - class StringTable(defaultdict): def __missing__(self, key): - self[key] = demangle(key) + self[key] = torch._C._demangle(key) return self[key] @@ -526,7 +513,7 @@ def parse_nvprof_trace(path): # Parse strings table strings = {} for r in conn.execute("SELECT _id_ as id, value FROM StringTable"): - strings[r["id"]] = demangle(r["value"]) + strings[r["id"]] = torch._C._demangle(r["value"]) # First, find all functions and create FunctionEvents for them marker_query = """ diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 3381e423673374..f6b41b234feb5d 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -605,6 +605,10 @@ static PyObject* initModule() { // setting up TH Errors so that they throw C++ exceptions at::init(); + + py::reinterpret_borrow(module) + .def("_demangle", &at::demangle); + // Set ATen warnings to issue Python warnings at::Warning::set_warning_handler(&warning_handler); diff --git a/torch/csrc/api/include/torch/arg.h b/torch/csrc/api/include/torch/arg.h new file mode 100644 index 00000000000000..d4640ebe3f6d76 --- /dev/null +++ b/torch/csrc/api/include/torch/arg.h @@ -0,0 +1,17 @@ +#pragma once + +#include + +#define TORCH_ARG(T, name) \ + auto name(const T& new_##name)->decltype(*this) { /* NOLINT */ \ + this->name##_ = new_##name; \ + return *this; \ + } \ + auto name(T&& new_##name)->decltype(*this) { /* NOLINT */ \ + this->name##_ = std::move(new_##name); \ + return *this; \ + } \ + const T& name() const noexcept { /* NOLINT */ \ + return this->name##_; \ + } \ + T name##_ /* NOLINT */ diff --git a/torch/csrc/api/include/torch/nn/pimpl.h b/torch/csrc/api/include/torch/nn/pimpl.h index af83f2e0ed7318..48c331e148686f 100644 --- 
a/torch/csrc/api/include/torch/nn/pimpl.h +++ b/torch/csrc/api/include/torch/nn/pimpl.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -155,20 +156,6 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { } // namespace nn } // namespace torch -#define TORCH_ARG(T, name) \ - auto name(const T& new_##name)->decltype(*this) { /* NOLINT */ \ - this->name##_ = new_##name; \ - return *this; \ - } \ - auto name(T&& new_##name)->decltype(*this) { /* NOLINT */ \ - this->name##_ = std::move(new_##name); \ - return *this; \ - } \ - const T& name() const noexcept { /* NOLINT */ \ - return this->name##_; \ - } \ - T name##_ /* NOLINT */ - /// Defines a class `Name` which inherits from `nn::ModuleHolder` to provide a /// wrapper over a `std::shared_ptr`. #define TORCH_MODULE_IMPL(Name, Impl) \ diff --git a/torch/csrc/api/include/torch/optim/adam.h b/torch/csrc/api/include/torch/optim/adam.h index 514715b6b0c181..3671d424ca1d62 100644 --- a/torch/csrc/api/include/torch/optim/adam.h +++ b/torch/csrc/api/include/torch/optim/adam.h @@ -1,7 +1,7 @@ #pragma once +#include #include -#include #include #include diff --git a/torch/csrc/api/include/torch/optim/lbfgs.h b/torch/csrc/api/include/torch/optim/lbfgs.h index eec8c8fdc415d0..052505dc344035 100644 --- a/torch/csrc/api/include/torch/optim/lbfgs.h +++ b/torch/csrc/api/include/torch/optim/lbfgs.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include diff --git a/torch/csrc/api/include/torch/optim/rmsprop.h b/torch/csrc/api/include/torch/optim/rmsprop.h index 4a84331f92810c..81cc040322b158 100644 --- a/torch/csrc/api/include/torch/optim/rmsprop.h +++ b/torch/csrc/api/include/torch/optim/rmsprop.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include diff --git a/torch/csrc/api/include/torch/optim/sgd.h b/torch/csrc/api/include/torch/optim/sgd.h index 345d0343c1ba85..4b2716bfb79101 100644 --- a/torch/csrc/api/include/torch/optim/sgd.h +++ b/torch/csrc/api/include/torch/optim/sgd.h @@ -1,7 +1,7 @@ #pragma once +#include #include -#include #include #include #include diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index b306426025e025..eddf5ca357c135 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -45,8 +45,8 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) m.def("_enable_profiler", torch::autograd::profiler::enableProfiler); m.def("_disable_profiler", torch::autograd::profiler::disableProfiler); - m.def("_push_range", [](const char* name) { - torch::autograd::profiler::pushRange(name); + m.def("_push_range", [](std::string name) { + torch::autograd::profiler::pushRange(std::move(name)); }); m.def("_pop_range", []() { torch::autograd::profiler::popRange(); }); diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 77c884e9b71b64..f6abee4c6bb24b 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -6,11 +6,11 @@ namespace torch { namespace autograd { namespace profiler { ProfilerState state = ProfilerState::Disabled; -uint32_t next_thread_id = 0; +uint16_t next_thread_id = 0; std::mutex all_event_lists_mutex; std::list> all_event_lists; thread_local std::shared_ptr event_list; -thread_local int32_t thread_id; +thread_local uint16_t thread_id; RangeEventList& getEventList() { if (!event_list) { @@ -23,6 +23,9 @@ RangeEventList& getEventList() { } void mark(std::string name, bool include_cuda /* = true */) { + if (state == ProfilerState::Disabled) { + return; + } if (state == 
ProfilerState::NVTX) { #ifdef USE_CUDA nvtxMarkA(name.c_str()); @@ -39,7 +42,12 @@ void mark(std::string name, bool include_cuda /* = true */) { } } -void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*= -1*/) { +const char* c_str(const char *str) { return str; } +// NB: non-const to disallow temporaries (lifetime issues) +const char* c_str(std::string& str) { return str.c_str(); } + +template +void pushRangeImpl(T name, const char* msg="", int64_t sequence_nr=-1) { if (state == ProfilerState::Disabled) { return; } @@ -49,9 +57,9 @@ void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*= std::stringstream s; s << name << msg << sequence_nr; nvtxRangePushA(s.str().c_str()); - } - else - nvtxRangePushA(name.c_str()); + } else { + nvtxRangePushA(c_str(name)); + } #else throw std::logic_error( "pushRange called with NVTX tracing, but compiled without CUDA"); @@ -65,6 +73,10 @@ void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*= } } +void pushRange(std::string name) { + pushRangeImpl(std::move(name)); +} + void popRange() { if (state == ProfilerState::Disabled) { return; @@ -79,45 +91,30 @@ void popRange() { } else { getEventList().record( EventKind::PopRange, - std::string(), + "", thread_id, state == ProfilerState::CUDA); } } RecordFunction::RecordFunction(Function* fn) { - if (state == ProfilerState::Disabled) - return; - pushFunctionRange(fn); + // NB: we don't use fn->name() here, because it will unnecessarily allocate + // a string. We will run a demangler on all the names anyway, so it's ok to + // avoid doing it now. + pushRangeImpl(typeid(*fn).name(), ", stashed seq=", fn->sequence_nr()); } RecordFunction::RecordFunction(std::string name) { - if (state == ProfilerState::Disabled) - return; - pushRange(std::move(name)); + pushRangeImpl(std::move(name)); } RecordFunction::RecordFunction(const char* name) { - if (state == ProfilerState::Disabled) - return; - pushRange(name); + pushRangeImpl(name); } -RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr) +RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr) { - if (state == ProfilerState::Disabled) - return; - pushRange(name, ", seq=", current_sequence_nr); -} - -RecordFunction::~RecordFunction() { - if (state == ProfilerState::Disabled) - return; - popRange(); -} - -void RecordFunction::pushFunctionRange(Function* fn) { - pushRange(fn->name(), ", stashed seq=", fn->sequence_nr()); + pushRangeImpl(name, ", seq=", current_sequence_nr); } #ifdef USE_CUDA diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 29dc1044fb2f10..83058b9da5353e 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -21,6 +21,9 @@ #include "ATen/cuda/CUDAContext.h" #include #endif +#ifndef _WIN32 +#include +#endif namespace torch { namespace autograd { @@ -32,36 +35,48 @@ constexpr inline size_t ceilToMultiple(size_t a, size_t b) { return ((a + b - 1) / b) * b; } -inline uint64_t getTime() { +inline int64_t getTime() { +#ifdef _WIN32 using namespace std::chrono; using clock = std::conditional::type; return duration_cast(clock::now().time_since_epoch()).count(); +#else + // clock_gettime is *much* faster than std::chrono implementation on Linux + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return static_cast(t.tv_sec) * 1000000000 + static_cast(t.tv_nsec); +#endif } -enum class EventKind { +enum class EventKind : uint16_t { Mark, PushRange, PopRange }; -struct Event { 
- Event(EventKind kind, std::string name, uint32_t thread_id, bool record_cuda) - : kind_(kind) - , name_(std::move(name)) - , thread_id_(thread_id) { +struct Event final { + Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda) + : owned_name_(new std::string(std::move(name))) + , name_ptr_(owned_name_->c_str()) + , kind_(kind) + , thread_id_(thread_id) { record(record_cuda); } + Event(EventKind kind, const char* name, uint16_t thread_id, bool record_cuda) + : name_ptr_(name) + , kind_(kind) + , thread_id_(thread_id) { record(record_cuda); } + + void record(bool record_cuda) { #ifdef USE_CUDA - if(record_cuda) { + if (record_cuda) { TORCH_CUDA_CHECK(cudaGetDevice(&device_)); TORCH_CUDA_CHECK(cudaEventCreate(&event)); auto stream = at::cuda::getCurrentCUDAStream(); cpu_ns_ = getTime(); TORCH_CUDA_CHECK(cudaEventRecord(event, stream)); - } else { - cpu_ns_ = getTime(); + return; } -#else - cpu_ns_ = getTime(); #endif + cpu_ns_ = getTime(); } std::string kind() const { switch(kind_) { @@ -71,10 +86,10 @@ struct Event { } throw std::runtime_error("unknown EventKind"); } - const std::string & name() const { - return name_; + const char* name() const { + return name_ptr_; } - uint32_t thread_id() const { + uint16_t thread_id() const { return thread_id_; } double cpu_elapsed_us(const Event & e) { @@ -108,14 +123,18 @@ struct Event { return device_; } private: - EventKind kind_; - std::string name_; - uint32_t thread_id_; int64_t cpu_ns_; // signed to allow for negative intervals + // std::string is a very large object (usually around 32B), + // and this field is used only for user-created ranges, so + // it's better to save on size of Events. + std::unique_ptr owned_name_; + const char * name_ptr_; + EventKind kind_; + uint16_t thread_id_; + int device_ = -1; #ifdef USE_CUDA cudaEvent_t event = nullptr; #endif - int device_ = -1; }; // a linked-list of fixed sized vectors, to avoid @@ -132,7 +151,14 @@ struct RangeEventList { void allocBlock() { blocks.emplace_front(); - blocks.front().reserve(num_block_elements); + auto & new_block = blocks.front(); + new_block.reserve(num_block_elements); + // Materialize all pages in the new block to release jitter when recording events. + const char * const end_ptr = reinterpret_cast(new_block.data() + num_block_elements); + for (volatile const char * ptr = reinterpret_cast(new_block.data()); + ptr < end_ptr; ptr += 4 * 1024) { + (*ptr); + } } template @@ -166,7 +192,7 @@ enum class ProfilerState { TORCH_API RangeEventList& getEventList(); TORCH_API void mark(std::string name, bool include_cuda = true); -TORCH_API void pushRange(std::string name, const char* msg = "", int64_t sequence_nr = -1); +TORCH_API void pushRange(std::string name); TORCH_API void popRange(); struct TORCH_API RecordFunction { @@ -178,10 +204,9 @@ struct TORCH_API RecordFunction { explicit RecordFunction(const char* name, int64_t current_sequence_nr); - ~RecordFunction(); - - // Needed only because we don't have Function defined yet. 
- void pushFunctionRange(Function *fn); + ~RecordFunction() { + popRange(); + } }; using thread_event_lists = std::vector>; diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index c3f9bd510a8289..07270ad34a8d0c 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -237,9 +237,15 @@ int THPVariable_set_grad(THPVariable *self, PyObject *py_grad) "can't assign Variable as its own grad"); auto& grad = ((THPVariable*)py_grad)->cdata; - auto& sparseType = var.type().toBackend(var.is_cuda() ? Backend::SparseCUDA : Backend::SparseCPU); + bool gradIsSparse = false; + auto backend = var.is_cuda() ? Backend::SparseCUDA : Backend::SparseCPU; + auto typeOpt = at::globalContext().getNonVariableTypeOpt(backend, var.type().scalarType()); + if (typeOpt) { + auto& sparseType = at::globalContext().getNonVariableType(backend, var.type().scalarType()); + gradIsSparse = grad.type() == sparseType; + } - THPUtils_assertRet(-1, grad.type() == var.type() || grad.type() == sparseType, + THPUtils_assertRet(-1, grad.type() == var.type() || gradIsSparse, "assigned grad has data of a different type"); if (var.type().is_cuda()) { THPUtils_assertRet(-1, grad.get_device() == var.get_device(), diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index 37847139bf60d6..b50dddace66c50 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -91,11 +91,9 @@ static Variable applySlice(const Variable& self, int64_t dim, PyObject* slice, b static Variable applySelect(const Variable& self, int64_t dim, int64_t index) { if (index == 0 && dim == 0 && self.dim() == 0) { - // Deprecated support for indexing 0-dim tensors as if they were 1-dim. - PyErr_WarnEx(PyExc_UserWarning, - "invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. " - "Use tensor.item() to convert a 0-dim tensor to a Python number", 1); - return at::alias(self); + throw IndexError( + "invalid index of a 0-dim tensor. 
" + "Use tensor.item() to convert a 0-dim tensor to a Python number"); } int64_t size = self.size(dim); if (index < -size || index >= size) { diff --git a/torch/csrc/jit/attributes.h b/torch/csrc/jit/attributes.h index 0a9f7e3fd26878..2610643918b642 100644 --- a/torch/csrc/jit/attributes.h +++ b/torch/csrc/jit/attributes.h @@ -12,6 +12,8 @@ namespace torch { namespace jit { +constexpr int max_tensor_display_size = 10; + enum class AttributeKind { f,fs,i,is,s,ss,t,ts,g,gs }; @@ -201,6 +203,89 @@ struct Attributes { return get(name); } + template + static void printPrimList(std::ostream & out, const std::vector & items) { + out << "["; + int i = 0; + for(auto & item : items) { + if(i++ > 0) + out << ", "; + out << item; + } + out << "]"; + } + + static std::string escapeString(std::string s) { + std::vector search = {'\n', '\t', '\v'}; + std::vector replace = {"\\n", "\\t", "\\v"}; + for (size_t i = 0; i < search.size(); i++) { + size_t pos = s.find(search[i]); + while(pos != std::string::npos) { + s.replace(pos, 1, replace[i]); + pos = s.find(search[i], pos + 1); + } + } + return s; + } + + void printValue(std::ostream & out, Symbol & name) const { + switch(kindOf(name)) { + case AttributeKind::f: + out << f(name); + break; + case AttributeKind::fs: + printPrimList(out, fs(name)); + break; + case AttributeKind::i: + out << i(name); + break; + case AttributeKind::is: + printPrimList(out, is(name)); + break; + case AttributeKind::s: + out << "\"" << escapeString(s(name)) << "\""; + break; + case AttributeKind::ss: + printPrimList(out,ss(name)); + break; + case AttributeKind::t: + { + at::Tensor tensor = t(name); + // 1-elem tensors are usually boxed scalars, so print them like it + if (tensor.numel() == 1) { + auto scalar_tensor = at::_local_scalar(tensor.view({})); + out << "{"; + if (scalar_tensor.isFloatingPoint()) { + out << scalar_tensor.toDouble(); + } else { + out << scalar_tensor.toLong(); + } + out << "}"; + } else if (tensor.numel() <= max_tensor_display_size) { + // TODO: This is awful code. Also it doesn't work on Windows. 
+ std::ostringstream tensor_ss; + tensor_ss << tensor; + std::string tensor_s{tensor_ss.str()}; + // Remove newlines + std::replace(tensor_s.begin(), tensor_s.end(), '\n', ' '); + out << tensor_s; + } else { + out << ""; + } + break; + } + case AttributeKind::ts: + out << "[]"; + break; + case AttributeKind::g: + out << ""; + break; + case AttributeKind::gs: + out << "[]"; + break; + } + } + private: // UBSAN error: https://github.com/pytorch/pytorch/issues/9055 Derived* This() __ubsan_ignore_vptr__ { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index ab6a07cbb1d959..251a6466ee3a4f 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -33,6 +33,7 @@ bool isDifferentiable(Node * n) { "aten::exp(Tensor self) -> Tensor", "aten::t(Tensor self) -> Tensor", "aten::neg(Tensor self) -> Tensor", + "aten::clamp(Tensor self, Scalar min, Scalar max) -> Tensor", "aten::type_as(Tensor self, Tensor other) -> Tensor", "aten::unsqueeze(Tensor self, int dim) -> Tensor", "aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta, Scalar alpha) -> Tensor", @@ -106,6 +107,20 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->matches("aten::relu(Tensor self) -> Tensor")) { return {grads.at(0) * (outputs.at(0) > at::Scalar(0)).type_as(outputs.at(0))}; + } else if (node->matches("aten::clamp(Tensor self, Scalar min, Scalar max) -> Tensor")) { + // we do two type_as and "*" in lieu of boolean "and" + // the "! (val > min)" is chosen such that the gradient is 0 on the + // boundary and the factor is 1 when the boundary is NaN + // the ! is expressed as "1-" for lack of a "not" function and + // the the fuser insisting on float + // A NaN input will cause the gradient to propagate through, + // the more pure approach would be to have NaNs in that case + // but that is hard to reliably code and costs extra checks + // so we decided against it, see + // https://github.com/pytorch/pytorch/pull/11574#discussion_r218104538 + return {grads.at(0) + * (1-(inputs.at(0) <= inputs.at(1)).type_as(inputs.at(0))) + * (1-(inputs.at(0) >= inputs.at(2)).type_as(inputs.at(0))), nullptr, nullptr}; } else if (node->matches("aten::exp(Tensor self) -> Tensor")) { return {grads.at(0) * (outputs.at(0))}; diff --git a/torch/csrc/jit/fusers/common/fused_kernel.cpp b/torch/csrc/jit/fusers/common/fused_kernel.cpp index 2021b9fa1b832d..54a3c57b83a754 100644 --- a/torch/csrc/jit/fusers/common/fused_kernel.cpp +++ b/torch/csrc/jit/fusers/common/fused_kernel.cpp @@ -296,6 +296,7 @@ static std::string encodeRHS(Node* n) { // TODO: some of these ops will not get generated because // we only work on float inputs/outputs, but they are here to record // that they are valid mappable ops once we handle more type + {aten::__and__, "${0} && ${1}"}, {aten::__lshift__, "${0} << ${1}"}, {aten::__or__, "${0} || ${1}"}, @@ -319,6 +320,12 @@ static std::string encodeRHS(Node* n) { {aten::sub, "(${0} - ${2}*${1})"}, {aten::rand_like, "uniform(rnd())"}, + // min, max + // It may seem unusual to have the bounds as the first case below, + // this is so that if min or max is NaN, they are "ignored" + // and when the input is NaN, the output is, too + {aten::clamp, "(${0}<${1}?${1}:(${0}>${2}?${2}:${0}))"}, + // simple derivatives {aten::_sigmoid_backward, "${0} * ${1} * (1.f - ${1})"}, {aten::_tanh_backward, "${0} * (1.f - ${1} * ${1})"}, diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 4cc59e8b9bb8e1..82b14fa0b6839d 100644 --- a/torch/csrc/jit/ir.cpp +++ 
b/torch/csrc/jit/ir.cpp @@ -6,6 +6,7 @@ #include "torch/csrc/jit/constants.h" #include "torch/csrc/jit/assertions.h" #include "torch/csrc/jit/script/compiler.h" +#include "torch/csrc/jit/passes/pretty_print.h" #include #include @@ -21,8 +22,6 @@ namespace torch { namespace jit { // Sigh, see https://stackoverflow.com/questions/8016780/undefined-reference-to-static-constexpr-char constexpr Symbol PythonOp::Kind; -constexpr int max_tensor_display_size = 10; - void printValueRef(std::ostream & out, const Value * n) { out << "%" << n->uniqueName(); } @@ -72,30 +71,6 @@ std::ostream& operator<<(std::ostream & out, const_value_list_with_types l) { } return out; } -template -void printPrimList(std::ostream & out, const std::vector & items) { - out << "["; - int i = 0; - for(auto & item : items) { - if(i++ > 0) - out << ", "; - out << item; - } - out << "]"; -} - -std::string escapeString(std::string s) { - std::vector search = {'\n', '\t', '\v'}; - std::vector replace = {"\\n", "\\t", "\\v"}; - for (size_t i = 0; i < search.size(); i++) { - size_t pos = s.find(search[i]); - while(pos != std::string::npos) { - s.replace(pos, 1, replace[i]); - pos = s.find(search[i], pos + 1); - } - } - return s; -} void printAttributes(std::ostream & out, const Node * n, bool ignore_subgraph=false) { out << "["; @@ -110,62 +85,9 @@ void printAttributes(std::ostream & out, const Node * n, bool ignore_subgraph=fa // don't want to print the qualifier since it should always // be attribute, but you might be able to track down a weird // bug by printing it out. - out << name.toUnqualString() <<"="; - switch(n->kindOf(name)) { - case AttributeKind::f: - out << n->f(name); - break; - case AttributeKind::fs: - printPrimList(out,n->fs(name)); - break; - case AttributeKind::i: - out << n->i(name); - break; - case AttributeKind::is: - printPrimList(out,n->is(name)); - break; - case AttributeKind::s: - out << "\"" << escapeString(n->s(name)) << "\""; - break; - case AttributeKind::ss: - printPrimList(out,n->ss(name)); - break; - case AttributeKind::t: - { - at::Tensor t = n->t(name); - // 1-elem tensors are usually boxed scalars, so print them like it - if (t.numel() == 1) { - auto scalar_tensor = at::_local_scalar(t.view({})); - out << "{"; - if (scalar_tensor.isFloatingPoint()) { - out << scalar_tensor.toDouble(); - } else { - out << scalar_tensor.toLong(); - } - out << "}"; - } else if (t.numel() <= max_tensor_display_size) { - // TODO: This is awful code. Also it doesn't work on Windows. 
- std::ostringstream tensor_ss; - tensor_ss << t; - std::string tensor_s{tensor_ss.str()}; - // Remove newlines - std::replace(tensor_s.begin(), tensor_s.end(), '\n', ' '); - out << tensor_s; - } else { - out << ""; - } - break; - } - case AttributeKind::ts: - out << "[]"; - break; - case AttributeKind::g: - out << ""; - break; - case AttributeKind::gs: - out << "[]"; - break; - } + out << name.toUnqualString() << "="; + + n->printValue(out, name); } out << "]"; } @@ -251,6 +173,15 @@ std::ostream& operator<<(std::ostream & out, const Graph & g) { return out; } +std::ostream& Graph::prettyPrint(std::ostream & out) { + PrettyPrint(out, *this); + return out; +} + +void Graph::dumpPretty() { + PrettyPrint(std::cout, *this); +} + static void checkSameDevice(const Node* node) { bool has_device = false; int device; @@ -627,10 +558,8 @@ const OperatorSet& nondeterminstic_aten_ops() { "aten::dropout(Tensor input, float p, int train) -> Tensor", "aten::_fused_dropout(Tensor self, float p, Generator generator) -> (Tensor, Tensor)", "aten::_standard_gamma(Tensor self, Generator generator) -> Tensor", - "aten::_th_bernoulli(Tensor self, *, Generator generator) -> Tensor", - "aten::bernoulli(Tensor self) -> Tensor", - "aten::bernoulli(Tensor self, Tensor p, Generator generator) -> Tensor", - "aten::bernoulli(Tensor self, float p, Generator generator) -> Tensor", + "aten::bernoulli(Tensor self, *, Generator generator) -> Tensor", + "aten::bernoulli(Tensor self, float p, *, Generator generator) -> Tensor", "aten::multinomial(Tensor self, int num_samples, int replacement, *, Generator generator) -> Tensor", "aten::normal(Tensor mean, Tensor std, *, Generator generator) -> Tensor", "aten::normal(float mean, Tensor std, *, Generator generator) -> Tensor", diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index fd730adb69570a..062d0422c2be07 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -972,6 +972,9 @@ friend struct Block; new_node_stage_ = s; return ResourceGuard([prev_stage, this]() { this->new_node_stage_ = prev_stage; }); } + const std::unordered_map& uniqueNames() const { + return unique_names_; + } size_t registerOutput(Value * n) { return block_->registerOutput(n); @@ -1170,6 +1173,10 @@ friend struct Block; } friend TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g); + + TORCH_API std::ostream& prettyPrint(std::ostream & out); + TORCH_API void dumpPretty(); + TORCH_API std::shared_ptr copy(); private: diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 4d69ed57e63786..6c7166f3b43552 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -75,8 +75,8 @@ std::unordered_set simple_mappable = { aten::type_as, aten::_sigmoid_backward, aten::_tanh_backward, + aten::clamp, // TODO support those - //aten::clamp, //aten::lerp, aten::rand_like, }; @@ -217,7 +217,8 @@ struct GraphFuser { node->matches("aten::mul(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || node->matches("aten::mul(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::div(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::div(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other)) { + node->matches("aten::div(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || + node->matches("aten::clamp(Tensor self, Scalar min, Scalar max) -> Tensor", /*const=*/{attr::min, attr::max})) { auto inputs = tensorInputs(node); 
return haveSupportedType(inputs); } diff --git a/torch/csrc/jit/passes/pretty_print.cpp b/torch/csrc/jit/passes/pretty_print.cpp new file mode 100644 index 00000000000000..62be6f3041fe04 --- /dev/null +++ b/torch/csrc/jit/passes/pretty_print.cpp @@ -0,0 +1,322 @@ +#include "torch/csrc/jit/attributes.h" +#include "torch/csrc/jit/passes/pretty_print.h" + +namespace torch { +namespace jit { + +static std::ostream& indent(std::ostream& out, size_t level) { + for (size_t i = 0; i < level; ++i) { + out << " "; + } + return out; +} + +class PrettyPrintPass { + const Graph& graph_; + + // When printing a name if there is a conflict with an existing name in the + // graph, record the value -> new generated name mapping + std::unordered_map aliases_; + + // The Graph already tracks unique_names_, this is just for additional ones + // generated during printing + std::unordered_map generated_names_; + + // Cache of value names + std::unordered_map value_names_; + + template + void zipWith( + at::ArrayRef list_a, + at::ArrayRef list_b, + std::function action) const { + auto it_a = list_a.begin(); + auto it_b = list_b.begin(); + + if (list_a.size() != list_b.size()) { + AT_ERROR("Pretty printer expected 2 lists of same size"); + } + + for (; it_a != list_a.end(); ++it_a, ++it_b) { + action(*it_a, *it_b); + } + } + + std::ostream& printValueList( + std::ostream& out, + at::ArrayRef list) { + out << "("; + auto delimiter = ""; + for (const auto* value : list) { + out << delimiter; + printValue(out, value); + delimiter = ", "; + } + out << ")"; + return out; + } + + void printAssignment( + std::ostream& out, + const Value* lhs, + const Value* rhs, + const size_t level) { + indent(out, level); + printValue(out, lhs); + out << " = "; + printValue(out, rhs); + out << "\n"; + } + + std::ostream& printIf( + std::ostream& out, + const Node* node, + const size_t level) { + indent(out, level); + out << "if "; + const auto if_block = node->blocks()[0]; + const auto else_block = node->blocks()[1]; + printValue(out, node->inputs()[0]); + out << ":" + << "\n"; + + // Print node contents + printBlock(out, if_block, level + 1); + + // Print if block output + zipWith( + node->outputs(), + if_block->outputs(), + [&](const Value* node_output, const Value* return_input) { + printAssignment(out, node_output, return_input, level + 1); + }); + + indent(out, level); + out << "else:\n"; + printBlock(out, else_block, level + 1); + zipWith( + node->outputs(), + else_block->outputs(), + [&](const Value* node_output, const Value* return_input) { + printAssignment(out, node_output, return_input, level + 1); + }); + + return out; + } + + std::ostream& printLoop( + std::ostream& out, + const Node* node, + const size_t level) { + // Prints assignments between the loop node and body block around the + // loop body itself in the following manner: + // + // (silently) alias block input names to node output names + // assign each node input to the corresponding node output + // assign condition to loop condition value + // while ...: + // print loop body nodes + // assign each block output to the corresponding block input + + const auto body_block = node->blocks()[0]; + // Add aliases for loop-carried dependencies + zipWith( + body_block->inputs().slice(1), // Start at 1 to ignore trip count + node->outputs(), + [&](const Value* block_input, const Value* node_output) { + aliases_[block_input] = node_output; + }); + + // Print initial assignments of loop node outputs = loop node inputs + zipWith( + node->outputs(), + node->inputs().slice(2), 
+ [&](const Value* node_output, const Value* node_input) { + printAssignment(out, node_output, node_input, level); + }); + + // Print condition initial assignment + printAssignment(out, body_block->inputs()[0], node->inputs()[1], level); + + // Loop header + indent(out, level); + out << "while "; + printValue(out, body_block->inputs()[0]); + out << ":\n"; + + // Loop body + printBlock(out, body_block, level + 1); + + // Update block outputs to block inputs for next loop iteration + zipWith( + body_block->inputs(), + body_block->outputs(), + [&](const Value* block_input, const Value* block_output) { + printAssignment(out, block_input, block_output, level + 1); + }); + return out; + } + + std::ostream& printNode( + std::ostream& out, + const Node* node, + const size_t level) { + // if there are subblocks on this node, visit them + switch (node->kind()) { + case prim::Return: + // Handled elsewhere, do nothing + break; + case prim::Constant: + break; + case prim::Loop: + printLoop(out, node, level); + break; + case prim::If: + printIf(out, node, level); + break; + default: + indent(out, level); + // Print outputs + if (node->outputs().size() > 0) { + auto delim = ""; + for (const auto* output_value : node->outputs()) { + out << delim; + printValue(out, output_value); + delim = ", "; + } + out << " = "; + } + + out << node->kind().toQualString(); + + // Print instruction parameters + printValueList(out, node->inputs()); + + out << "\n"; + } + + return out; + } + + std::ostream& printReturn( + std::ostream& out, + const Node* node, + const size_t level) { + indent(out, level); + const auto& returns = node->inputs(); + if (returns.size() > 0) { + out << "return "; + std::string delimiter = ""; + if (returns.size() > 1) { + printValueList(out, returns); + } else { + printValue(out, returns[0]); + } + out << "\n"; + } + return out; + } + + std::ostream& printBlock( + std::ostream& out, + const Block* root, + const size_t level) { + for (const auto* node : root->nodes()) { + printNode(out, node, level); + } + + printNode(out, root->return_node(), level); + + return out; + } + + inline bool isNameUnique(std::string& name, const Value* val) const { + auto generated_name_value = generated_names_.find(name); + if (generated_name_value != generated_names_.end() && + generated_name_value->second != val) { + // Found a generated name match, check that it's for a different value + return false; + } + return graph_.uniqueNames().find(name) == graph_.uniqueNames().end(); + } + + std::ostream& printValue(std::ostream& out, const Value* val) { + auto cached_name = value_names_.find(val); + if (cached_name != value_names_.end()) { + // If this value has been seen before, print out cached name + out << cached_name->second; + return out; + } + + const auto node = val->node(); + if (node->kind() == prim::Constant) { + // printAttributeValue(out, node->attributeNames()[0], node); + node->printValue(out, node->attributeNames()[0]); + return out; + } + + auto name_source = val; + + auto aliased_name = aliases_.find(val); + if (aliased_name != aliases_.end()) { + name_source = aliased_name->second; + } + + auto name = name_source->uniqueName(); + + bool using_generated_name = false; + if (isdigit(name.at(0))) { + std::stringstream ss; + ss << "t" << name; + name = ss.str(); + using_generated_name = true; + } else if (name.find_last_of('.') != std::string::npos) { + // Make unique name a valid variable name (e.g. 
a.1 -> a1) + name.erase(std::remove(name.begin(), name.end(), '.'), name.end()); + using_generated_name = true; + } + + if (using_generated_name) { + // Make sure name is unique + size_t suffix = 0; + while (!isNameUnique(name, name_source)) { + std::stringstream ss; + ss << name << suffix; + name = ss.str(); + ++suffix; + } + + // These names aren't in the Graph's list of names but we still need to + // make sure there are no name conflicts + generated_names_[name] = name_source; + } + + value_names_[val] = name; + out << name; + return out; + } + + public: + PrettyPrintPass(const Graph& graph) : graph_(graph) {} + + std::ostream& run(std::ostream& out) { + out << "def script"; + const Node* params = graph_.block()->param_node(); + printValueList(out, params->outputs()); + out << ":\n"; + + // Print body + printBlock(out, graph_.block(), 1); + + printReturn(out, graph_.block()->return_node(), 1); + + return out; + } +}; + +TORCH_API std::ostream& PrettyPrint(std::ostream& out, const Graph& graph) { + return PrettyPrintPass(graph).run(out); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/pretty_print.h b/torch/csrc/jit/passes/pretty_print.h new file mode 100644 index 00000000000000..786edbcb78f69a --- /dev/null +++ b/torch/csrc/jit/passes/pretty_print.h @@ -0,0 +1,9 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +TORCH_API std::ostream& PrettyPrint(std::ostream& out, const Graph& graph); + +}} diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 7499546f27684f..7e28a94369b179 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -424,13 +424,13 @@ bool PropagateTensorShapeOnNode(Node * node, bool insert_expands) { "aten::ceil(Tensor self) -> Tensor", "aten::clone(Tensor self) -> Tensor", "aten::contiguous(Tensor self) -> Tensor", - "aten::bernoulli(Tensor self) -> Tensor", + "aten::bernoulli(Tensor self, *, Generator generator) -> Tensor", "aten::celu(Tensor self, Scalar alpha) -> Tensor", "aten::clamp(Tensor self, Scalar min, Scalar max) -> Tensor", "aten::clamp_max(Tensor self, Scalar max) -> Tensor", "aten::clamp_min(Tensor self, Scalar min) -> Tensor", "aten::alpha_dropout(Tensor input, float p, int train) -> Tensor", - "aten::bernoulli(Tensor self, float p, Generator generator) -> Tensor", + "aten::bernoulli(Tensor self, float p, *, Generator generator) -> Tensor", "aten::cos(Tensor self) -> Tensor", "aten::cosh(Tensor self) -> Tensor", "aten::digamma(Tensor self) -> Tensor", @@ -592,7 +592,6 @@ bool PropagateTensorShapeOnNode(Node * node, bool insert_expands) { // tensor outputs : 1 static const register_formula_for binary_ops_strict_match {{ "aten::normal(Tensor mean, Tensor std, *, Generator generator) -> Tensor", - "aten::bernoulli(Tensor self, Tensor p, Generator generator) -> Tensor", "aten::mm(Tensor self, Tensor mat2) -> Tensor", "aten::bmm(Tensor self, Tensor mat2) -> Tensor", }, [](Node * node) -> type_vec_t { diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index c745f9f6d8f122..66747b940e72cf 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -216,6 +216,11 @@ void initPythonIRBindings(PyObject * module_) { .def("return_node", [](Graph &g) { return g.block()->return_node(); }) + .def("pretty_print", [](Graph &g) { + std::ostringstream oss; + g.prettyPrint(oss); + return oss.str(); + }) .GS(createFusionGroup) .def("createClone",[](Graph & g, Node * 
n, py::object fn) { return g.createClone(n, [&](Value * e) { diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index dabe88991073a3..a96730cc1eb9f6 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -24,17 +24,13 @@ namespace torch { namespace jit { namespace tracer { std::string getPythonInterpreterStackTrace() { std::stringstream stack_trace; AutoGIL gil; - PyThreadState *tstate = PyThreadState_GET(); - if (nullptr != tstate && nullptr != tstate->frame) { - PyFrameObject *frame = tstate->frame; - - while (nullptr != frame) { - int line = PyCode_Addr2Line(frame->f_code, frame->f_lasti); - std::string filename = THPUtils_unpackString(frame->f_code->co_filename); - std::string funcname = THPUtils_unpackString(frame->f_code->co_name); - stack_trace << filename << "(" << line << "): " << funcname << "\n"; - frame = frame->f_back; - } + PyFrameObject *frame = PyEval_GetFrame(); + while (nullptr != frame) { + int line = PyCode_Addr2Line(frame->f_code, frame->f_lasti); + std::string filename = THPUtils_unpackString(frame->f_code->co_filename); + std::string funcname = THPUtils_unpackString(frame->f_code->co_name); + stack_trace << filename << "(" << line << "): " << funcname << "\n"; + frame = frame->f_back; } return stack_trace.str(); } diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index 4edc279f8f7afe..3e38b4323da329 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -64,15 +64,27 @@ struct SymbolicVariable { SymbolicVariable operator>(at::Scalar rhs) const { return create(aten::gt, {*this, insertConstant(rhs)})[0].typeLikeWithScalarType(*this, at::kByte); } + SymbolicVariable operator>(const SymbolicVariable rhs) const { + return create(aten::gt, {*this, rhs})[0].typeLikeWithScalarType(*this, at::kByte); + } SymbolicVariable operator<(at::Scalar rhs) const { return create(aten::lt, {*this, insertConstant(rhs)})[0].typeLikeWithScalarType(*this, at::kByte); } + SymbolicVariable operator<(const SymbolicVariable rhs) const { + return create(aten::lt, {*this, rhs})[0].typeLikeWithScalarType(*this, at::kByte); + } SymbolicVariable operator>=(at::Scalar rhs) const { return create(aten::ge, {*this, insertConstant(rhs)})[0].typeLikeWithScalarType(*this, at::kByte); } + SymbolicVariable operator>=(const SymbolicVariable rhs) const { + return create(aten::ge, {*this, rhs})[0].typeLikeWithScalarType(*this, at::kByte); + } SymbolicVariable operator<=(at::Scalar rhs) const { return create(aten::le, {*this, insertConstant(rhs)})[0].typeLikeWithScalarType(*this, at::kByte); } + SymbolicVariable operator<=(const SymbolicVariable rhs) const { + return create(aten::le, {*this, rhs})[0].typeLikeWithScalarType(*this, at::kByte); + } SymbolicVariable operator==(at::Scalar rhs) const { return create(aten::eq, {*this, insertConstant(rhs)})[0].typeLikeWithScalarType(*this, at::kByte); } @@ -97,6 +109,9 @@ struct SymbolicVariable { SymbolicVariable operator%(at::Scalar rhs) const { return create(aten::remainder, {*this, insertConstant(rhs)})[0].typeLike(*this); } + SymbolicVariable isnan() const { + return create(aten::ne, {*this, *this})[0].typeLikeWithScalarType(*this, at::kByte); + } SymbolicVariable mm(const SymbolicVariable rhs) const { return create(t("mm"), {*this, rhs})[0]; } diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 30c9f2dfe7dd3b..f1bae30fe6a8cf 100644 --- a/torch/distributed/rendezvous.py +++ 
b/torch/distributed/rendezvous.py @@ -49,9 +49,13 @@ def rendezvous(url, **kwargs): return _rendezvous_handlers[result.scheme](url, **kwargs) +def _rendezvous_error(msg): + return ValueError("Error initializing torch.distributed using " + msg) + + def _file_rendezvous_handler(url): def _error(msg): - return ValueError("file:// rendezvous: " + msg) + return _rendezvous_error("file:// rendezvous: " + msg) result = urlparse(url) path = result.path @@ -74,7 +78,7 @@ def _error(msg): def _tcp_rendezvous_handler(url): def _error(msg): - return ValueError("tcp:// rendezvous: " + msg) + return _rendezvous_error("tcp:// rendezvous: " + msg) result = urlparse(url) if not result.port: @@ -97,22 +101,25 @@ def _error(msg): def _env_rendezvous_handler(url): def _error(msg): - return ValueError("env:// rendezvous: " + msg) + return _rendezvous_error("env:// rendezvous: " + msg) + + def _env_error(var): + return _error("environment variable %s expected, but not set" % var) if url != "env://": - raise _error("Only `env://` is expected for the env init method") - world_size = os.environ["WORLD_SIZE"] + raise _error("url must be equal to `env://`") + world_size = os.environ.get("WORLD_SIZE", None) if world_size is None: - raise _error("world size is missing") - rank = os.environ["RANK"] + raise _env_error("WORLD_SIZE") + rank = os.environ.get("RANK", None) if rank is None: - raise _error("rank is missing") - master_addr = os.environ["MASTER_ADDR"] + raise _env_error("RANK") + master_addr = os.environ.get("MASTER_ADDR", None) if master_addr is None: - raise _error("master addr is missing") - master_port = os.environ["MASTER_PORT"] + raise _env_error("MASTER_ADDR") + master_port = os.environ.get("MASTER_PORT", None) if master_port is None: - raise _error("master port is missing") + raise _env_error("MASTER_PORT") # Converting before creating the store rank = int(rank) diff --git a/torch/functional.py b/torch/functional.py index 40c74ba59abd94..0eac8f16741766 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -3,6 +3,7 @@ from torch._six import inf from operator import mul from functools import reduce +from collections import Iterable import math __all__ = [ @@ -16,6 +17,7 @@ 'isfinite', 'isinf', 'isnan', + 'norm', 'meshgrid', 'split', 'stft', @@ -637,3 +639,81 @@ def argsort(input, dim=None, descending=False): if dim is None: return torch.sort(input, -1, descending)[1] return torch.sort(input, dim, descending)[1] + + +def norm(input, p="fro", dim=None, keepdim=False, out=None): + r"""Returns the matrix norm or vector norm of a given tensor. + + Args: + input (Tensor): the input tensor + p ({int, float, inf, -inf, 'fro', 'nuc'}): the order of norm + The following norms can be calculated: + ===== ============================ ========================== + ord matrix norm vector norm + ===== ============================ ========================== + None Frobenius norm 2-norm + 'fro' Frobenius norm -- + 'nuc' nuclear norm -- + Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) + ===== ============================ ========================== + dim ({int, 2-tuple of ints, 2-list of ints}, optional): If it is an int, + vector norm will be calculated, if it is 2-tuple of ints, matrix norm + will be calculated. If the value is None, matrix norm will be calculated + when the input tensor only has two dimensions, vector norm will be + calculated when the input tensor only has one dimension. If the input + tensor has more than two dimensions, the vector norm will be applied to + last dimension. 
+ keepdim (bool): whether the output tensors have :attr:`dim` + retained or not. Ignored if attr:`dim`=``None`` and + :attr:`out`=``None``. + out (Tensor, optional): the output tensor. Ignored if + attr:`dim`=``None`` and :attr:`out`=``None``. + + Example:: + >>> import torch + >>> a = torch.arange(9, dtype= torch.float) - 4 + >>> b = a.reshape((3, 3)) + >>> torch.norm(a) + tensor(7.7460) + >>> torch.norm(b) + tensor(7.7460) + >>> torch.norm(a, float('inf')) + tensor(4.) + >>> torch.norm(b, float('inf')) + tensor([4., 3., 4.]) + >>> c = torch.tensor([[ 1, 2, 3],[-1, 1, 4]] , dtype= torch.float) + >>> torch.norm(c, dim=0) + tensor([1.4142, 2.2361, 5.0000]) + >>> torch.norm(c, dim=1) + tensor([3.7417, 4.2426]) + >>> torch.norm(c, p=1, dim=1) + tensor([6., 6.]) + >>> d = torch.arange(8, dtype= torch.float).reshape(2,2,2) + >>> torch.norm(d, dim=(1,2)) + tensor([ 3.7417, 11.2250]) + >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) + (tensor(3.7417), tensor(11.2250)) + """ + ndim = input.dim() + + # catch default case + if dim is None and out is None: + if p == "fro": + return torch._C._VariableFunctions.frobenius_norm(input) + elif p != "nuc": + return torch._C._VariableFunctions.norm(input, p) + + if p == "fro": + if dim is None: + dim = tuple(range(ndim)) + if out is None: + return torch._C._VariableFunctions.frobenius_norm(input, dim, keepdim=keepdim) + return torch._C._VariableFunctions.frobenius_norm(input, dim, keepdim=keepdim, out=out) + elif p == "nuc": + if out is None: + torch._C._VariableFunctions.nuclear_norm(input, keepdim=keepdim) + return torch._C._VariableFunctions.nuclear_norm(input, keepdim=keepdim, out=out) + else: + if out is None: + return torch._C._VariableFunctions.norm(input, p, dim, keepdim=keepdim) + return torch._C._VariableFunctions.norm(input, p, dim, keepdim=keepdim, out=out) diff --git a/torch/legacy/README.md b/torch/legacy/README.md new file mode 100644 index 00000000000000..5a8133b3b21581 --- /dev/null +++ b/torch/legacy/README.md @@ -0,0 +1 @@ +If you're looking for this legacy code please consider versions of PyTorch before 0.5 diff --git a/torch/legacy/__init__.py b/torch/legacy/__init__.py deleted file mode 100644 index 0954abd4a43de2..00000000000000 --- a/torch/legacy/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""Package containing code ported from Lua torch. - -To make it possible to work with existing models and ease the transition -for current Lua torch users, we've created this package. You can find the -``nn`` code in ``torch.legacy.nn``, and ``optim`` in ``torch.legacy.optim``. -The APIs should exactly match Lua torch. 
-""" diff --git a/torch/legacy/nn/Abs.py b/torch/legacy/nn/Abs.py deleted file mode 100644 index 4b61c320414b14..00000000000000 --- a/torch/legacy/nn/Abs.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -from .Module import Module - - -class Abs(Module): - - def __init__(self): - super(Abs, self).__init__() - - def updateOutput(self, input): - self._backend.Abs_updateOutput( - self._backend.library_state, - input, - self.output - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.Abs_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput - ) - return self.gradInput diff --git a/torch/legacy/nn/AbsCriterion.py b/torch/legacy/nn/AbsCriterion.py deleted file mode 100644 index 66f7615205d187..00000000000000 --- a/torch/legacy/nn/AbsCriterion.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class AbsCriterion(Criterion): - - def __init__(self, sizeAverage=True): - super(AbsCriterion, self).__init__() - self.sizeAverage = sizeAverage - self.output_tensor = torch.Tensor(1) - - def updateOutput(self, input, target): - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.AbsCriterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - implicit_gradOutput = torch.ones(1).type_as(input) - self._backend.AbsCriterion_updateGradInput( - self._backend.library_state, - input, - target, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - return self.gradInput diff --git a/torch/legacy/nn/Add.py b/torch/legacy/nn/Add.py deleted file mode 100644 index 3e528e8c64c5d1..00000000000000 --- a/torch/legacy/nn/Add.py +++ /dev/null @@ -1,57 +0,0 @@ -import math -import torch -from .Module import Module - - -class Add(Module): - - def __init__(self, inputSize, scalar=False): - super(Add, self).__init__() - size = inputSize - if scalar: - assert size == 1 - self.scalar = scalar - self.bias = torch.Tensor(size) - self.gradBias = torch.Tensor(size) - - self._ones = torch.Tensor((1,)) - - self.reset() - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. 
/ math.sqrt(self.bias.size(0)) - - self.bias.uniform_(-stdv, stdv) - - def updateOutput(self, input): - self.output.resize_as_(input).copy_(input) - if self.scalar: - self.output.add_(self.bias[0]) - else: - batchSize = input.size(0) - if self._ones.size(0) != batchSize: - self._ones.resize_(batchSize).fill_(1) - - bias = self.bias.view(-1) - output = self.output.view(batchSize, -1) - output.addr_(self._ones, bias) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is not None: - self.gradInput.resize_as_(gradOutput).copy_(gradOutput) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - if self.gradBias.size(0) == 1: - self.gradBias[0] = self.gradBias[0] + scale * gradOutput.sum() - else: - if input.is_same_size(self.bias): - self.gradBias.add_(scale, gradOutput) - else: - gradOutput = gradOutput.contiguous().view(input.size(0), -1) - self.gradBias.view(-1).addmv_(scale, gradOutput.t(), self._ones) diff --git a/torch/legacy/nn/AddConstant.py b/torch/legacy/nn/AddConstant.py deleted file mode 100644 index 4e9f10dcbf772f..00000000000000 --- a/torch/legacy/nn/AddConstant.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch -from .Module import Module - - -class AddConstant(Module): - - def __init__(self, constant_scalar, inplace=False): - super(AddConstant, self).__init__() - self.constant_scalar = constant_scalar - self.inplace = inplace - - def updateOutput(self, input): - if self.inplace: - input.add_(self.constant_scalar) - self.output.set_(input) - else: - self.output.resize_as_(input) - self.output.copy_(input) - self.output.add_(self.constant_scalar) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.inplace: - self.gradInput.set_(gradOutput) - # restore previous input value - input.add_(-self.constant_scalar) - else: - self.gradInput.resize_as_(gradOutput) - self.gradInput.copy_(gradOutput) - - return self.gradInput diff --git a/torch/legacy/nn/BCECriterion.py b/torch/legacy/nn/BCECriterion.py deleted file mode 100644 index 48d9c4280413e7..00000000000000 --- a/torch/legacy/nn/BCECriterion.py +++ /dev/null @@ -1,95 +0,0 @@ -import torch -from .Criterion import Criterion - -# TODO: use THNN - - -class BCECriterion(Criterion): - eps = 1e-12 - - def __init__(self, weights=None, sizeAverage=True): - if weights is not None and weights.dim() != 1: - raise ValueError("weights input should be 1D Tensor") - - super(BCECriterion, self).__init__() - self.sizeAverage = sizeAverage - self.buffer = None - self.weights = weights - - def updateOutput(self, input, target): - # - log(input) * target - log(1 - input) * (1 - target) - if input.nelement() != target.nelement(): - raise RuntimeError("input and target size mismatch") - - if self.buffer is None: - self.buffer = input.new() - - buffer = self.buffer - weights = self.weights - - buffer.resize_as_(input) - - if weights is not None and target.dim() != 1: - weights = self.weights.view(1, target.size(1)).expand_as(target) - - # log(input) * target - torch.add(input, self.eps, out=buffer).log_() - if weights is not None: - buffer.mul_(weights) - - target_1d = target.contiguous().view(-1) - # don't save a 1-d view of buffer: it should already be contiguous, and it's - # used as non-1d tensor later. 
- output = torch.dot(target_1d, buffer.contiguous().view(-1)) - - # log(1 - input) * (1 - target) - torch.mul(input, -1, out=buffer).add_(1 + self.eps).log_() - if weights is not None: - buffer.mul_(weights) - - output = output + torch.sum(buffer) - output = output - torch.dot(target_1d, buffer.contiguous().view(-1)) - - if self.sizeAverage: - output = output / input.nelement() - - self.output = - output.item() - - return self.output - - def updateGradInput(self, input, target): - # - (target - input) / ( input (1 - input) ) - # The gradient is slightly incorrect: - # It should have be divided by (input + self.eps) (1 - input + self.eps) - # but it is divided by input (1 - input + self.eps) + self.eps - # This modification requires less memory to be computed. - if input.nelement() != target.nelement(): - raise RuntimeError("input and target size mismatch") - - if self.buffer is None: - self.buffer = input.new() - - buffer = self.buffer - weights = self.weights - gradInput = self.gradInput - - if weights is not None and target.dim() != 1: - weights = self.weights.view(1, target.size(1)).expand_as(target) - - buffer.resize_as_(input) - # - x ( 1 + self.eps -x ) + self.eps - torch.add(input, -1, out=buffer).add_(-self.eps).mul_(input).add_(-self.eps) - - gradInput.resize_as_(input) - # y - x - torch.add(target, -1, input, out=gradInput) - # - (y - x) / ( x ( 1 + self.eps -x ) + self.eps ) - gradInput.div_(buffer) - - if weights is not None: - gradInput.mul_(weights) - - if self.sizeAverage: - gradInput.div_(target.nelement()) - - return gradInput diff --git a/torch/legacy/nn/BatchNormalization.py b/torch/legacy/nn/BatchNormalization.py deleted file mode 100644 index 223879823e4492..00000000000000 --- a/torch/legacy/nn/BatchNormalization.py +++ /dev/null @@ -1,192 +0,0 @@ -""" - This file implements Batch Normalization as described in the paper: - "Batch Normalization: Accelerating Deep Network Training - by Reducing Internal Covariate Shift" - by Sergey Ioffe, Christian Szegedy - - This implementation is useful for inputs NOT coming from convolution layers. - For convolution layers, use nn.SpatialBatchNormalization. - - The operation implemented is: - y = ( x - mean(x) ) - ########## * gamma + beta - standard-deviation(x) - where gamma and beta are learnable parameters. - - The learning of gamma and beta is optional. - - Usage: - with learnable parameters: nn.BatchNormalization(N [, eps] [, momentum]) - where N = dimensionality of input - without learnable parameters: nn.BatchNormalization(N [, eps] [, momentum], False) - - eps is a small value added to the standard-deviation to avoid divide-by-zero. - Defaults to 1e-5 - - In training time, this layer keeps a running estimate of it's computed mean and std. - The running sum is kept with a default momentum of 0.1 (unless over-ridden) - In test time, this running mean/std is used to normalize. 
-""" - -import torch -from .Module import Module -from .utils import clear - - -class BatchNormalization(Module): - # expected dimension of input - nDim = 2 - - def __init__(self, nOutput, eps=1e-5, momentum=0.1, affine=True): - super(BatchNormalization, self).__init__() - assert nOutput != 0 - - self.affine = affine - self.eps = eps - self.train = True - self.momentum = momentum - self.running_mean = torch.zeros(nOutput) - self.running_var = torch.ones(nOutput) - - self.save_mean = None - self.save_std = None - self._input = None - self._gradOutput = None - - if self.affine: - self.weight = torch.Tensor(nOutput) - self.bias = torch.Tensor(nOutput) - self.gradWeight = torch.Tensor(nOutput) - self.gradBias = torch.Tensor(nOutput) - self.reset() - else: - self.weight = None - self.bias = None - self.gradWeight = None - self.gradBias = None - - def reset(self): - if self.weight is not None: - self.weight.uniform_() - - if self.bias is not None: - self.bias.zero_() - - self.running_mean.zero_() - self.running_var.fill_(1) - - def _checkInputDim(self, input): - if input.dim() != self.nDim: - raise RuntimeError( - 'only mini-batch supported ({}D tensor), got {}D tensor instead'.format(self.nDim, input.dim())) - if input.size(1) != self.running_mean.nelement(): - raise RuntimeError('got {}-feature tensor, expected {}'.format(input.size(1), self.running_mean.nelement())) - - def _makeContiguous(self, input, gradOutput=None): - if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input - - if gradOutput is not None: - if not gradOutput.is_contiguous(): - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - gradOutput = self._gradOutput - - return input, gradOutput - - def updateOutput(self, input): - self._checkInputDim(input) - - input = self._makeContiguous(input)[0] - - self.output.resize_as_(input) - if self.save_mean is None: - self.save_mean = input.new() - self.save_mean.resize_as_(self.running_mean) - if self.save_std is None: - self.save_std = input.new() - self.save_std.resize_as_(self.running_var) - - self._backend.BatchNormalization_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.running_mean, - self.running_var, - self.save_mean, - self.save_std, - self.train, - self.momentum, - self.eps - ) - - return self.output - - def _backward(self, input, gradOutput, scale, gradInput=None, gradWeight=None, gradBias=None): - self._checkInputDim(input) - self._checkInputDim(gradOutput) - if not hasattr(self, 'save_mean') or not hasattr(self, 'save_std'): - raise RuntimeError('you have to call updateOutput() at least once before backward()') - - input, gradOutput = self._makeContiguous(input, gradOutput) - - scale = scale or 1. 
- if gradInput is not None: - gradInput.resize_as_(gradOutput) - - self._backend.BatchNormalization_backward( - self._backend.library_state, - input, - gradOutput, - gradInput, - gradWeight, - gradBias, - self.weight, - self.running_mean, - self.running_var, - self.save_mean, - self.save_std, - self.train, - scale, - self.eps - ) - - return self.gradInput - - def backward(self, input, gradOutput, scale=1.): - return self._backward(input, gradOutput, scale, self.gradInput, self.gradWeight, self.gradBias) - - def updateGradInput(self, input, gradOutput): - return self._backward(input, gradOutput, 1., self.gradInput) - - def accGradParameters(self, input, gradOutput, scale=1.): - return self._backward(input, gradOutput, scale, None, self.gradWeight, self.gradBias) - - def read(self, file, version): - super(BatchNormalization, self).read(self, file) - if version < 2: - if self.running_std: - self.running_var = self.running_std.pow_(-2).add_(-self.eps) - self.running_std = None - - def clearState(self): - # first 5 buffers are not present in the current implementation, - # but we keep them for cleaning old saved models - clear(self, [ - 'buffer', - 'buffer2', - 'centered', - 'std', - 'normalized', - '_input', - '_gradOutput', - 'save_mean', - 'save_std', - ]) - return super(BatchNormalization, self).clearState() diff --git a/torch/legacy/nn/Bilinear.py b/torch/legacy/nn/Bilinear.py deleted file mode 100644 index dd8b684c404514..00000000000000 --- a/torch/legacy/nn/Bilinear.py +++ /dev/null @@ -1,137 +0,0 @@ -import math -import torch -from .Module import Module -from .utils import clear - - -class Bilinear(Module): - - def _assertInput(self, input): - if len(input) != 2 or not isinstance(input[0], torch.Tensor) or not isinstance(input[1], torch.Tensor): - raise RuntimeError('input should be a table containing two data Tensors') - if input[0].ndimension() != 2 or input[1].ndimension() != 2: - raise RuntimeError('input Tensors should be two-dimensional') - if input[0].size(0) != input[1].size(0): - raise RuntimeError('input Tensors should have the same number of rows') - if input[0].size(1) != self.weight.size(1): - raise RuntimeError('dimensionality of first input is erroneous') - if input[1].size(1) != self.weight.size(2): - raise RuntimeError('dimensionality of second input is erroneous') - - def _assertInputGradOutput(self, input, gradOutput): - if input[0].size(0) != gradOutput.size(0): - raise RuntimeError('number of rows in gradOutput.es not match input') - if gradOutput.size(1) != self.weight.size(0): - raise RuntimeError('number of columns in gradOutput does not match layer\'s output size') - - def __init__(self, inputSize1, inputSize2, outputSize, bias=True): - # set up model: - super(Bilinear, self).__init__() - self.weight = torch.Tensor(outputSize, inputSize1, inputSize2) - self.gradWeight = torch.Tensor(outputSize, inputSize1, inputSize2) - if bias: - self.bias = torch.Tensor(outputSize) - self.gradBias = torch.Tensor(outputSize) - else: - self.bias = None - self.gradBias = None - - self.buff1 = None - self.buff2 = None - - self.gradInput = [torch.Tensor(), torch.Tensor()] - self.reset() - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. 
/ math.sqrt(self.weight.size(1)) - - self.weight.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.uniform_(-stdv, stdv) - return self - - def updateOutput(self, input): - self._assertInput(input) - - # set up buffer: - if self.buff2 is None: - self.buff2 = input[0].new() - self.buff2.resize_as_(input[1]) - - # compute output scores: - self.output.resize_(input[0].size(0), self.weight.size(0)) - for k in range(self.weight.size(0)): - torch.mm(input[0], self.weight[k], out=self.buff2) - self.buff2.mul_(input[1]) - torch.sum(self.buff2, 1, True, out=self.output.narrow(1, k, 1)) - - if self.bias is not None: - self.output.add_(self.bias.view(1, self.bias.nelement()).expand_as(self.output)) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - self._assertInputGradOutput(input, gradOutput) - # compute d output / d input: - self.gradInput[0].resize_as_(input[0]).fill_(0) - self.gradInput[1].resize_as_(input[1]).fill_(0) - - #: first slice of weight tensor (k = 1) - self.gradInput[0].addmm_(input[1], self.weight[0].t()) - self.gradInput[0].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[0].size(0), - self.gradInput[0].size(1))) - self.gradInput[1].addmm_(input[0], self.weight[0]) - self.gradInput[1].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[1].size(0), - self.gradInput[1].size(1))) - - #: remaining slices of weight tensor - if self.weight.size(0) > 1: - if self.buff1 is None: - self.buff1 = input[0].new() - self.buff1.resize_as_(input[0]) - - for k in range(1, self.weight.size(0)): - torch.mm(input[1], self.weight[k].t(), out=self.buff1) - self.buff1.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[0].size(0), - self.gradInput[0].size(1))) - self.gradInput[0].add_(self.buff1) - - torch.mm(input[0], self.weight[k], out=self.buff2) - self.buff2.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[1].size(0), - self.gradInput[1].size(1))) - self.gradInput[1].add_(self.buff2) - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self._assertInputGradOutput(input, gradOutput) - - # make sure we have buffer: - if self.buff1 is None: - self.buff1 = input[0].new() - self.buff1.resize_as_(input[0]) - - # accumulate parameter gradients: - for k in range(self.weight.size(0)): - torch.mul(input[0], gradOutput.narrow(1, k, 1).expand_as(input[0]), out=self.buff1) - self.gradWeight[k].addmm_(self.buff1.t(), input[1]) - - if self.bias is not None: - self.gradBias.add_(scale, gradOutput.sum(0, keepdim=False)) - - def __repr__(self): - return str(type(self)) + \ - '({}x{} -> {}) {}'.format( - self.weight.size(1), self.weight.size(2), self.weight.size(0), - (' without bias' if self.bias is None else '') - ) - - def clearState(self): - clear(self, 'buff1', 'buff2') - return super(Bilinear, self).clearState() diff --git a/torch/legacy/nn/CAddTable.py b/torch/legacy/nn/CAddTable.py deleted file mode 100644 index bcefa11f2acb31..00000000000000 --- a/torch/legacy/nn/CAddTable.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -from .Module import Module - - -class CAddTable(Module): - - def __init__(self, inplace=False): - super(CAddTable, self).__init__() - self.inplace = inplace - self.gradInput = [] - - def updateOutput(self, input): - if self.inplace: - self.output.set_(input[0]) - else: - self.output.resize_as_(input[0]).copy_(input[0]) - - for i in range(1, len(input)): - self.output.add_(input[i]) - - return self.output - - def updateGradInput(self, input, gradOutput): - for i in 
range(len(input)): - if i >= len(self.gradInput): - assert i == len(self.gradInput) - self.gradInput.append(input[0].new()) - - if self.inplace: - self.gradInput[i].set_(gradOutput) - else: - self.gradInput[i].resize_as_(input[i]).copy_(gradOutput) - - del self.gradInput[len(input):] - - return self.gradInput diff --git a/torch/legacy/nn/CDivTable.py b/torch/legacy/nn/CDivTable.py deleted file mode 100644 index c7f10804b3ebde..00000000000000 --- a/torch/legacy/nn/CDivTable.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -from .Module import Module - - -class CDivTable(Module): - - def __init__(self, ): - super(CDivTable, self).__init__() - self.gradInput = [] - - def updateOutput(self, input): - self.output.resize_as_(input[0]).copy_(input[0]) - self.output.div_(input[1]) - return self.output - - def updateGradInput(self, input, gradOutput): - while len(self.gradInput) < 2: - self.gradInput.append(input[0].new()) - gradOutput = gradOutput.contiguous().view_as(input[0]) - self.gradInput[0].resize_as_(input[0]).copy_(gradOutput).div_(input[1]) - self.gradInput[1].resize_as_(input[1]).zero_().addcdiv_(-1, self.gradInput[0], input[1]).mul_(input[0]) - - del self.gradInput[len(input):] - - return self.gradInput diff --git a/torch/legacy/nn/CMul.py b/torch/legacy/nn/CMul.py deleted file mode 100644 index 0d4265a33e2592..00000000000000 --- a/torch/legacy/nn/CMul.py +++ /dev/null @@ -1,117 +0,0 @@ -import math - -import torch -from .Module import Module -from .utils import clear, contiguousView - - -class CMul(Module): - - def __init__(self, *args): - super(CMul, self).__init__() - - if len(args) == 1 and isinstance(args[0], torch.Size): - self.size = args[0] - else: - self.size = torch.Size(args) - - self.weight = torch.Tensor(self.size) - self.gradWeight = torch.Tensor(self.size) - self.output.resize_(self.size) - self.reset() - - self._output = None - self._weight = None - self._expand = None - self._repeat = None - self._gradOutput = None - self._gradInput = None - self._input = None - self._gradWeight = None - self._sum = None - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. 
/ math.sqrt(self.weight.nelement()) - - self.weight.uniform_(-stdv, stdv) - - def updateOutput(self, input): - # lazy-initialize - if self._output is None: - self._output = input.new() - self._weight = input.new() - self._expand = input.new() - self._repeat = input.new() - - self.output.resize_as_(input).copy_(input) - batchSize = input.size(0) - # TODO: expand_as_, view_ - self._output = self.output.view(batchSize, -1) - self._weight = self.weight.view(1, -1) - self._expand = self._weight.expand_as(self._output) - - if torch.typename(input) == 'torch.cuda.FloatTensor': - self._repeat.resize_as_(self._expand).copy_(self._expand) - self._output.mul_(self._repeat) - else: - self._output.mul_(self._expand) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - if self._gradOutput is None: - self._gradOutput = input.new() - self._gradInput = input.new() - - self.gradInput.resize_as_(input).zero_() - batchSize = input.size(0) - contiguousView(self._gradOutput, gradOutput, batchSize, -1) - contiguousView(self._gradInput, self.gradInput, batchSize, -1) - self._weight = self.weight.view(1, -1) - self._expand = self._weight.expand_as(self._gradOutput) - - if torch.typename(input) == 'torch.cuda.FloatTensor': - self._repeat.resize_as_(self._expand).copy_(self._expand) - self._gradInput.addcmul_(1, self._repeat, self._gradOutput) - else: - self._gradInput.addcmul_(1, self._expand, self._gradOutput) - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - if self._input is None: - self._input = input.new() - self._gradWeight = input.new() - self._sum = input.new() - - batchSize = input.size(0) - contiguousView(self._input, input, batchSize, -1) - contiguousView(self._gradOutput, gradOutput, batchSize, -1) - self._gradWeight = self.gradWeight.view(1, -1) - - torch.mul(self._input, self._gradOutput, out=self._repeat) - torch.sum(self._repeat, 0, True, out=self._sum) - self._gradWeight.add_(scale, self._sum) - - def type(self, type=None, tensorCache=None): - if type: - self.clearState() - return super(CMul, self).type(type, tensorCache) - - def clearState(self): - clear(self, [ - '_input', - '_output', - '_weight', - '_gradWeight', - '_expand', - '_repeat', - '_sum', - ]) - return super(CMul, self).clearState() diff --git a/torch/legacy/nn/CMulTable.py b/torch/legacy/nn/CMulTable.py deleted file mode 100644 index 64a58f0c79243b..00000000000000 --- a/torch/legacy/nn/CMulTable.py +++ /dev/null @@ -1,49 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class CMulTable(Module): - - def __init__(self, ): - super(CMulTable, self).__init__() - self.gradInput = [] - - def updateOutput(self, input): - self.output.resize_as_(input[0]).copy_(input[0]) - for i in range(1, len(input)): - self.output.mul_(input[i]) - - return self.output - - def updateGradInput_efficient(self, input, gradOutput): - if self.tout is None: - self.tout = input[0].new() - self.tout.resize_as_(self.output) - for i in range(len(input)): - if len(self.gradInput) <= i: - assert i == len(self.gradInput) - self.gradInput.append(input[0].new()) - self.gradInput[i].resize_as_(input[i]).copy_(gradOutput) - self.tout.copy_(self.output).div_(input[i]) - self.gradInput[i].mul_(self.tout) - - self.gradInput = self.gradInput[:len(input)] - return self.gradInput - - def updateGradInput(self, input, gradOutput): - for i in range(len(input)): - if len(self.gradInput) <= i: - assert i == len(self.gradInput) - 
self.gradInput.append(input[0].new()) - self.gradInput[i].resize_as_(input[i]).copy_(gradOutput) - for j in range(len(input)): - if i != j: - self.gradInput[i].mul_(input[j]) - - self.gradInput = self.gradInput[:len(input)] - return self.gradInput - - def clearState(self): - clear(self, 'tout') - return super(CMulTable, self).clearState() diff --git a/torch/legacy/nn/CSubTable.py b/torch/legacy/nn/CSubTable.py deleted file mode 100644 index 85d8527f8cf524..00000000000000 --- a/torch/legacy/nn/CSubTable.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -from .Module import Module - - -class CSubTable(Module): - - def __init__(self, ): - super(CSubTable, self).__init__() - self.gradInput = [torch.Tensor(), torch.Tensor()] - - def updateOutput(self, input): - self.output.resize_as_(input[0]).copy_(input[0]) - self.output.add_(-1, input[1]) - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput[0] is None: - self.gradInput[0] = input[0].new() - if self.gradInput[1] is None: - self.gradInput[1] = input[1].new() - self.gradInput[0].resize_as_(input[0]).copy_(gradOutput) - self.gradInput[1].resize_as_(input[1]).copy_(gradOutput).mul_(-1) - - self.gradInput = self.gradInput[:2] - return self.gradInput diff --git a/torch/legacy/nn/Clamp.py b/torch/legacy/nn/Clamp.py deleted file mode 100644 index 0bfcac3266b87c..00000000000000 --- a/torch/legacy/nn/Clamp.py +++ /dev/null @@ -1,8 +0,0 @@ -import torch -from .HardTanh import HardTanh - - -class Clamp(HardTanh): - - def __init__(self, min_value, max_value): - super(Clamp, self,).__init__(min_value, max_value) diff --git a/torch/legacy/nn/ClassNLLCriterion.py b/torch/legacy/nn/ClassNLLCriterion.py deleted file mode 100644 index 33c28e5d21cb96..00000000000000 --- a/torch/legacy/nn/ClassNLLCriterion.py +++ /dev/null @@ -1,53 +0,0 @@ -import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class ClassNLLCriterion(Criterion): - - def __init__(self, weights=None, sizeAverage=True, ignore_index=-100): - super(ClassNLLCriterion, self).__init__() - self.sizeAverage = sizeAverage - self.ignore_index = ignore_index - - if weights is not None: - assert weights.dim() == 1 - self.weights = weights - - self.output_tensor = torch.zeros(1) - self.total_weight_tensor = torch.ones(1) - - def updateOutput(self, input, target): - self.ignore_index = getattr(self, "ignore_index", -100) - target = target.long() - self._backend.ClassNLLCriterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - self.weights, - self.total_weight_tensor, - self.ignore_index, - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - self.gradInput.resize_as_(input).zero_() - target = target.long() - implicit_gradOutput = torch.ones(1).type_as(input) - - self._backend.ClassNLLCriterion_updateGradInput( - self._backend.library_state, - input, - target, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - self.weights, - self.total_weight_tensor, - self.ignore_index, - ) - - return self.gradInput diff --git a/torch/legacy/nn/ClassSimplexCriterion.py b/torch/legacy/nn/ClassSimplexCriterion.py deleted file mode 100644 index 1de585147347e1..00000000000000 --- a/torch/legacy/nn/ClassSimplexCriterion.py +++ /dev/null @@ -1,108 +0,0 @@ -import math -import torch -from torch.nn.functional import 
_Reduction -from .MSECriterion import MSECriterion - -""" - This file implements a criterion for multi-class classification. - It learns an embedding per class, where each class' embedding - is a point on an (N-1)-dimensional simplex, where N is - the number of classes. - For example usage of this class, look at.c/criterion.md - - Reference: http.//arxiv.org/abs/1506.08230 -""" - - -class ClassSimplexCriterion(MSECriterion): - - def __init__(self, nClasses): - super(ClassSimplexCriterion, self).__init__() - self.nClasses = nClasses - - # embedding the simplex in a space of dimension strictly greater than - # the minimum possible (nClasses-1) is critical for effective training. - simp = self._regsplex(nClasses - 1) - self.simplex = torch.cat((simp, torch.zeros(simp.size(0), nClasses - simp.size(1))), 1) - self._target = torch.Tensor(nClasses) - - self.output_tensor = None - - def _regsplex(self, n): - """ - regsplex returns the coordinates of the vertices of a - regular simplex centered at the origin. - The Euclidean norms of the vectors specifying the vertices are - all equal to 1. The input n is the dimension of the vectors; - the simplex has n+1 vertices. - - input: - n # dimension of the vectors specifying the vertices of the simplex - - output: - a # tensor dimensioned (n+1, n) whose rows are - vectors specifying the vertices - - reference: - http.//en.wikipedia.org/wiki/Simplex#Cartesian_coordinates_for_regular_n-dimensional_simplex_in_Rn - """ - a = torch.zeros(n + 1, n) - - for k in range(n): - # determine the last nonzero entry in the vector for the k-th vertex - if k == 0: - a[k][k] = 1 - else: - a[k][k] = math.sqrt(1 - a[k:k + 1, 0:k + 1].norm() ** 2) - - # fill_ the k-th coordinates for the vectors of the remaining vertices - c = (a[k][k] ** 2 - 1 - 1 / n) / a[k][k] - a[k + 1:n + 2, k:k + 1].fill_(c) - - return a - - # handle target being both 1D tensor, and - # target being 2D tensor (2D tensor means.nt: anything) - def _transformTarget(self, target): - assert target.dim() == 1 - nSamples = target.size(0) - self._target.resize_(nSamples, self.nClasses) - for i in range(nSamples): - self._target[i].copy_(self.simplex[int(target[i])]) - - def updateOutput(self, input, target): - self._transformTarget(target) - - assert input.nelement() == self._target.nelement() - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.MSECriterion_updateOutput( - self._backend.library_state, - input, - self._target, - self.output_tensor, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - assert input.nelement() == self._target.nelement() - implicit_gradOutput = torch.Tensor([1]).type(input.type()) - self._backend.MSECriterion_updateGradInput( - self._backend.library_state, - input, - self._target, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - return self.gradInput - - def getPredictions(self, input): - return torch.mm(input, self.simplex.t()) - - def getTopPrediction(self, input): - prod = self.getPredictions(input) - _, maxs = prod.max(prod.ndimension() - 1) - return maxs.view(-1) diff --git a/torch/legacy/nn/Concat.py b/torch/legacy/nn/Concat.py deleted file mode 100644 index cb54d7674c2d23..00000000000000 --- a/torch/legacy/nn/Concat.py +++ /dev/null @@ -1,106 +0,0 @@ -import torch -from .Container import Container - - -class Concat(Container): - - def 
__init__(self, dimension): - super(Concat, self).__init__() - self.outputSize = torch.Size() - self.dimension = dimension - - def updateOutput(self, input): - outs = [] - for i in range(len(self.modules)): - currentOutput = self.modules[i].updateOutput(input) - outs.append(currentOutput) - if i == 0: - size = list(currentOutput.size()) - else: - size[self.dimension] += currentOutput.size(self.dimension) - self.outputSize = torch.Size(size) - self.output.resize_(self.outputSize) - - offset = 0 - for i, module in enumerate(self.modules): - currentOutput = outs[i] - self.output.narrow(self.dimension, offset, currentOutput.size(self.dimension)).copy_(currentOutput) - offset = offset + currentOutput.size(self.dimension) - - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_as_(input) - - offset = 0 - for i, module in enumerate(self.modules): - currentOutput = module.output - currentGradInput = module.updateGradInput(input, gradOutput.narrow( - self.dimension, offset, currentOutput.size(self.dimension))) - - # if the module does not produce a gradInput (for example first layer),: ignore it and move on. - if currentGradInput: - if i == 0: - self.gradInput.copy_(currentGradInput) - else: - self.gradInput.add_(currentGradInput) - - offset = offset + currentOutput.size(self.dimension) - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - offset = 0 - for i, module in enumerate(self.modules): - currentOutput = module.output - module.accGradParameters( - input, - gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)), - scale) - offset = offset + currentOutput.size(self.dimension) - - def backward(self, input, gradOutput, scale=1): - self.gradInput.resize_as_(input) - offset = 0 - for i, module in enumerate(self.modules): - currentOutput = module.output - currentGradInput = module.backward(input, gradOutput.narrow( - self.dimension, offset, currentOutput.size(self.dimension)), scale) - # if the module.es not produce a gradInput (for example first layer),: ignore it and move on. - if currentGradInput is not None: - if i == 0: - self.gradInput.copy_(currentGradInput) - else: - self.gradInput.add_(currentGradInput) - offset = offset + currentOutput.size(self.dimension) - - return self.gradInput - - def accUpdateGradParameters(self, input, gradOutput, lr): - offset = 0 - for i, module in enumerate(self.modules): - currentOutput = module.output - module.accUpdateGradParameters( - input, - gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)), - lr) - offset = offset + currentOutput.size(self.dimension) - - def __tostring__(self): - tab = ' ' - line = '\n' - next = ' |`-> ' - ext = ' | ' - extlast = ' ' - last = ' +. 
-> ' - res = torch.type(self) - res += ' {' + line + tab + 'input' - for i in range(len(self.modules)): - if i == len(self.modules) - 1: - res += line + tab + next + '(' + i + '): ' + str(self.modules[i]).replace(line, line + tab + extlast) - else: - res += line + tab + next + '(' + i + '): ' + str(self.modules[i]).replace(line, line + tab + ext) - - res += line + tab + last + 'output' - res += line + '}' - return res diff --git a/torch/legacy/nn/ConcatTable.py b/torch/legacy/nn/ConcatTable.py deleted file mode 100644 index afebf8c296d53a..00000000000000 --- a/torch/legacy/nn/ConcatTable.py +++ /dev/null @@ -1,112 +0,0 @@ -import torch -from .Container import Container - - -class ConcatTable(Container): - - def __init__(self, ): - super(ConcatTable, self).__init__() - self.modules = [] - self.output = [] - - def updateOutput(self, input): - self.output = [module.updateOutput(input) for module in self.modules] - return self.output - - def _map_list(self, l1, l2, f): - for i, v in enumerate(l2): - if isinstance(v, list): - res = self._map_list(l1[i] if i < len(l1) else [], v, f) - if i >= len(l1): - assert i == len(l1) - l1.append(res) - else: - l1[i] = res - else: - f(l1, i, v) - for i in range(len(l1) - 1, len(l2) - 1, -1): - del l1[i] - return l1 - - def _backward(self, method, input, gradOutput, scale=1): - isTable = isinstance(input, list) - wasTable = isinstance(self.gradInput, list) - if isTable: - for i, module in enumerate(self.modules): - if method == 'updateGradInput': - currentGradInput = module.updateGradInput(input, gradOutput[i]) - elif method == 'backward': - currentGradInput = module.backward(input, gradOutput[i], scale) - if not isinstance(currentGradInput, list): - raise RuntimeError("currentGradInput is not a table!") - - if len(input) != len(currentGradInput): - raise RuntimeError("table size mismatch") - - if i == 0: - self.gradInput = self.gradInput if wasTable else [] - - def fn(l, i, v): - if i >= len(l): - assert len(l) == i - l.append(v.clone()) - else: - l[i].resize_as_(v) - l[i].copy_(v) - self._map_list(self.gradInput, currentGradInput, fn) - else: - def fn(l, i, v): - if i < len(l): - l[i].add_(v) - else: - assert len(l) == i - l.append(v.clone()) - self._map_list(self.gradInput, currentGradInput, fn) - else: - self.gradInput = self.gradInput if not wasTable else input.clone() - for i, module in enumerate(self.modules): - if method == 'updateGradInput': - currentGradInput = module.updateGradInput(input, gradOutput[i]) - elif method == 'backward': - currentGradInput = module.backward(input, gradOutput[i], scale) - if i == 0: - self.gradInput.resize_as_(currentGradInput).copy_(currentGradInput) - else: - self.gradInput.add_(currentGradInput) - - return self.gradInput - - def updateGradInput(self, input, gradOutput): - return self._backward('updateGradInput', input, gradOutput) - - def backward(self, input, gradOutput, scale=1): - return self._backward('backward', input, gradOutput, scale) - - def accGradParameters(self, input, gradOutput, scale=1): - for i, module in ipairs(self.modules): - self.rethrowErrors(module, i, 'accGradParameters', input, gradOutput[i], scale) - - def accUpdateGradParameters(self, input, gradOutput, lr): - for i, module in ipairs(self.modules): - self.rethrowErrors(module, i, 'accUpdateGradParameters', input, gradOutput[i], lr) - - def __repr__(self): - tab = ' ' - line = '\n' - next = ' |`-> ' - ext = ' | ' - extlast = ' ' - last = ' +. 
-> ' - res = torch.typename(self) - res = res + ' {' + line + tab + 'input' - for i in range(len(self.modules)): - if i == len(self.modules) - 1: - res = res + line + tab + next + '(' + str(i) + '): ' + \ - str(self.modules[i]).replace(line, line + tab + extlast) - else: - res = res + line + tab + next + '(' + str(i) + '): ' + \ - str(self.modules[i]).replace(line, line + tab + ext) - - res = res + line + tab + last + 'output' - res = res + line + '}' - return res diff --git a/torch/legacy/nn/Container.py b/torch/legacy/nn/Container.py deleted file mode 100644 index 84a726e00333a2..00000000000000 --- a/torch/legacy/nn/Container.py +++ /dev/null @@ -1,66 +0,0 @@ -import torch -from .Module import Module -from .utils import clear -from functools import wraps -import sys - - -class Container(Module): - - def __init__(self, *args): - super(Container, self).__init__(*args) - self.modules = [] - - def add(self, module): - self.modules.append(module) - return self - - def get(self, index): - return self.modules[index] - - def size(self): - return len(self.modules) - - def applyToModules(self, func): - for module in self.modules: - func(module) - - def zeroGradParameters(self): - self.applyToModules(lambda m: m.zeroGradParameters()) - - def updateParameters(self, learningRate): - self.applyToModules(lambda m: m.updateParameters(learningRate)) - - def training(self): - self.applyToModules(lambda m: m.training()) - super(Container, self).training() - - def evaluate(self, ): - self.applyToModules(lambda m: m.evaluate()) - super(Container, self).evaluate() - - def share(self, mlp, *args): - for module, other_module in zip(self.modules, mlp.modules): - module.share(other_module, *args) - - def reset(self, stdv=None): - self.applyToModules(lambda m: m.reset(stdv)) - - def parameters(self): - w = [] - gw = [] - for module in self.modules: - mparam = module.parameters() - if mparam is not None: - w.extend(mparam[0]) - gw.extend(mparam[1]) - if not w: - return - return w, gw - - def clearState(self): - clear('output') - clear('gradInput') - for module in self.modules: - module.clearState() - return self diff --git a/torch/legacy/nn/Contiguous.py b/torch/legacy/nn/Contiguous.py deleted file mode 100644 index aacadb05e56c6f..00000000000000 --- a/torch/legacy/nn/Contiguous.py +++ /dev/null @@ -1,21 +0,0 @@ -import torch -from .Module import Module - - -class Contiguous(Module): - - def updateOutput(self, input): - if not input.is_contiguous(): - self.output.resize_as_(input).copy_(input) - else: - self.output.set_(input) - - return self.output - - def updateGradInput(self, input, gradOutput): - if not gradOutput.is_contiguous(): - self.gradInput.resize_as_(gradOutput).copy_(gradOutput) - else: - self.gradInput.set_(gradOutput) - - return self.gradInput diff --git a/torch/legacy/nn/Copy.py b/torch/legacy/nn/Copy.py deleted file mode 100644 index 71c8682cc90d79..00000000000000 --- a/torch/legacy/nn/Copy.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -from .Module import Module - - -class Copy(Module): - - def __init__(self, intype, outtype, dontCast=False): - self.dontCast = dontCast - super(Copy, self).__init__() - self.gradInput = intype() - self.output = outtype() - - def updateOutput(self, input): - self.output.resize_(input.size()).copy_(input) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_(gradOutput.size()).copy_(gradOutput) - return self.gradInput - - def type(self, type=None, tensorCache=None): - if type and self.dontCast: - return self - - return 
super(Copy, self).type(self, type, tensorCache) diff --git a/torch/legacy/nn/Cosine.py b/torch/legacy/nn/Cosine.py deleted file mode 100644 index ae35c85e565dd9..00000000000000 --- a/torch/legacy/nn/Cosine.py +++ /dev/null @@ -1,153 +0,0 @@ -import math -import torch -from .Module import Module -from .utils import clear - - -class Cosine(Module): - - def __init__(self, inputSize, outputSize): - super(Cosine, self).__init__() - self.weight = torch.Tensor(outputSize, inputSize) - self.gradWeight = torch.Tensor(outputSize, inputSize) - self.reset() - - self._weight = None - self._sum = None - self._gradOutput = None - self._sum = None - self._weightNorm = None - self._inputNorm = None - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. / math.sqrt(self.weight.size(0)) - self.weight.uniform_(-stdv, stdv) - - def updateOutput(self, input): - assert input.dim() == 2 - - inputSize = self.weight.size(1) - outputSize = self.weight.size(0) - - if self._weightNorm is None: - self._weightNorm = self.weight.new() - if self._inputNorm is None: - self._inputNorm = self.weight.new() - - # y_j = (w_j * x) / ( || w_j || * || x || ) - - torch.norm(self.weight, 2, 1, out=self._weightNorm, keepdim=True).add_(1e-12) - - batchSize = input.size(0) - nelement = self.output.nelement() - self.output.resize_(batchSize, outputSize) - if self.output.nelement() != nelement: - self.output.zero_() - - self.output.addmm_(0., 1., input, self.weight.t()) - - torch.norm(input, 2, 1, out=self._inputNorm, keepdim=True).add_(1e-12) - self.output.div_(self._weightNorm.view(1, outputSize).expand_as(self.output)) - self.output.div_(self._inputNorm.expand_as(self.output)) - return self.output - - def updateGradInput(self, input, gradOutput): - assert input.dim() == 2 - - if self.gradInput is None: - return - - inputSize = self.weight.size(1) - outputSize = self.weight.size(0) - - """ - dy_j w_ji x_i - ---- = ------------------- - y_j --------- - dx_i || w_j || * || x || || x ||^2 - """ - - nelement = self.gradInput.nelement() - self.gradInput.resize_as_(input) - if self.gradInput.nelement() != nelement: - self.gradInput.zero_() - - inputNorm = self._inputNorm.expand_as(input) - weightNorm = self._weightNorm.view(1, outputSize).expand_as(gradOutput) - - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - if self._sum is None: - self._sum = input.new() - - self.gradInput.copy_(input).div_(inputNorm) - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - self._gradOutput.mul_(self.output) - torch.sum(self._gradOutput, 1, out=self._sum, keepdim=True) - self.gradInput.mul_(self._sum.expand_as(input)) - - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - self._gradOutput.div_(weightNorm) - self.gradInput.addmm_(-1, 1, self._gradOutput, self.weight) - self.gradInput.div_(inputNorm) - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - assert input.dim() == 2 - inputSize = self.weight.size(1) - outputSize = self.weight.size(0) - - """ - dy_j x_i w_ji - ----- = ------------------- - y_j ----------- - dw_ji || w_j || * || x || || w_j ||^2 - """ - - if self._weight is None: - self._weight = self.weight.new() - if self._sum is None: - self._sum = input.new() - - self._weight.resize_as_(self.weight).copy_(self.weight) - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - self._gradOutput.mul_(self.output) - torch.sum(self._gradOutput, 0, 
out=self._sum, keepdim=True) - grad = self._sum[0] - grad.div_(self._weightNorm.select(1, 0)) - self._weight.mul_(grad.view(outputSize, 1).expand_as(self._weight)) - - input_ = self._gradOutput - input_.resize_as_(input).copy_(input) - input_.div_(self._inputNorm.expand_as(input)) - self._weight.addmm_(-1, 1, gradOutput.t(), input_) - - self._weight.div_(self._weightNorm.expand_as(self._weight)) - self.gradWeight.add_(self._weight) - - def type(self, type=None, tensorCache=None): - if type is not None: - # prevent premature memory allocations - self._input = None - self._weight = None - self._inputNorm = None - self._weightNorm = None - self._gradOutput = None - self._sum = None - - return super(Cosine, self).type(type, tensorCache) - - def clearState(self): - clear(self, [ - '_input', - '_weight', - '_gradOutput', - '_sum', - '_inputNorm', - '_weightNorm', - ]) - return super(Cosine, self).clearState() diff --git a/torch/legacy/nn/CosineDistance.py b/torch/legacy/nn/CosineDistance.py deleted file mode 100644 index c379dd49c560ce..00000000000000 --- a/torch/legacy/nn/CosineDistance.py +++ /dev/null @@ -1,108 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class CosineDistance(Module): - - def __init__(self, ): - super(CosineDistance, self).__init__() - self.gradInput = [torch.Tensor(), torch.Tensor()] - - self._input1 = None - self._input2 = None - self.buffer = None - self.w1 = None - self.w22 = None - self.w = None - self.w32 = None - self.ones = None - - def _makeContiguous(self, input1, input2): - if not input1.is_contiguous(): - if self._input1 is None: - self._input1 = input1.new() - self._input1.resize_as_(input1).copy_(input1) - input1 = self._input1 - - if not input2.is_contiguous(): - if self._input2 is None: - self._input2 = input2.new() - self._input2.resize_as_(input2).copy_(input2) - input2 = self._input2 - - return input1, input2 - - def updateOutput(self, input): - input1, input2 = input[0], input[1] - input1, input2 = self._makeContiguous(input1, input2) - - if self.buffer is None: - self.buffer = input1.new() - self.w1 = input1.new() - self.w22 = input1.new() - self.w = input1.new() - self.w32 = input1.new() - self.ones = input1.new() - - torch.mul(input1, input2, out=self.buffer) - torch.sum(self.buffer, 1, out=self.w1, keepdim=True) - - epsilon = 1e-12 - torch.mul(input1, input1, out=self.buffer) - torch.sum(self.buffer, 1, out=self.w22, keepdim=True).add_(epsilon) - self.w22.reciprocal_() - self.w.resize_as_(self.w22).copy_(self.w22) - - torch.mul(input2, input2, out=self.buffer) - torch.sum(self.buffer, 1, out=self.w32, keepdim=True).add_(epsilon) - self.w32.reciprocal_() - self.w.mul_(self.w32) - self.w.sqrt_() - - torch.mul(self.w1, self.w, out=self.output) - self.output.resize_(input1.size(0)) - - return self.output - - def updateGradInput(self, input, gradOutput): - v1 = input[0] - v2 = input[1] - v1, v2 = self._makeContiguous(v1, v2) - - if len(self.gradInput) != 2: - if self.gradInput[0] is None: - self.gradInput[0] = v1.new() - if self.gradInput[1] is None: - self.gradInput[1] = v1.new() - self.gradInput = self.gradInput[:2] - - gw1 = self.gradInput[0] - gw2 = self.gradInput[1] - gw1.resize_as_(v1).copy_(v2) - gw2.resize_as_(v1).copy_(v1) - - torch.mul(self.w1, self.w22, out=self.buffer) - gw1.addcmul_(-1, self.buffer.expand_as(v1), v1) - gw1.mul_(self.w.expand_as(v1)) - - torch.mul(self.w1, self.w32, out=self.buffer) - gw2.addcmul_(-1, self.buffer.expand_as(v1), v2) - gw2.mul_(self.w.expand_as(v1)) - - go = 
gradOutput.contiguous().view(-1, 1).expand_as(v1) - gw1.mul_(go) - gw2.mul_(go) - - return self.gradInput - - def clearState(self): - clear(self, [ - 'buffer', - 'w1', - 'w22', - 'w', - 'w32', - 'ones', - ]) - return super(CosineDistance, self).clearState() diff --git a/torch/legacy/nn/CosineEmbeddingCriterion.py b/torch/legacy/nn/CosineEmbeddingCriterion.py deleted file mode 100644 index 5a33ee9a3db0c9..00000000000000 --- a/torch/legacy/nn/CosineEmbeddingCriterion.py +++ /dev/null @@ -1,117 +0,0 @@ -import torch -from .Criterion import Criterion - - -class CosineEmbeddingCriterion(Criterion): - - def __init__(self, margin=0, sizeAverage=True): - super(CosineEmbeddingCriterion, self).__init__() - self.margin = margin - self.sizeAverage = sizeAverage - self.gradInput = [torch.Tensor(), torch.Tensor()] - self.buffer = None - self.w1 = None - self.w22 = None - self.w = None - self.w32 = None - self._outputs = None - self._idx = None - - def updateOutput(self, input, y): - input1, input2 = input[0], input[1] - - # keep backward compatibility - if self.buffer is None: - self.buffer = input1.new() - self.w1 = input1.new() - self.w22 = input1.new() - self.w = input1.new() - self.w32 = input1.new() - self._outputs = input1.new() - - # comparison operators behave differently from cuda/c implementations - # TODO: verify name - if input1.type() == 'torch.cuda.FloatTensor': - self._idx = torch.cuda.ByteTensor() - else: - self._idx = torch.ByteTensor() - - torch.mul(input1, input2, out=self.buffer) - torch.sum(self.buffer, 1, out=self.w1, keepdim=True) - - epsilon = 1e-12 - torch.mul(input1, input1, out=self.buffer) - torch.sum(self.buffer, 1, out=self.w22, keepdim=True).add_(epsilon) - # self._outputs is also used as a temporary buffer - self._outputs.resize_as_(self.w22).fill_(1) - torch.div(self._outputs, self.w22, out=self.w22) - self.w.resize_as_(self.w22).copy_(self.w22) - - torch.mul(input2, input2, out=self.buffer) - torch.sum(self.buffer, 1, out=self.w32, keepdim=True).add_(epsilon) - torch.div(self._outputs, self.w32, out=self.w32) - self.w.mul_(self.w32) - self.w.sqrt_() - - torch.mul(self.w1, self.w, out=self._outputs) - self._outputs = self._outputs.select(1, 0) - - torch.eq(y, -1, out=self._idx) - self._outputs[self._idx] = self._outputs[self._idx].add_(-self.margin).clamp_(min=0) - torch.eq(y, 1, out=self._idx) - self._outputs[self._idx] = self._outputs[self._idx].mul_(-1).add_(1) - - self.output = self._outputs.sum().item() - - if self.sizeAverage: - self.output = self.output / y.size(0) - - return self.output - - def updateGradInput(self, input, y): - v1 = input[0] - v2 = input[1] - - gw1 = self.gradInput[0] - gw2 = self.gradInput[1] - gw1.resize_as_(v1).copy_(v2) - gw2.resize_as_(v1).copy_(v1) - - torch.mul(self.w1, self.w22, out=self.buffer) - gw1.addcmul_(-1, self.buffer.expand_as(v1), v1) - gw1.mul_(self.w.expand_as(v1)) - - torch.mul(self.w1, self.w32, out=self.buffer) - gw2.addcmul_(-1, self.buffer.expand_as(v1), v2) - gw2.mul_(self.w.expand_as(v1)) - - # self._idx = self._outputs <= 0 - torch.le(self._outputs, 0, out=self._idx) - self._idx = self._idx.view(-1, 1).expand(gw1.size()) - gw1[self._idx] = 0 - gw2[self._idx] = 0 - - torch.eq(y, 1, out=self._idx) - self._idx = self._idx.view(-1, 1).expand(gw2.size()) - gw1[self._idx] = gw1[self._idx].mul_(-1) - gw2[self._idx] = gw2[self._idx].mul_(-1) - - if self.sizeAverage: - gw1.div_(y.size(0)) - gw2.div_(y.size(0)) - - return self.gradInput - - def type(self, type=None, tensorCache=None): - if not type: - return self._type - - 
self._idx = None - super(CosineEmbeddingCriterion, self).type(type, tensorCache) - # comparison operators behave differently from cuda/c implementations - if type == 'torch.cuda.FloatTensor': - self._idx = torch.cuda.ByteTensor() - else: - self._idx = torch.ByteTensor() - - return self diff --git a/torch/legacy/nn/Criterion.py b/torch/legacy/nn/Criterion.py deleted file mode 100644 index ef51b1819fc328..00000000000000 --- a/torch/legacy/nn/Criterion.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -from .Module import Module -from .utils import recursiveType -import torch._thnn - - -class Criterion(object): - - def __init__(self): - self.gradInput = torch.Tensor() - self.output = 0 - self._backend = torch._thnn.type2backend[self.gradInput.type()] - - def updateOutput(self, input, target): - raise NotImplementedError - - def forward(self, input, target): - return self.updateOutput(input, target) - - def backward(self, input, target): - return self.updateGradInput(input, target) - - def updateGradInput(self, input, target): - raise NotImplementedError - - def clone(self): - raise NotImplementedError - - def type(self, type, tensorCache=None): - # find all tensors and convert them - for key, param in self.__dict__.items(): - setattr(self, key, recursiveType(param, type, tensorCache or {})) - - self._backend = torch._thnn.type2backend[type] - return self - - def float(self): - return self.type('torch.FloatTensor') - - def double(self): - return self.type('torch.DoubleTensor') - - def cuda(self): - return self.type('torch.cuda.FloatTensor') diff --git a/torch/legacy/nn/CriterionTable.py b/torch/legacy/nn/CriterionTable.py deleted file mode 100644 index 7e20a8fb850478..00000000000000 --- a/torch/legacy/nn/CriterionTable.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch -from .Module import Module - - -class CriterionTable(Module): - - def __init__(self, criterion): - super(CriterionTable, self).__init__() - self.criterion = criterion - self.gradInput = [criterion.gradInput] - - def updateOutput(self, input): - self.output = self.criterion.updateOutput(*input) - return self.output - - def updateGradInput(self, input, grad_output): - self.criterion.updateGradInput(*input) - return self.gradInput diff --git a/torch/legacy/nn/CrossEntropyCriterion.py b/torch/legacy/nn/CrossEntropyCriterion.py deleted file mode 100644 index 67e8b0d9ab9d48..00000000000000 --- a/torch/legacy/nn/CrossEntropyCriterion.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -from .Criterion import Criterion -from .LogSoftMax import LogSoftMax -from .ClassNLLCriterion import ClassNLLCriterion - - -class CrossEntropyCriterion(Criterion): - - def __init__(self, weights=None): - super(CrossEntropyCriterion, self).__init__() - self.lsm = LogSoftMax() - self.nll = ClassNLLCriterion(weights) - - def updateOutput(self, input, target): - input = input.squeeze() - target = target.squeeze() - self.lsm.updateOutput(input) - self.nll.updateOutput(self.lsm.output, target) - self.output = self.nll.output - return self.output - - def updateGradInput(self, input, target): - size = input.size() - input = input.squeeze() - target = target.squeeze() - self.nll.updateGradInput(self.lsm.output, target) - self.lsm.updateGradInput(input, self.nll.gradInput) - self.gradInput = self.lsm.gradInput.view(size) - return self.gradInput diff --git a/torch/legacy/nn/DepthConcat.py b/torch/legacy/nn/DepthConcat.py deleted file mode 100644 index 19c31873ff3c77..00000000000000 --- a/torch/legacy/nn/DepthConcat.py +++ /dev/null @@ -1,106 +0,0 @@ 
-#################################### -# DepthConcat -# Concatenates the output of Convolutions along the depth dimension -# (nOutputFrame). This is used to implement the DepthConcat layer -# of the Going deeper with convolutions paper : -# http.//arxiv.org/pdf/1409.4842v1.pdf -# The normal Concat Module can't be used since the spatial dimensions -# of tensors to be concatenated may have different values. To deal with -# this, we select the largest spatial dimensions and add zero-padding -# around the smaller dimensions. -#################################### - -import math -import torch -from .Concat import Concat - - -class DepthConcat(Concat): - - def windowNarrow(self, output, currentOutput, offset): - outputWindow = output.narrow(self.dimension, offset, currentOutput.size(self.dimension)) - for dim in range(len(self.outputSize)): - currentSize = currentOutput.size(dim) - if dim != self.dimension and self.outputSize[dim] != currentSize: - # 5x5 vs 3x3 -> start = [(5-3)/2] + 1 = 2 (1 pad each side) - # 9x9 vs 5x5 -> start = [(9-5)/2] + 1 = 3 (2 pad each side) - # 9x9 vs 4x4 -> start = [(9-4)/2] + 1 = 3.5 (2 pad, 3 pad) - start = int(math.floor(((self.outputSize[dim] - currentSize) / 2))) - outputWindow = outputWindow.narrow(dim, start, currentSize) - return outputWindow - - def updateOutput(self, input): - outs = [] - for i in range(len(self.modules)): - currentOutput = self.modules[i].updateOutput(input) - outs.append(currentOutput) - if i == 0: - size = list(currentOutput.size()) - else: - size[self.dimension] += currentOutput.size(self.dimension) - for dim in range(len(self.outputSize)): - if dim != self.dimension: - # take the maximum size (shouldn't change anything for batch dim) - size[dim] = max(size[dim], currentOutput.size(dim)) - - self.outputSize = torch.Size(size) - self.output.resize_(self.outputSize).zero_() # zero for padding - - offset = 0 - for i, module in enumerate(self.modules): - currentOutput = outs[i] - outputWindow = self.windowNarrow(self.output, currentOutput, offset) - outputWindow.copy_(currentOutput) - offset = offset + currentOutput.size(self.dimension) - - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_as_(input) - - offset = 0 - for i, module in enumerate(self.modules): - currentOutput = module.output - gradOutputWindow = self.windowNarrow(gradOutput, currentOutput, offset) - currentGradInput = module.updateGradInput(input, gradOutputWindow) - if i == 0: - self.gradInput.copy_(currentGradInput) - else: - self.gradInput.add_(currentGradInput) - - offset += currentOutput.size(self.dimension) - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - offset = 0 - for i, module in enumerate(self.modules): - currentOutput = module.output - gradOutputWindow = self.windowNarrow(gradOutput, currentOutput, offset) - module.accGradParameters(input, gradOutputWindow, scale) - offset += currentOutput.size(self.dimension) - - def backward(self, input, gradOutput, scale=1): - self.gradInput.resize_as_(input) - - offset = 0 - for i, module in enumerate(self.modules): - currentOutput = module.output - gradOutputWindow = self.windowNarrow(gradOutput, currentOutput, offset) - currentGradInput = module.backward(input, gradOutputWindow) - if i == 0: - self.gradInput.copy_(currentGradInput) - else: - self.gradInput.add_(currentGradInput) - - offset = offset + currentOutput.size(self.dimension) - - return self.gradInput - - def accUpdateGradParameters(self, input, gradOutput, lr): - offset = 0 - for i, 
module in enumerate(self.modules): - currentOutput = module.output - gradOutputWindow = self.windowNarrow(gradOutput, currentOutput, offset) - module.accUpdateGradParameters(input, gradOutputWindow, lr) - offset = offset + currentOutput.size(self.dimension) diff --git a/torch/legacy/nn/DistKLDivCriterion.py b/torch/legacy/nn/DistKLDivCriterion.py deleted file mode 100644 index 5aa175604a05ff..00000000000000 --- a/torch/legacy/nn/DistKLDivCriterion.py +++ /dev/null @@ -1,38 +0,0 @@ -import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class DistKLDivCriterion(Criterion): - - def __init__(self, sizeAverage=True): - super(DistKLDivCriterion, self).__init__() - self.sizeAverage = sizeAverage - self.output_tensor = torch.Tensor(1) - - def updateOutput(self, input, target): - assert input.is_same_size(target) - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.DistKLDivCriterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - assert input.is_same_size(target) - implicit_gradOutput = torch.ones(1).type_as(input) - self._backend.DistKLDivCriterion_updateGradInput( - self._backend.library_state, - input, - target, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - return self.gradInput diff --git a/torch/legacy/nn/DotProduct.py b/torch/legacy/nn/DotProduct.py deleted file mode 100644 index 70f6490a8fd627..00000000000000 --- a/torch/legacy/nn/DotProduct.py +++ /dev/null @@ -1,49 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class DotProduct(Module): - - def __init__(self): - super(DotProduct, self).__init__() - self.gradInput = [torch.Tensor(), torch.Tensor()] - self.buffer = None - - def updateOutput(self, input): - input1, input2 = input[0], input[1] - - if self.buffer is None: - self.buffer = input1.new() - - torch.mul(input1, input2, out=self.buffer) - torch.sum(self.buffer, 1, True, out=self.output) - self.output.resize_(input1.size(0)) - return self.output - - def updateGradInput(self, input, gradOutput): - v1 = input[0] - v2 = input[1] - not_batch = False - - if len(self.gradInput) != 2: - if self.gradInput[0] is None: - self.gradInput[0] = input[0].new() - if self.gradInput[1] is None: - self.gradInput[1] = input[1].new() - self.gradInput = self.gradInput[:2] - - gw1 = self.gradInput[0] - gw2 = self.gradInput[1] - gw1.resize_as_(v1).copy_(v2) - gw2.resize_as_(v2).copy_(v1) - - go = gradOutput.contiguous().view(-1, 1).expand_as(v1) - gw1.mul_(go) - gw2.mul_(go) - - return self.gradInput - - def clearState(self): - clear(self, 'buffer') - return super(DotProduct, self).clearState() diff --git a/torch/legacy/nn/Dropout.py b/torch/legacy/nn/Dropout.py deleted file mode 100644 index 41330e503b5e45..00000000000000 --- a/torch/legacy/nn/Dropout.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class Dropout(Module): - - def __init__(self, p=0.5, inplace=False): - super(Dropout, self).__init__() - self.p = p - self.inplace = inplace - self.train = True - self.noise = torch.Tensor() - - def updateOutput(self, input): - if self.inplace: - self.output.set_(input) - else: - self.output.resize_as_(input).copy_(input) - - if self.p > 0 and 
self.train: - self.noise.resize_as_(input) - self.noise.bernoulli_(1 - self.p) - self.noise.div_(1 - self.p) - self.output.mul_(self.noise) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.inplace: - self.gradInput.set_(gradOutput) - else: - self.gradInput.resize_as_(gradOutput).copy_(gradOutput) - - if self.p > 0 and self.train: - self.gradInput.mul_(self.noise) # simply mask the gradients with the noise vector - - return self.gradInput - - def setp(self, p): - self.p = p - - def __repr__(self): - return super(Dropout, self).__repr__() + '({:.4f})'.format(self.p) - - def clearState(self): - clear(self, 'noise') - return super(Dropout, self).clearState() diff --git a/torch/legacy/nn/ELU.py b/torch/legacy/nn/ELU.py deleted file mode 100644 index 9e00e8a172fc88..00000000000000 --- a/torch/legacy/nn/ELU.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf8 -*- -import torch -from .Module import Module - - -class ELU(Module): - """ - Djork-Arné Clevert, Thomas Unterthiner, Sepp Hochreiter - Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) - http.//arxiv.org/pdf/1511.07289.pdf - """ - - def __init__(self, alpha=1., inplace=False): - assert type(alpha) == float - super(ELU, self).__init__() - self.alpha = alpha - self.inplace = inplace - - def updateOutput(self, input): - self._backend.ELU_updateOutput( - self._backend.library_state, - input, - self.output, - self.alpha, - 1.0, - 1.0, - self.inplace - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.ELU_updateGradInput( - self._backend.library_state, - gradOutput, - self.gradInput, - self.output, - self.alpha, - 1.0, - 1.0 - ) - return self.gradInput - - def __repr__(self): - return '{}(alpha={:.3f})'.format(str(type(self)), self.alpha) diff --git a/torch/legacy/nn/Euclidean.py b/torch/legacy/nn/Euclidean.py deleted file mode 100644 index 411ca833bd4cda..00000000000000 --- a/torch/legacy/nn/Euclidean.py +++ /dev/null @@ -1,172 +0,0 @@ -import math -import torch -from .Module import Module -from .utils import clear - - -class Euclidean(Module): - - def __init__(self, inputSize, outputSize): - super(Euclidean, self).__init__() - - self.weight = torch.Tensor(inputSize, outputSize) - self.gradWeight = torch.Tensor(inputSize, outputSize) - - # state - self.gradInput.resize_(inputSize) - self.output.resize_(outputSize) - - self.fastBackward = True - self.reset() - - self._input = None - self._weight = None - self._expand = None - self._expand2 = None - self._repeat = None - self._repeat2 = None - self._div = None - self._output = None - self._gradOutput = None - self._expand3 = None - self._sum = None - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. 
/ math.sqrt(self.weight.size(0)) - - self.weight.uniform_(-stdv, stdv) - - def _view(self, res, src, *args): - if src.is_contiguous(): - res.set_(src.view(*args)) - else: - res.set_(src.contiguous().view(*args)) - - def updateOutput(self, input): - # lazy initialize buffers - if self._input is None: - self._input = input.new() - if self._weight is None: - self._weight = self.weight.new() - if self._expand is None: - self._expand = self.output.new() - if self._expand2 is None: - self._expand2 = self.output.new() - if self._repeat is None: - self._repeat = self.output.new() - if self._repeat2 is None: - self._repeat2 = self.output.new() - - inputSize, outputSize = self.weight.size(0), self.weight.size(1) - - # y_j = || w_j - x || = || x - w_j || - assert input.dim() == 2 - - batchSize = input.size(0) - self._view(self._input, input, batchSize, inputSize, 1) - self._expand = self._input.expand(batchSize, inputSize, outputSize) - # make the expanded tensor contiguous (requires lots of memory) - self._repeat.resize_as_(self._expand).copy_(self._expand) - - self._weight = self.weight.view(1, inputSize, outputSize) - self._expand2 = self._weight.expand_as(self._repeat) - - if torch.typename(input) == 'torch.cuda.FloatTensor': - # TODO: after adding new allocators this can be changed - # requires lots of memory, but minimizes cudaMallocs and loops - self._repeat2.resize_as_(self._expand2).copy_(self._expand2) - self._repeat.add_(-1, self._repeat2) - else: - self._repeat.add_(-1, self._expand2) - - torch.norm(self._repeat, 2, 1, True, out=self.output) - self.output.resize_(batchSize, outputSize) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - if self._div is None: - self._div = input.new() - if self._output is None: - self._output = self.output.new() - if self._gradOutput is None: - self._gradOutput = input.new() - if self._expand3 is None: - self._expand3 = input.new() - - if not self.fastBackward: - self.updateOutput(input) - - inputSize, outputSize = self.weight.size(0), self.weight.size(1) - - """ - dy_j -2 * (w_j - x) x - w_j - ---- = ---------------- = ------- - dx 2 || w_j - x || y_j - """ - - # to prevent div by zero (NaN) bugs - self._output.resize_as_(self.output).copy_(self.output).add_(0.0000001) - self._view(self._gradOutput, gradOutput, gradOutput.size()) - torch.div(gradOutput, self._output, out=self._div) - assert input.dim() == 2 - batchSize = input.size(0) - - self._div.resize_(batchSize, 1, outputSize) - self._expand3 = self._div.expand(batchSize, inputSize, outputSize) - - if torch.typename(input) == 'torch.cuda.FloatTensor': - self._repeat2.resize_as_(self._expand3).copy_(self._expand3) - self._repeat2.mul_(self._repeat) - else: - torch.mul(self._repeat, self._expand3, out=self._repeat2) - - torch.sum(self._repeat2, 2, True, out=self.gradInput) - self.gradInput.resize_as_(input) - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - inputSize, outputSize = self.weight.size(0), self.weight.size(1) - - """ - dy_j 2 * (w_j - x) w_j - x - ---- = --------------- = ------- - dw_j 2 || w_j - x || y_j - """ - # assumes a preceding call to updateGradInput - assert input.dim() == 2 - if self._sum is None: - self._sum = input.new() - torch.sum(self._repeat2, 0, True, out=self._sum) - self._sum.resize_(inputSize, outputSize) - self.gradWeight.add_(-scale, self._sum) - - def type(self, type=None, tensorCache=None): - if type: - # prevent premature memory allocations - self.clearState() - - 
return super(Euclidean, self).type(type, tensorCache) - - def clearState(self): - clear(self, [ - '_input', - '_output', - '_gradOutput', - '_weight', - '_div', - '_sum', - '_expand', - '_expand2', - '_expand3', - '_repeat', - '_repeat2', - ]) - return super(Euclidean, self).clearState() diff --git a/torch/legacy/nn/Exp.py b/torch/legacy/nn/Exp.py deleted file mode 100644 index 7156a99eb9ec73..00000000000000 --- a/torch/legacy/nn/Exp.py +++ /dev/null @@ -1,11 +0,0 @@ -import torch -from .Module import Module - - -class Exp(Module): - - def updateOutput(self, input): - return torch.exp(input, out=self.output) - - def updateGradInput(self, input, gradOutput): - return torch.mul(self.output, gradOutput, out=self.gradInput) diff --git a/torch/legacy/nn/FlattenTable.py b/torch/legacy/nn/FlattenTable.py deleted file mode 100644 index 1468f0ceddd959..00000000000000 --- a/torch/legacy/nn/FlattenTable.py +++ /dev/null @@ -1,85 +0,0 @@ -import torch -from .Module import Module - - -class FlattenTable(Module): - - def __init__(self): - super(FlattenTable, self).__init__() - - self.output = [] - self.input_map = [] - self.gradInput = [] - - def _flatten(self, output, input): - if isinstance(input, list): - input_map = [] - # forward DFS order - for i in range(len(input)): - input_map.append(self._flatten(output, input[i])) - else: - input_map = len(output) - output.append(input) - - return input_map - - def _checkMapping(self, output, input, input_map): - if isinstance(input, list): - if len(input) != len(input_map): - return False - - # forward DFS order - for i in range(len(input)): - if not self._checkMapping(output, input[i], input_map[i]): - return False - - return True - else: - return output[input_map] is input - - # During BPROP we have to build a gradInput with the same shape as the - # input. This is a recursive function to build up a gradInput - def _inverseFlatten(self, gradOutput, input_map): - if isinstance(input_map, list): - gradInput = [] - for i in range(len(input_map)): - gradInput.append(self._inverseFlatten(gradOutput, input_map[i])) - - return gradInput - else: - return gradOutput[input_map] - - def updateOutput(self, input): - assert isinstance(input, list) - # to avoid updating rebuilding the flattened table every updateOutput call - # we will: a DFS pass over the existing output table and the inputs to - # see if it needs to be rebuilt. - if not self._checkMapping(self.output, input, self.input_map): - self.output = [] - self.input_map = self._flatten(self.output, input) - - return self.output - - def updateGradInput(self, input, gradOutput): - assert isinstance(input, list) - assert isinstance(gradOutput, list) - # If the input changes between the updateOutput and updateGradInput call, - #: we may have to rebuild the input_map! However, let's assume that - # the input_map is valid and that forward has already been called. - - # However, we should check that the gradInput is valid: - if not self._checkMapping(gradOutput, self.gradInput, self.input_map): - self.gradInput = self._inverseFlatten(gradOutput, self.input_map) - - return self.gradInput - - def type(self, type=None, tensorCache=None): - if not type: - return self._type - # This function just stores references so we don't need to do any type - # conversions. Just force the tables to be empty. 
- self.clearState() - - def clearState(self): - self.input_map = [] - return super(FlattenTable, self).clearState() diff --git a/torch/legacy/nn/GradientReversal.py b/torch/legacy/nn/GradientReversal.py deleted file mode 100644 index 36c048b3b0bead..00000000000000 --- a/torch/legacy/nn/GradientReversal.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from .Module import Module - - -class GradientReversal(Module): - - def __init__(self, lambd=1): - super(GradientReversal, self).__init__() - self.lambd = lambd - - def setLambda(self, lambd): - self.lambd = lambd - - def updateOutput(self, input): - self.output.set_(input) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_as_(gradOutput) - self.gradInput.copy_(gradOutput) - self.gradInput.mul_(-self.lambd) - return self.gradInput diff --git a/torch/legacy/nn/HardShrink.py b/torch/legacy/nn/HardShrink.py deleted file mode 100644 index 99b3bb2292a753..00000000000000 --- a/torch/legacy/nn/HardShrink.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -from .Module import Module - - -class HardShrink(Module): - - def __init__(self, lambd=0.5): - assert type(lambd) == float - super(HardShrink, self).__init__() - self.lambd = lambd - - def updateOutput(self, input): - self._backend.HardShrink_updateOutput( - self._backend.library_state, - input, - self.output, - self.lambd - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.HardShrink_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.lambd - ) - return self.gradInput diff --git a/torch/legacy/nn/HardTanh.py b/torch/legacy/nn/HardTanh.py deleted file mode 100644 index b8bae62f9bc739..00000000000000 --- a/torch/legacy/nn/HardTanh.py +++ /dev/null @@ -1,35 +0,0 @@ -import torch -from .Module import Module - - -class HardTanh(Module): - - def __init__(self, min_value=-1, max_value=1, inplace=False): - super(HardTanh, self).__init__() - self.min_val = min_value - self.max_val = max_value - self.inplace = inplace - assert self.max_val > self.min_val - - def updateOutput(self, input): - self._backend.HardTanh_updateOutput( - self._backend.library_state, - input, - self.output, - self.min_val, - self.max_val, - self.inplace - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.HardTanh_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.min_val, - self.max_val, - self.inplace - ) - return self.gradInput diff --git a/torch/legacy/nn/HingeEmbeddingCriterion.py b/torch/legacy/nn/HingeEmbeddingCriterion.py deleted file mode 100644 index 068acf93a2e214..00000000000000 --- a/torch/legacy/nn/HingeEmbeddingCriterion.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -from .Criterion import Criterion - - -class HingeEmbeddingCriterion(Criterion): - - def __init__(self, margin=1, sizeAverage=True): - super(HingeEmbeddingCriterion, self).__init__() - self.margin = margin - self.sizeAverage = sizeAverage - self.buffer = None - - def updateOutput(self, input, y): - if self.buffer is None: - self.buffer = input.new() - self.buffer.resize_as_(input).copy_(input) - self.buffer[torch.eq(y, -1.)] = 0 - self.output = self.buffer.sum().item() - - self.buffer.fill_(self.margin).add_(-1, input) - self.buffer.clamp_(min=0) - self.buffer[torch.eq(y, 1.)] = 0 - self.output = self.output + self.buffer.sum().item() - - if self.sizeAverage: - self.output = self.output / input.nelement() - - return self.output - - def 
updateGradInput(self, input, y): - self.gradInput.resize_as_(input).copy_(y) - self.gradInput[torch.mul(torch.eq(y, -1), torch.gt(input, self.margin))] = 0 - - if self.sizeAverage: - self.gradInput.mul_(1. / input.nelement()) - - return self.gradInput diff --git a/torch/legacy/nn/Identity.py b/torch/legacy/nn/Identity.py deleted file mode 100644 index 1909bf4d1041ce..00000000000000 --- a/torch/legacy/nn/Identity.py +++ /dev/null @@ -1,17 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class Identity(Module): - - def updateOutput(self, input): - self.output = input - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput = gradOutput - return self.gradInput - - def clearState(self): - clear(self, 'gradInput') diff --git a/torch/legacy/nn/Index.py b/torch/legacy/nn/Index.py deleted file mode 100644 index eb9a1f8ff020a7..00000000000000 --- a/torch/legacy/nn/Index.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -from .Module import Module - - -class Index(Module): - - def __init__(self, dimension): - super(Index, self).__init__() - self.dimension = dimension - self.gradInput = [self.gradInput] - - def updateOutput(self, input): - t = input[0] - index = input[1] - torch.index_select(t, self.dimension, index, out=self.output) - return self.output - - def updateGradInput(self, input, gradOutput): - t = input[0] - index = input[1] - - gradInput = self.gradInput[0] # no gradient for the index tensor - gradInput.resize_as_(t).zero_() - gradInput.index_add_(self.dimension, index, gradOutput) - return self.gradInput diff --git a/torch/legacy/nn/JoinTable.py b/torch/legacy/nn/JoinTable.py deleted file mode 100644 index 0031945d081ee4..00000000000000 --- a/torch/legacy/nn/JoinTable.py +++ /dev/null @@ -1,62 +0,0 @@ -import torch -from .Module import Module - - -class JoinTable(Module): - - def __init__(self, dimension): - super(JoinTable, self).__init__() - self.size = torch.Size() - self.dimension = dimension - self.gradInput = [] - - def _getPositiveDimension(self, input): - dimension = self.dimension - if dimension < 0: - dimension = input[0].dim() + dimension - - return dimension - - def updateOutput(self, input): - dim = self._getPositiveDimension(input) - - for i in range(len(input)): - currentOutput = input[i] - if i == 0: - size = list(currentOutput.size()) - else: - size[dim] += currentOutput.size(dim) - - self.size = torch.Size(size) - self.output.resize_(self.size) - - # TODO: use cat? 
- offset = 0 - for i in range(len(input)): - currentOutput = input[i] - self.output.narrow(dim, offset, currentOutput.size(dim)).copy_(currentOutput) - offset += currentOutput.size(dim) - - return self.output - - def updateGradInput(self, input, gradOutput): - dim = self._getPositiveDimension(input) - - for i in range(len(input)): - if len(self.gradInput) < i + 1: - self.gradInput.append(input[i].new()) - self.gradInput[i].resize_as_(input[i]) - self.gradInput = self.gradInput[:len(input)] - - offset = 0 - for i in range(len(input)): - currentOutput = input[i] - currentGradInput = gradOutput.narrow(dim, offset, currentOutput.size(dim)) - self.gradInput[i].copy_(currentGradInput) - offset = offset + currentOutput.size(dim) - - return self.gradInput - - def type(self, type=None, tensorCache=None): - self.gradInput = [] - return super(JoinTable, self).type(type, tensorCache) diff --git a/torch/legacy/nn/L1Cost.py b/torch/legacy/nn/L1Cost.py deleted file mode 100644 index fabbab34861e87..00000000000000 --- a/torch/legacy/nn/L1Cost.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -from .Criterion import Criterion -from .utils import clear - - -class L1Cost(Criterion): - - def __init__(self): - super(L1Cost, self).__init__() - self.output_tensor = torch.Tensor(1) - - def updateOutput(self, input, target=None): - assert target is None - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.L1Cost_updateOutput( - self._backend.library_state, - input, - self.output_tensor - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target=None): - assert target is None - self._backend.L1Cost_updateGradInput( - self._backend.library_state, - input, - None, - self.gradInput - ) - return self.gradInput - - def clearState(self): - clear(self, 'output_tensor') - return super(L1Cost, self).clearState() diff --git a/torch/legacy/nn/L1HingeEmbeddingCriterion.py b/torch/legacy/nn/L1HingeEmbeddingCriterion.py deleted file mode 100644 index aa4dc6c62a7a92..00000000000000 --- a/torch/legacy/nn/L1HingeEmbeddingCriterion.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -from .Criterion import Criterion - - -class L1HingeEmbeddingCriterion(Criterion): - - def __init__(self, margin=1): - super(L1HingeEmbeddingCriterion, self).__init__() - self.margin = float(margin) - self.gradInput = [torch.Tensor(), torch.Tensor()] - - def updateOutput(self, input, y): - self.output = float(input[0].dist(input[1], 1)) - if y == -1: - self.output = max(0, self.margin - self.output) - - return self.output - - def _mathsign(t): - return 1 if x > 0 else -1 - - def updateGradInput(self, input, y): - self.gradInput[0].resize_as_(input[0]) - self.gradInput[1].resize_as_(input[1]) - self.gradInput[0].copy_(input[0]) - self.gradInput[0].add_(-1, input[1]) - dist = self.gradInput[0].norm(1) - self.gradInput[0].sign_() - if y == -1: # just to avoid a mul by 1 - if dist > self.margin: - self.gradInput[0].zero_() - else: - self.gradInput[0].mul_(-1) - - self.gradInput[1].zero_().add_(-1, self.gradInput[0]) - return self.gradInput diff --git a/torch/legacy/nn/L1Penalty.py b/torch/legacy/nn/L1Penalty.py deleted file mode 100644 index 05472d75f6b689..00000000000000 --- a/torch/legacy/nn/L1Penalty.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -from .Module import Module - -# This module acts as an L1 latent state regularizer, adding the -# [gradOutput] to the gradient of the L1 loss. The [input] is copied to -# the [output]. 
- - -class L1Penalty(Module): - - def __init__(self, l1weight, sizeAverage=False, provideOutput=True): - super(L1Penalty, self).__init__() - self.l1weight = l1weight - self.sizeAverage = sizeAverage - self.provideOutput = provideOutput - - def updateOutput(self, input): - m = self.l1weight - if self.sizeAverage: - m = m / input.nelement() - - loss = m * input.norm(1) - self.loss = loss - self.output = input - return self.output - - def updateGradInput(self, input, gradOutput): - m = self.l1weight - if self.sizeAverage: - m = m / input.nelement() - - self.gradInput.resize_as_(input).copy_(input).sign_().mul_(m) - - if self.provideOutput: - self.gradInput.add_(gradOutput) - - return self.gradInput diff --git a/torch/legacy/nn/LeakyReLU.py b/torch/legacy/nn/LeakyReLU.py deleted file mode 100644 index ca3a5cc6b21375..00000000000000 --- a/torch/legacy/nn/LeakyReLU.py +++ /dev/null @@ -1,43 +0,0 @@ -import torch -from .Module import Module - - -class LeakyReLU(Module): - - def __init__(self, negval=1 / 100, inplace=False): - super(LeakyReLU, self).__init__() - if isinstance(negval, bool): - inplace = negval - self.negval = 1 / 100 - else: - self.negval = negval - - # default for inplace is False - self.inplace = inplace - if self.negval < 0: - # TODO: warning here - self.inplace = False - - def updateOutput(self, input): - self._backend.LeakyReLU_updateOutput( - self._backend.library_state, - input, - self.output, - self.negval, - self.inplace - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.LeakyReLU_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.negval, - self.inplace - ) - return self.gradInput - - def __repr__(self): - return str(type(self)) + '({:.4f})'.format(self.negval) diff --git a/torch/legacy/nn/Linear.py b/torch/legacy/nn/Linear.py deleted file mode 100644 index eb69a63e704e44..00000000000000 --- a/torch/legacy/nn/Linear.py +++ /dev/null @@ -1,87 +0,0 @@ -import math -import torch -from .Module import Module -from .utils import clear - - -class Linear(Module): - - def __init__(self, inputSize, outputSize, bias=True): - super(Linear, self).__init__() - self.weight = torch.Tensor(outputSize, inputSize) - self.gradWeight = torch.Tensor(outputSize, inputSize) - self.bias = torch.Tensor(outputSize) if bias else None - self.gradBias = torch.Tensor(outputSize) if bias else None - self.reset() - - self.addBuffer = None - - def noBias(self): - self.bias = None - self.gradBias = None - return self - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. 
/ math.sqrt(self.weight.size(1)) - - self.weight.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.uniform_(-stdv, stdv) - - return self - - def _updateAddBuffer(self, input): - nframe = input.size(0) - if self.addBuffer is None: - self.addBuffer = input.new() - if self.addBuffer.nelement() != nframe: - self.addBuffer.resize_(nframe).fill_(1) - - def updateOutput(self, input): - assert input.dim() == 2 - nframe = input.size(0) - nelement = self.output.nelement() - self.output.resize_(nframe, self.weight.size(0)) - if self.output.nelement() != nelement: - self.output.zero_() - - self._updateAddBuffer(input) - self.output.addmm_(0, 1, input, self.weight.t()) - if self.bias is not None: - self.output.addr_(self.addBuffer, self.bias) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - nelement = self.gradInput.nelement() - self.gradInput.resize_as_(input) - if self.gradInput.nelement() != nelement: - self.gradInput.zero_() - - assert input.dim() == 2 - self.gradInput.addmm_(0, 1, gradOutput, self.weight) - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - assert input.dim() == 2 - self.gradWeight.addmm_(scale, gradOutput.t(), input) - if self.bias is not None: - # update the size of addBuffer if the input is not the same size as the one we had in last updateGradInput - self._updateAddBuffer(input) - self.gradBias.addmv_(scale, gradOutput.t(), self.addBuffer) - - def clearState(self): - clear(self, 'addBuffer') - return super(Linear, self).clearState() - - def __repr__(self): - return super(Linear, self).__repr__() + \ - '({} -> {})'.format(self.weight.size(1), self.weight.size(0)) + \ - (' without bias' if self.bias is None else '') diff --git a/torch/legacy/nn/Log.py b/torch/legacy/nn/Log.py deleted file mode 100644 index 1f5e4bd2066808..00000000000000 --- a/torch/legacy/nn/Log.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch -from .Module import Module - - -class Log(Module): - - def updateOutput(self, input): - self.output.resize_as_(input) - self.output.copy_(input) - self.output.log_() - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_as_(input) - self.gradInput.fill_(1) - self.gradInput.div_(input) - self.gradInput.mul_(gradOutput) - return self.gradInput diff --git a/torch/legacy/nn/LogSigmoid.py b/torch/legacy/nn/LogSigmoid.py deleted file mode 100644 index d6b8761729d6d9..00000000000000 --- a/torch/legacy/nn/LogSigmoid.py +++ /dev/null @@ -1,35 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class LogSigmoid(Module): - - def __init__(self): - super(LogSigmoid, self).__init__() - self.buffer = None - - def updateOutput(self, input): - if self.buffer is None: - self.buffer = input.new() - self._backend.LogSigmoid_updateOutput( - self._backend.library_state, - input, - self.output, - self.buffer - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.LogSigmoid_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.buffer - ) - return self.gradInput - - def clearState(self): - clear(self, 'buffer') - return super(LogSigmoid, self).clearState() diff --git a/torch/legacy/nn/LogSoftMax.py b/torch/legacy/nn/LogSoftMax.py deleted file mode 100644 index 82deda448827db..00000000000000 --- a/torch/legacy/nn/LogSoftMax.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -from .Module import Module - - -class LogSoftMax(Module): - - def 
__init__(self, dim=None): - super(LogSoftMax, self).__init__() - if dim is not None: - self.dim = dim - - def _get_dim(self, input): - return getattr(self, 'dim', 0 if input.dim() == 1 or input.dim() == 3 else 1) - - def updateOutput(self, input): - self.output = torch.log_softmax( - input, - self._get_dim(input) - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput = torch.log_softmax_backward_data( - gradOutput, - self.output, - self._get_dim(input), - input - ) - return self.gradInput diff --git a/torch/legacy/nn/LookupTable.py b/torch/legacy/nn/LookupTable.py deleted file mode 100644 index ed3db0678f16f1..00000000000000 --- a/torch/legacy/nn/LookupTable.py +++ /dev/null @@ -1,152 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class LookupTable(Module): - - def __init__(self, nIndex, nOutput, paddingValue=-1, maxNorm=None, normType=None): - super(LookupTable, self).__init__() - self.weight = torch.Tensor(nIndex, nOutput) - self.gradWeight = torch.Tensor(nIndex, nOutput).zero_() - self.paddingValue = paddingValue - self.maxNorm = maxNorm - self.normType = normType - self.shouldScaleGradByFreq = False - - self._gradOutput = None - self._sorted = None - self._indices = None - - self._count = torch.IntTensor() - self._input = torch.LongTensor() - - self.reset() - - def accUpdateOnly(self): - self.gradWeight = None - return self - - def setPadding(self, paddingValue): - self.paddingValue = paddingValue - return self - - def setMaxNorm(self, maxNorm): - self.maxNorm = maxNorm - return self - - def setNormType(self, normType): - self.normType = normType - return self - - def scaleGradByFreq(self): - self.shouldScaleGradByFreq = True - return self - - def reset(self, stdv=1): - self.weight.normal_(0, stdv) - - def _makeInputContiguous(self, input): - # make sure input is a contiguous torch.LongTensor - if not input.is_contiguous() or input.type() != self._input.type(): - self.copiedInput = True - self._input.resize_(input.size()).copy_(input) - return self._input - else: - self.copiedInput = False - return input - - def updateOutput(self, input): - self.renorm(input) - input = self._makeInputContiguous(input) - if input.dim() == 1: - torch.index_select(self.weight, 0, input, out=self.output) - elif input.dim() == 2: - torch.index_select(self.weight, 0, input.view(-1), out=self.output) - self.output = self.output.view(input.size(0), input.size(1), self.weight.size(1)) - else: - raise RuntimeError("input must be a vector or matrix") - - return self.output - - def updateGradInput(self, input, gradOutput): - # the input can be of any type (as in the forward it's - # converted anyway to LongTensor) thus, need to allocate - # new memory each time the user changes the input type - if self.gradInput.type() != input.type(): - self.gradInput = input.new() - - if not self.gradInput.is_same_size(input): - self.gradInput.resize_as_(input).zero_() - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - input = self._input if self.copiedInput else input - if input.dim() == 2: - input = input.view(-1) - elif input.dim() != 1: - raise RuntimeError("input must be a vector or matrix") - - if not gradOutput.is_contiguous(): - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - gradOutput = self._gradOutput - - self._backend.LookupTable_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - 
self._count, - self._sorted, - self._indices, - self.shouldScaleGradByFreq, - self.paddingValue or 0, - scale - ) - - def renorm(self, input): - if self.maxNorm is None: - return - - # copy input into _input, so _input is continuous. - # The copied _input will be modified in the C code. - self._input.resize_(input.size()).copy_(input) - row_idx = self._input - if row_idx.dim() == 2: - row_idx = row_idx.view(-1) - elif row_idx.dim() != 1: - raise RuntimeError("input must be a vector or matrix") - - # "row_idx" and "weight" will be modified in the C code - self._backend.LookupTable_renorm( - self._backend.library_state, - row_idx, - self.weight, - self.maxNorm, - self.normType or 2 - ) - - def type(self, type=None, tensorCache=None): - if type is None: - return self._type - super(LookupTable, self).type(type, tensorCache) - - if type == 'torch.cuda.FloatTensor': - # CUDA uses _sorted and _indices temporary tensors - self._sorted = torch.cuda.LongTensor() - self._indices = torch.cuda.LongTensor() - self._count = torch.cuda.LongTensor() - self._input = torch.cuda.LongTensor() - else: - # self._count and self._input should only be converted if using Cuda - self._count = torch.IntTensor() - self._input = torch.LongTensor() - - return self - - def clearState(self): - clear(self, '_count', '_input', '_sorted', '_indices', '_gradOutput') - return super(LookupTable, self).clearState() diff --git a/torch/legacy/nn/MM.py b/torch/legacy/nn/MM.py deleted file mode 100644 index e3316eac2917a0..00000000000000 --- a/torch/legacy/nn/MM.py +++ /dev/null @@ -1,72 +0,0 @@ -import torch -from .Module import Module - - -class MM(Module): - - def __init__(self, transA=False, transB=False): - super(MM, self).__init__() - self.transA = transA - self.transB = transB - self.gradInput = [torch.Tensor(), torch.Tensor()] - - def updateOutput(self, input): - assert len(input) == 2 - a, b = input - assert a.ndimension() == 2 or a.ndimension() == 3 - assert a.dim() == b.dim() - - if a.ndimension() == 2: - if self.transA: - a = a.t() - if self.transB: - b = b.t() - self.output.resize_(a.size(0), b.size(1)) - torch.mm(a, b, out=self.output) - else: - if self.transA: - a = a.transpose(1, 2) - if self.transB: - b = b.transpose(1, 2) - - self.output.resize_(a.size(0), a.size(1), b.size(2)) - torch.bmm(a, b, out=self.output) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput[0] is None: - self.gradInput[0] = input[0].new() - if self.gradInput[1] is None: - self.gradInput[1] = input[1].new() - - assert len(input) == 2 - a, b = input - self.gradInput[0].resize_as_(a) - self.gradInput[1].resize_as_(b) - - assert gradOutput.ndimension() == 2 or gradOutput.ndimension() == 3 - assert a.dim() == b.dim() == gradOutput.dim() - - if gradOutput.ndimension() == 2: - h_dim, w_dim = 0, 1 - f = "mm" - else: - h_dim, w_dim = 1, 2 - f = "bmm" - - if self.transA == self.transB: - a = a.transpose(h_dim, w_dim) - b = b.transpose(h_dim, w_dim) - - if self.transA: - getattr(torch, f)(b, gradOutput.transpose(h_dim, w_dim), out=self.gradInput[0]) - else: - getattr(torch, f)(gradOutput, b, out=self.gradInput[0]) - - if self.transB: - getattr(torch, f)(gradOutput.transpose(h_dim, w_dim), a, out=self.gradInput[1]) - else: - getattr(torch, f)(a, gradOutput, out=self.gradInput[1]) - - return self.gradInput diff --git a/torch/legacy/nn/MSECriterion.py b/torch/legacy/nn/MSECriterion.py deleted file mode 100644 index 2079d366c2ce6c..00000000000000 --- a/torch/legacy/nn/MSECriterion.py +++ /dev/null @@ -1,37 +0,0 @@ 
-import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class MSECriterion(Criterion): - - def __init__(self, sizeAverage=True): - super(MSECriterion, self).__init__() - self.sizeAverage = sizeAverage - self.output_tensor = None - - def updateOutput(self, input, target): - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.MSECriterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - implicit_gradOutput = torch.Tensor([1]).type(input.type()) - - self._backend.MSECriterion_updateGradInput( - self._backend.library_state, - input, - target, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - return self.gradInput diff --git a/torch/legacy/nn/MV.py b/torch/legacy/nn/MV.py deleted file mode 100644 index ad9ff4619e41c5..00000000000000 --- a/torch/legacy/nn/MV.py +++ /dev/null @@ -1,67 +0,0 @@ -import torch -from .Module import Module - - -class MV(Module): - """Module to perform matrix vector multiplication on two minibatch inputs, - producing a minibatch. - """ - - def __init__(self, trans=False): - super(MV, self).__init__() - - self.trans = trans - self.gradInput = [torch.Tensor(), torch.Tensor()] - - def updateOutput(self, input): - M, v = input - assert M.ndimension() == 2 or M.ndimension() == 3 - - if M.ndimension() == 2: - assert v.ndimension() == 1 - if self.trans: - M = M.transpose(0, 1) - self.output.resize_(M.size(0)) - torch.mv(M, v, out=self.output) - else: - assert v.ndimension() == 2 - if self.trans: - M = M.transpose(1, 2) - self.output.resize_(M.size(0), M.size(1), 1) - torch.bmm(M, v.view(v.size(0), v.size(1), 1), out=self.output).resize_(M.size(0), M.size(1)) - - return self.output - - def updateGradInput(self, input, gradOutput): - M, v = input - self.gradInput[0].resize_as_(M) - self.gradInput[1].resize_as_(v) - gradOutput = gradOutput.contiguous() - - assert gradOutput.ndimension() == 1 or gradOutput.ndimension() == 2 - - if gradOutput.ndimension() == 2: - assert M.ndimension() == 3 - assert v.ndimension() == 2 - bdim = M.size(0) - odim = M.size(1) - idim = M.size(2) - - if self.trans: - torch.bmm(v.view(bdim, odim, 1), gradOutput.view(bdim, 1, idim), out=self.gradInput[0]) - torch.bmm(M, gradOutput.view(bdim, idim, 1), out=self.gradInput[1].view(bdim, odim, 1)) - else: - torch.bmm(gradOutput.view(bdim, odim, 1), v.view(bdim, 1, idim), out=self.gradInput[0]) - torch.bmm(M.transpose(1, 2), gradOutput.view(bdim, odim, 1), out=self.gradInput[1].view(bdim, idim, 1)) - else: - assert M.ndimension() == 2 - assert v.ndimension() == 1 - - if self.trans: - torch.ger(v, gradOutput, out=self.gradInput[0]) - self.gradInput[1] = M * gradOutput - else: - torch.ger(gradOutput, v, out=self.gradInput[0]) - self.gradInput[1] = M.t() * gradOutput - - return self.gradInput diff --git a/torch/legacy/nn/MarginCriterion.py b/torch/legacy/nn/MarginCriterion.py deleted file mode 100644 index 0af79935bcb92b..00000000000000 --- a/torch/legacy/nn/MarginCriterion.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -from .Criterion import Criterion - - -class MarginCriterion(Criterion): - - def __init__(self, margin=1, sizeAverage=True): - super(MarginCriterion, self).__init__() - self.sizeAverage = True - self.margin = margin - 
self.output_tensor = None - - def updateOutput(self, input, target): - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.MarginCriterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - self.sizeAverage, - self.margin - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - self._backend.MarginCriterion_updateGradInput( - self._backend.library_state, - input, - target, - self.gradInput, - self.sizeAverage, - self.margin - ) - return self.gradInput diff --git a/torch/legacy/nn/MarginRankingCriterion.py b/torch/legacy/nn/MarginRankingCriterion.py deleted file mode 100644 index 9b8f5128e3ca92..00000000000000 --- a/torch/legacy/nn/MarginRankingCriterion.py +++ /dev/null @@ -1,75 +0,0 @@ -import torch -from .Criterion import Criterion - - -class MarginRankingCriterion(Criterion): - - def __init__(self, margin=0, sizeAverage=True): - super(MarginRankingCriterion, self).__init__() - self.margin = margin - self.sizeAverage = sizeAverage - self.gradInput = [torch.Tensor(), torch.Tensor()] - - self._output = None - self.dist = None - self.mask = None - - def updateOutput(self, input, y): - if input[0].size(0) == 1: - self.output = max(0, -y * (input[0][0] - input[1][0]) + self.margin) - else: - if self._output is None: - self._output = input[0].clone() - self._output.resize_as_(input[0]) - self._output.copy_(input[0]) - - self._output.add_(-1, input[1]) - self._output.mul_(-1).mul_(y) - self._output.add_(self.margin) - - self._output.clamp_(min=0) - - self.output = self._output.sum().item() - - if self.sizeAverage: - self.output = self.output / y.size(0) - - return self.output - - def updateGradInput(self, input, y): - if input[0].size(0) == 1: - dist = -y * (input[0][0] - input[1][0]) + self.margin - if dist < 0: - self.gradInput[0][0] = 0 - self.gradInput[1][0] = 0 - else: - self.gradInput[0][0] = -y - self.gradInput[1][0] = y - else: - if self.dist is None: - self.dist = input[0].new() - self.dist = self.dist.resize_as_(input[0]).copy_(input[0]) - dist = self.dist - - dist.add_(-1, input[1]) - dist.mul_(-1).mul_(y) - dist.add_(self.margin) - - self.mask = dist > 0 - mask = self.mask - - torch.ge(dist, 0, out=mask) - - self.gradInput[0].resize_(dist.size()) - self.gradInput[1].resize_(dist.size()) - - self.gradInput[0].copy_(mask) - self.gradInput[0].mul_(-1).mul_(y) - self.gradInput[1].copy_(mask) - self.gradInput[1].mul_(y) - - if self.sizeAverage: - self.gradInput[0].div_(y.size(0)) - self.gradInput[1].div_(y.size(0)) - - return self.gradInput diff --git a/torch/legacy/nn/MaskedSelect.py b/torch/legacy/nn/MaskedSelect.py deleted file mode 100644 index a804883a7a49a5..00000000000000 --- a/torch/legacy/nn/MaskedSelect.py +++ /dev/null @@ -1,64 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class MaskedSelect(Module): - - def __init__(self): - super(MaskedSelect, self).__init__() - self._maskIndices = torch.LongTensor() - self._maskIndexBuffer = torch.LongTensor() - self._maskIndexBufferCPU = torch.FloatTensor() - self._gradBuffer = torch.Tensor() - self._gradMask = torch.ByteTensor() - - def updateOutput(self, input): - input, mask = input - torch.masked_select(input, mask, out=self.output) - return self.output - - def updateGradInput(self, input, gradOutput): - input, mask = input - if input.type() == 'torch.cuda.FloatTensor': - torch.arange(0, mask.nelement(), out=self._maskIndexBufferCPU).resize_(mask.size()) - 
self._maskIndexBuffer.resize_(self._maskIndexBufferCPU.size()).copy_(self._maskIndexBufferCPU) - else: - torch.arange(0, mask.nelement(), out=self._maskIndexBuffer).resize_(mask.size()) - - torch.masked_select(self._maskIndexBuffer, mask, out=self._maskIndices) - self._gradBuffer.resize_(input.nelement()).zero_() - self._gradBuffer.scatter_(0, self._maskIndices, gradOutput) - self._gradBuffer.resize_(input.size()) - self.gradInput = [self._gradBuffer, self._gradMask.resize_(mask.size()).fill_(0)] - return self.gradInput - - def type(self, type=None, tensorCache=None): - if type is None: - return self._type - - self._gradBuffer = self._gradBuffer.type(type) - self.gradInput = self.gradInput.type(type) - self.output = self.output.type(type) - - # These casts apply when switching between cuda/non-cuda types - if type != 'torch.cuda.FloatTensor': - self._maskIndexBuffer = self._maskIndexBuffer.long() - self._maskIndices = self._maskIndices.long() - self._gradMask = self._gradMask.byte() - else: - self._maskIndexBuffer = self._maskIndexBuffer.cuda() - self._maskIndices = self._maskIndices.cuda() - self._gradMask = self._gradMask.cuda() - - self._type = type - return self - - def clearState(self): - return clear(self, ['output', - 'gradInput', - '_maskIndexBuffer', - '_maskIndexBufferCPU', - '_maskIndices', - '_gradBuffer', - '_gradMask']) diff --git a/torch/legacy/nn/Max.py b/torch/legacy/nn/Max.py deleted file mode 100644 index aa03dc503c59b9..00000000000000 --- a/torch/legacy/nn/Max.py +++ /dev/null @@ -1,67 +0,0 @@ -import torch -from .Module import Module -from .utils import clear, addSingletondimension - - -class Max(Module): - - def __init__(self, dimension=0): - super(Max, self).__init__() - self.dimension = dimension - self._output = None - self._indices = None - - def _getPositiveDimension(self, input): - dimension = self.dimension - if dimension < 0: - dimension = input.dim() + dimension - - return dimension - - def _lazyInit(self): - if self._output is None: - self._output = self.output.new() - if self._indices is None: - self._indices = \ - (torch.cuda.LongTensor() if self.output.is_cuda else torch.LongTensor()) - - def updateOutput(self, input): - self._lazyInit() - dimension = self._getPositiveDimension(input) - torch.max(input, dimension, out=(self._output, self._indices), keepdim=True) - if input.dim() > 1: - self.output.set_(self._output.select(dimension, 0)) - else: - self.output.set_(self._output) - - return self.output - - def updateGradInput(self, input, gradOutput): - self._lazyInit() - dimension = self._getPositiveDimension(input) - if input.dim() > 1: - gradOutputView = addSingletondimension(gradOutput, dimension) - else: - gradOutputView = gradOutput - - self.gradInput.resize_as_(input).zero_().scatter_(dimension, self._indices, gradOutputView) - return self.gradInput - - def type(self, type, tensorCache=None): - # torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor. - if type == 'torch.cuda.FloatTensor': - indices, self._indices = self._indices, None - super(Max, self).type(type, tensorCache) - self._indices = indices.type('torch.cuda.LongTensor') if indices is not None else None - else: - # self._indices must be a LongTensor. Setting it to nil temporarily avoids - # unnecessary memory allocations. 
- indices, self._indices = self._indices, None - super(Max, self).type(type, tensorCache) - self._indices = indices.long() if indices is not None else None - - return self - - def clearState(self): - clear(self, '_indices', '_output') - return super(Max, self).clearState() diff --git a/torch/legacy/nn/Mean.py b/torch/legacy/nn/Mean.py deleted file mode 100644 index 67048d2aa6a7e9..00000000000000 --- a/torch/legacy/nn/Mean.py +++ /dev/null @@ -1,16 +0,0 @@ -import torch -from .Sum import Sum - -""" - -This file is still here because of backward compatibility. - -Please use instead "nn.Sum(dimension, nInputDims, sizeAverage)" - -""" - - -class Mean(Sum): - - def __init__(self, dimension): - super(Mean, self).__init__(dimension, True) diff --git a/torch/legacy/nn/Min.py b/torch/legacy/nn/Min.py deleted file mode 100644 index b7bdbcaebded5d..00000000000000 --- a/torch/legacy/nn/Min.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch -from .Module import Module -from .utils import clear, addSingletondimension - - -class Min(Module): - - def __init__(self, dimension=0): - super(Min, self).__init__() - self.dimension = dimension - self._output = None - self._indices = None - - def _getPositiveDimension(self, input): - dimension = self.dimension - if dimension < 0: - dimension = input.dim() + dimension - - return dimension - - def _lazyInit(self): - if self._output is None: - self._output = self.output.new() - if self._indices is None: - self._indices = \ - (torch.cuda.LongTensor() if self.output.type() == 'torch.cuda.FloatTensor' - else torch.LongTensor()) - - def updateOutput(self, input): - self._lazyInit() - dimension = self._getPositiveDimension(input) - torch.min(input, dimension, out=(self._output, self._indices), keepdim=True) - if input.dim() > 1: - self.output.set_(self._output.select(dimension, 0)) - else: - self.output.set_(self._output) - - return self.output - - def updateGradInput(self, input, gradOutput): - self._lazyInit() - dimension = self._getPositiveDimension(input) - if input.dim() > 1: - gradOutputView = addSingletondimension(gradOutput, dimension) - else: - gradOutputView = gradOutput - - self.gradInput.resize_as_(input).zero_().scatter_(dimension, self._indices, gradOutputView) - return self.gradInput - - def type(self, type, tensorCache=None): - # torch.min expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor. - if type == 'torch.cuda.FloatTensor': - indices, self._indices = self._indices, None - super(Min, self).type(type, tensorCache) - self._indices = indices.type('torch.cuda.LongTensor') if indices is not None else None - else: - # self._indices must be a LongTensor. Setting it to nil temporarily avoids - # unnecessary memory allocations. 
- indices, self._indices = self._indices, None - super(Min, self).type(type, tensorCache) - self._indices = indices.long() if indices is not None else None - - return self - - def clearState(self): - clear(self, '_indices', '_output') - return super(Min, self).clearState() diff --git a/torch/legacy/nn/MixtureTable.py b/torch/legacy/nn/MixtureTable.py deleted file mode 100644 index 541ddd81852aed..00000000000000 --- a/torch/legacy/nn/MixtureTable.py +++ /dev/null @@ -1,168 +0,0 @@ -import torch -from .Module import Module -from .utils import clear, recursiveResizeAs - - -class MixtureTable(Module): - - def __init__(self, dim=1): - super(MixtureTable, self).__init__() - self.dim = dim - self.size = torch.Size() - self.size2 = torch.Size() - self.batchSize = 0 - self.backwardSetup = False - self.gradInput = [] - - self._gaterView = None - self._expert = None - self._expertView = None - self._sum = None - self._expertView2 = None - self._expert2 = None - self.table = False - - def updateOutput(self, input): - gaterInput, expertInputs = input - - # buffers - if self._gaterView is None: - self._gaterView = input[0].new() - if self._expert is None: - self._expert = input[0].new() - if self._expertView is None: - self._expertView = input[0].new() - - self.dimG = 1 - batchSize = gaterInput.size(0) - - if self.table or isinstance(expertInputs, list): - self.table = True - if gaterInput.size(self.dimG) != len(expertInputs): - raise RuntimeError("Should be one gater output per expert") - - expertInput = expertInputs[0] - if self.batchSize != batchSize: - size = [1] * (expertInput.dim() + 1) - if self.dimG > 0: - size[0] = gaterInput.size(0) - size[self.dim] = gaterInput.size(self.dimG) - self.size = torch.Size(size) - self.output.resize_as_(expertInput) - self.backwardSetup = False - self.batchSize = batchSize - - self._gaterView = gaterInput.view(self.size) - self.output.zero_() - # multiply accumulate gater outputs by their commensurate expert - for i, expertInput in enumerate(expertInputs): - gate = self._gaterView.select(self.dim, i).expand_as(expertInput) - self.output.addcmul_(expertInput, gate) - else: - if self.batchSize != batchSize: - size = [1] * expertInputs.dim() - if self.dimG > 0: - size[0] = gaterInput.size(0) - size[self.dim] = gaterInput.size(self.dimG) - self.size = torch.Size(size) - self.output.resize_as_(expertInputs.select(self.dim, 0)) - self.batchSize = batchSize - self.backwardSetup = False - - self._gaterView = gaterInput.view(self.size) - torch.mul(self._gaterView.expand_as(expertInputs), expertInputs, out=self._expert) - torch.sum(self._expert, self.dim, True, out=self.output) - self.output.resize_as_(expertInputs.select(self.dim, 0)) - - return self.output - - def updateGradInput(self, input, gradOutput): - gaterInput, expertInputs = input - recursiveResizeAs(self.gradInput, input) - gaterGradInput, expertGradInputs = self.gradInput - - # buffers - if self._sum is None: - self._sum = input[0].new() - if self._expertView2 is None: - self._expertView2 = input[0].new() - if self._expert2 is None: - self._expert2 = input[0].new() - - if self.table: - if not self.backwardSetup: - for i, expertInput in enumerate(expertInputs): - expertGradInput = expertGradInputs[i] or expertInput.clone() - expertGradInput.resize_as_(expertInput) - expertGradInputs[i] = expertGradInput - - gaterGradInput.resize_as_(gaterInput) - self.backwardSetup = True - - # like CMulTable, but with broadcasting - for i, expertGradInput in enumerate(expertGradInputs): - # gater updateGradInput - 
torch.mul(gradOutput, expertInputs[i], out=self._expert) - if self.dimG == 0: - self._expertView = self._expert.view(-1) - else: - self._expertView = self._expert.view(gradOutput.size(0), -1) - - torch.sum(self._expertView, self.dimG, True, out=self._sum) - if self.dimG == 0: - gaterGradInput[i] = self._sum.select(self.dimG, 0) - else: - gaterGradInput.select(self.dimG, i).copy_(self._sum.select(self.dimG, 0)) - - # expert updateGradInput - gate = self._gaterView.select(self.dim, i).expand_as(expertGradInput) - expertGradInput.mul_(gate, gradOutput) - else: - if not self.backwardSetup: - size2 = list(expertInputs.size()) - size2[self.dim] = 1 - self.size2 = torch.Size(size2) - gaterGradInput.resize_as_(gaterInput) - self.backwardSetup = True - - # gater updateGradInput - self._expertView = gradOutput.contiguous().view(torch.Size(self.size2)) - gradOutput = self._expertView.expand_as(expertInputs) - torch.mul(gradOutput, expertInputs, out=self._expert) - expert = self._expert.transpose(self.dim, self.dimG) - if not expert.is_contiguous(): - self._expert2.resize_as_(expert) - self._expert2.copy_(expert) - expert = self._expert2 - if self.dimG == 0: - self._expertView2 = expert.view(gaterInput.size(0), -1) - else: - self._expertView2 = expert.view(gaterInput.size(0), gaterInput.size(1), -1) - - torch.sum(self._expertView2, self.dimG + 1, True, out=gaterGradInput) - gaterGradInput.resize_as_(gaterInput) - - # expert updateGradInput - torch.mul(self._gaterView.expand_as(expertInputs), gradOutput, out=expertGradInputs) - - return self.gradInput - - def type(self, type, tensorCache=None): - self._gaterView = None - self._expert = None - self._expertView = None - self._sum = None - self._expert2 = None - self._expertView2 = None - return super(MixtureTable, self).type(type, tensorCache) - - def clearState(self, ): - clear(self, [ - '_gaterView', - '_expert', - '_expertView', - '_sum', - '_expert2', - '_expertView2', - ]) - return super(MixtureTable, self).clearState() diff --git a/torch/legacy/nn/Module.py b/torch/legacy/nn/Module.py deleted file mode 100644 index 4c43cf801f39a8..00000000000000 --- a/torch/legacy/nn/Module.py +++ /dev/null @@ -1,296 +0,0 @@ -import torch -import torch._thnn -from .utils import clear, recursiveType - - -class Module(object): - - def __init__(self): - self.gradInput = torch.Tensor() - self.output = torch.Tensor() - self._type = self.output.type() - self._backend = torch._thnn.type2backend[self.output.type()] - - def __repr__(self): - return 'nn.' 
+ self.__class__.__name__ - - def parameters(self): - has_weight = hasattr(self, 'weight') and self.weight is not None - has_bias = hasattr(self, 'bias') and self.bias is not None - if has_weight and has_bias: - return [self.weight, self.bias], [self.gradWeight, self.gradBias] - elif has_weight: - return [self.weight], [self.gradWeight] - elif has_bias: - return [self.bias], [self.gradBias] - else: - return - - def updateOutput(self, input): - return self.output - - def forward(self, input): - return self.updateOutput(input) - - def backward(self, input, gradOutput, scale=1): - self.updateGradInput(input, gradOutput) - self.accGradParameters(input, gradOutput, scale) - return self.gradInput - - def backwardUpdate(self, input, gradOutput, lr): - self.updateGradInput(input, gradOutput) - self.accUpdateGradParameters(input, gradOutput, lr) - return self.gradInput - - def updateGradInput(self, input, gradOutput): - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - pass - - def accUpdateGradParameters(self, input, gradOutput, lr): - has_weight = hasattr(self, 'weight') and self.weight is not None - has_bias = hasattr(self, 'bias') and self.bias is not None - if has_weight: - gradWeight = self.gradWeight - self.gradWeight = self.weight - if has_bias: - gradBias = self.gradBias - self.gradBias = self.bias - self.accGradParameters(input, gradOutput, -lr) - if has_weight: - self.gradWeight = gradWeight - if has_bias: - self.gradBias = gradBias - - def sharedAccUpdateGradParameters(self, input, gradOutput, lr): - if self.parameters(): - self.zeroGradParameters() - self.accGradParameters(input, gradOutput, 1) - self.updateParameters(lr) - - def zeroGradParameters(self): - params = self.parameters() - if params is not None: - for grad in params[1]: - grad.zero_() - - def updateParameters(self, learningRate): - if self.parameters() is not None: - params, gradParams = self.parameters() - if params: - for p, gp in zip(params, gradParams): - p.add_(-learningRate, gp) - - def training(self): - self.train = True - - def evaluate(self): - self.train = False - - # TODO - def share(self, mlp, *arg): - raise NotImplementedError - - def clone(self, *arg): - raise NotImplementedError - - def type(self, type=None, tensorCache=None): - if type is None: - return self._type - - tensorCache = tensorCache or {} - - # find all tensors and convert them - for key, param in self.__dict__.items(): - setattr(self, key, recursiveType(param, type, tensorCache)) - - self._backend = torch._thnn.type2backend[type] - self._type = type - return self - - def float(self, *args): - return self.type('torch.FloatTensor', *args) - - def double(self, *args): - return self.type('torch.DoubleTensor', *args) - - def cuda(self, *args): - return self.type('torch.cuda.FloatTensor', *args) - - def reset(self): - pass - - def write(self, f): - raise NotImplementedError - - def read(self, f): - raise NotImplementedError - - # This function is not easy to understand. It works as follows: - # - # - gather all parameter tensors for this module (and children); - # count all parameter values (floats) - # - create one ginormous memory area (Storage object) with room for all - # parameters - # - remap each parameter tensor to point to an area within the ginormous - # Storage, and copy it there - # - # It has the effect of making all parameters point to the same memory area, - # which is then returned.
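# (A hedged miniature of the steps above: flattening, say, a (2, 3) weight
#  and a (3,) bias allocates one 9-element Storage, then remaps the weight
#  to offset 0 and the bias to offset 6, so a single flat tensor aliases
#  every parameter value.)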
- # - # The purpose is to allow operations over all parameters (such as momentum - # updates and serialization), but it assumes that all parameters are of - # the same type (and, in the case of CUDA, on the same device), which - # is not always True. Use for_each() to iterate over this module and - # children instead. - # - # Module._flattenTensorBuffer can be used by other packages (e.g. cunn) - # to specify the type of temporary buffers. For example, the temporary - # buffers for CudaTensor could be FloatTensor, to avoid GPU memory usage. - # - # TODO: This logically belongs to torch.Tensor, not nn. - _flattenTensorBuffer = {} - - def _flatten(self, parameters=[]): - - # returns True if tensor occupies a contiguous region of memory (no holes) - def isCompact(tensor): - # isn't it enough to check if strides == size.cumprod(0)? - sortedStride, perm = torch.sort(torch.LongTensor(tensor.stride()), 0, True) - sortedSize = torch.LongTensor(list(tensor.size())).index_select(0, perm) - nRealDim = int(torch.clamp(sortedStride, 0, 1).sum()) - sortedStride = sortedStride.narrow(0, 0, nRealDim).clone() - sortedSize = sortedSize.narrow(0, 0, nRealDim).clone() - t = tensor.new().set_(tensor.storage(), 0, - tuple(sortedSize), - tuple(sortedStride)) - return t.is_contiguous() - - if not parameters: - return torch.Tensor() - - Tensor = parameters[0].new - BufferTensor = Module._flattenTensorBuffer.get(type(parameters[0]), Tensor) - - # 1. construct the set of all unique storages referenced by parameter tensors - storages = {} - num_parameters = 0 - parameterMeta = [] - for i, param in enumerate(parameters): - storage = param.storage() - key = storage._cdata - - if key not in storages: - storages[key] = (storage, num_parameters) - num_parameters = num_parameters + storage.size() - - parameterMeta.append({ - 'storage_offset': param.storage_offset() + storages[key][1], - 'size': param.size(), - 'stride': param.stride() - }) - - # 2. construct a single tensor that will hold all the parameters - flatParameters = BufferTensor(num_parameters).zero_() - - # 3. determine if there are elements in the storage that none of the - # parameter tensors reference ('holes') - tensorsCompact = True - for meta in parameterMeta: - tmp = BufferTensor().set_(flatParameters.storage(), meta['storage_offset'], meta['size'], meta['stride']) - tmp.fill_(1) - tensorsCompact = tensorsCompact and isCompact(tmp) - - maskParameters = flatParameters.byte().clone() - compactOffsets = flatParameters.long().cumsum(0) - used_parameters = compactOffsets[-1] - - # 4. copy storages into the flattened parameter tensor - for storageAndOffset in storages.values(): - storage, offset = storageAndOffset - flatParameters[slice(offset, offset + storage.size())].copy_(Tensor().set_(storage)) - - # 5. allow garbage collection - storages = None - for param in parameters: - param.set_() - - # 6. compact the flattened parameters if there were holes - if used_parameters != num_parameters: - assert tensorsCompact - - flatParameters = BufferTensor(used_parameters).copy_( - flatParameters.masked_select(maskParameters)) - for meta in parameterMeta: - meta['storage_offset'] = compactOffsets[meta['storage_offset']] - - if BufferTensor != Tensor: - flatParameters = Tensor(flatParameters.nelement()).copy_(flatParameters) - - # 7. 
fix up the parameter tensors to point at the flattened parameters - for param, meta in zip(parameters, parameterMeta): - param.set_(flatParameters.storage(), - meta['storage_offset'], - meta['size'], - meta['stride']) - - return flatParameters - - def flattenParameters(self): - _params = self.parameters() - if _params is None: - return - parameters, gradParameters = _params - p, g = self._flatten(parameters), self._flatten(gradParameters) - - assert p.nelement() == g.nelement() - if parameters: - for param, grad in zip(parameters, gradParameters): - assert param.storage_offset() == grad.storage_offset() - - return p, g - - def apply(self, callback): - callback(self) - if hasattr(self, 'modules'): - for module in self.modules: - module.apply(callback) - - def findModules(self, cls, container=None): - nodes = [] - containers = [] - if isinstance(self, cls): - nodes.append(self) - containers.append(container) - - # Recurse on nodes with 'modules' - if hasattr(self, 'modules'): - for child in self.modules: - child_nodes, child_containers = child.findModules(cls, self) - assert len(child_nodes) == len(child_containers) - # add the list items from our child to our list (i.e. return a - # flattened table of the return nodes). - nodes.extend(child_nodes) - containers.extend(child_containers) - - return nodes, containers - - def listModules(self): - # include self first - modules = [self] - if hasattr(self, 'modules'): - for child in self.modules: - modules.extend(child.listModules()) - return modules - - def clearState(self): - return clear(self, 'output', 'gradInput') - - def replace(self, callback): - out = callback(self) - # TODO: not out.modules? - if hasattr(self, 'modules'): - for i, module in enumerate(self.modules): - self.modules[i] = module.replace(callback) - return out diff --git a/torch/legacy/nn/Mul.py b/torch/legacy/nn/Mul.py deleted file mode 100644 index 7e7e33641fea37..00000000000000 --- a/torch/legacy/nn/Mul.py +++ /dev/null @@ -1,33 +0,0 @@ -import math -import torch -from .Module import Module - - -class Mul(Module): - - def __init__(self): - super(Mul, self).__init__() - self.weight = torch.Tensor(1) - self.gradWeight = torch.Tensor(1) - self.reset() - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. 
/ math.sqrt(self.weight.size(0)) - self.weight.uniform_(-stdv, stdv) - - def updateOutput(self, input): - self.output.resize_as_(input).copy_(input) - self.output.mul_(self.weight[0]) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_as_(input).zero_() - self.gradInput.add_(self.weight[0], gradOutput) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self.gradWeight[0] = (self.gradWeight[0] + - scale * input.contiguous().view(-1).dot(gradOutput.contiguous().view(-1))) diff --git a/torch/legacy/nn/MulConstant.py b/torch/legacy/nn/MulConstant.py deleted file mode 100644 index 6652ffbaac45e7..00000000000000 --- a/torch/legacy/nn/MulConstant.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -from .Module import Module - - -class MulConstant(Module): - - def __init__(self, constant_scalar, inplace=False): - super(MulConstant, self).__init__() - self.constant_scalar = constant_scalar - self.inplace = inplace - - def updateOutput(self, input): - if self.inplace: - input.mul_(self.constant_scalar) - self.output.set_(input) - else: - self.output.resize_as_(input) - self.output.copy_(input) - self.output.mul_(self.constant_scalar) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - if self.inplace: - gradOutput.mul_(self.constant_scalar) - self.gradInput.set_(gradOutput) - # restore previous input value - input.div_(self.constant_scalar) - else: - self.gradInput.resize_as_(gradOutput) - self.gradInput.copy_(gradOutput) - self.gradInput.mul_(self.constant_scalar) - - return self.gradInput diff --git a/torch/legacy/nn/MultiCriterion.py b/torch/legacy/nn/MultiCriterion.py deleted file mode 100644 index 455b32cf928b1e..00000000000000 --- a/torch/legacy/nn/MultiCriterion.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch -from .Criterion import Criterion -from .utils import recursiveResizeAs, recursiveFill, recursiveAdd - - -class MultiCriterion(Criterion): - - def __init__(self, ): - super(MultiCriterion, self).__init__() - self.criterions = [] - self.weights = torch.DoubleStorage() - - def add(self, criterion, weight=1): - self.criterions.append(criterion) - new_weights = torch.DoubleStorage(len(self.criterions)) - for i, v in enumerate(self.weights): - new_weights[i] = v - new_weights[len(self.criterions) - 1] = weight - self.weights = new_weights - return self - - def updateOutput(self, input, target): - self.output = 0 - for i in range(len(self.criterions)): - self.output = self.output + self.weights[i] * self.criterions[i].updateOutput(input, target) - - return self.output - - def updateGradInput(self, input, target): - self.gradInput = recursiveResizeAs(self.gradInput, input)[0] - recursiveFill(self.gradInput, 0) - for i in range(len(self.criterions)): - recursiveAdd(self.gradInput, self.weights[i], self.criterions[i].updateGradInput(input, target)) - - return self.gradInput - - def type(self, type): - for criterion in self.criterions: - criterion.type(type) - - return super(MultiCriterion, self).type(type) diff --git a/torch/legacy/nn/MultiLabelMarginCriterion.py b/torch/legacy/nn/MultiLabelMarginCriterion.py deleted file mode 100644 index 9ca2a233efdf99..00000000000000 --- a/torch/legacy/nn/MultiLabelMarginCriterion.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class MultiLabelMarginCriterion(Criterion): - - def __init__(self, sizeAverage=True): - 
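# (For reference, hedged from the public PyTorch docs rather than anything
#  in this file: for a 1-D input x and an index target y padded with -1,
#  the multilabel margin loss is
#      loss(x, y) = sum_{j,i} max(0, 1 - (x[y[j]] - x[i])) / x.size(0),
#  summed over target positions j and non-target positions i.)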
super(MultiLabelMarginCriterion, self).__init__() - self.sizeAverage = sizeAverage - self.isTarget = torch.Tensor() - self.output_tensor = None - - def updateOutput(self, input, target): - if self.output_tensor is None: - self.output_tensor = input.new(1) - target = target.long() - self._backend.MultiLabelMarginCriterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - self.isTarget, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - target = target.long() - implicit_gradOutput = torch.ones(1).type_as(input) - self._backend.MultiLabelMarginCriterion_updateGradInput( - self._backend.library_state, - input, - target, - implicit_gradOutput, - self.gradInput, - self.isTarget, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - return self.gradInput diff --git a/torch/legacy/nn/MultiLabelSoftMarginCriterion.py b/torch/legacy/nn/MultiLabelSoftMarginCriterion.py deleted file mode 100644 index 59b2b29b861b17..00000000000000 --- a/torch/legacy/nn/MultiLabelSoftMarginCriterion.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch -from .Criterion import Criterion -from .Sigmoid import Sigmoid -from .BCECriterion import BCECriterion - - -class MultiLabelSoftMarginCriterion(Criterion): - """ - A MultiLabel multiclass criterion based on sigmoid: - - the loss is: - l(x, y) = - sum_i y[i] * log(p[i]) + (1 - y[i]) * log (1 - p[i]) - where p[i] = exp(x[i]) / (1 + exp(x[i])) - - and with weights: - l(x, y) = - sum_i weights[i] (y[i] * log(p[i]) + (1 - y[i]) * log (1 - p[i])) - - - """ - - def __init__(self, weights=None): - super(MultiLabelSoftMarginCriterion, self).__init__() - self.lsm = Sigmoid() - self.nll = BCECriterion(weights) - - def updateOutput(self, input, target): - input = input if input.nelement() == 1 else input.squeeze() - target = target if target.nelement() == 1 else target.squeeze() - self.lsm.updateOutput(input) - self.nll.updateOutput(self.lsm.output, target) - self.output = self.nll.output - return self.output - - def updateGradInput(self, input, target): - size = input.size() - input = input if input.nelement() == 1 else input.squeeze() - target = target if target.nelement() == 1 else target.squeeze() - self.nll.updateGradInput(self.lsm.output, target) - self.lsm.updateGradInput(input, self.nll.gradInput) - self.gradInput = self.lsm.gradInput.view(size) - return self.gradInput diff --git a/torch/legacy/nn/MultiMarginCriterion.py b/torch/legacy/nn/MultiMarginCriterion.py deleted file mode 100644 index cc9835c3395f99..00000000000000 --- a/torch/legacy/nn/MultiMarginCriterion.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class MultiMarginCriterion(Criterion): - - def __init__(self, p=1, weights=None, margin=1, sizeAverage=True): - super(MultiMarginCriterion, self).__init__() - if p != 1 and p != 2: - raise ValueError("only p == 1 and p == 2 supported") - self.p = p - self.margin = margin - self.sizeAverage = sizeAverage - if weights is not None: - assert weights.dim() == 1 - self.weights = weights - self.output_tensor = None - - def updateOutput(self, input, target): - if self.output_tensor is None: - self.output_tensor = input.new(1) - target = target.long() - self._backend.MultiMarginCriterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - 
_Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - self.p, - self.weights, - self.margin, - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - target = target.long() - implicit_gradOutput = torch.ones(1).type_as(input) - self._backend.MultiMarginCriterion_updateGradInput( - self._backend.library_state, - input, - target, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - self.p, - self.weights, - self.margin, - ) - return self.gradInput diff --git a/torch/legacy/nn/Narrow.py b/torch/legacy/nn/Narrow.py deleted file mode 100644 index 419be6cb2bca5a..00000000000000 --- a/torch/legacy/nn/Narrow.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch -from .Module import Module - - -class Narrow(Module): - - def __init__(self, dimension, offset, length=1): - super(Narrow, self).__init__() - self.dimension = dimension - self.index = offset - self.length = length - - def updateOutput(self, input): - length = self.length - if length < 0: - length = input.size(self.dimension) - self.index + self.length + 1 - - output = input.narrow(self.dimension, self.index, length) - self.output = self.output.type_as(output) - self.output.resize_as_(output).copy_(output) - return self.output - - def updateGradInput(self, input, gradOutput): - length = self.length - if length < 0: - length = input.size(self.dimension) - self.index + self.length + 1 - - self.gradInput = self.gradInput.type_as(input) - self.gradInput.resize_as_(input).zero_() - self.gradInput.narrow(self.dimension, self.index, length).copy_(gradOutput) - return self.gradInput diff --git a/torch/legacy/nn/NarrowTable.py b/torch/legacy/nn/NarrowTable.py deleted file mode 100644 index 48d8a03f55bb6e..00000000000000 --- a/torch/legacy/nn/NarrowTable.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch -from .Module import Module -from .utils import clear, recursiveResizeAs, recursiveFill - - -class NarrowTable(Module): - - def __init__(self, offset, length=1): - super(NarrowTable, self).__init__() - self.offset = offset - self.length = length - self.output = [] - self.gradInput = [] - - def updateOutput(self, input): - self.output[:] = [input[self.offset + i] for i in range(self.length)] - return self.output - - def updateGradInput(self, input, gradOutput): - if len(self.gradInput) != len(input): - self.gradInput[:] = [None for i in range(len(input))] - - assert len(gradOutput) == self.length - for i in range(self.length): - self.gradInput[self.offset + i] = gradOutput[i] - - for i in range(len(input)): - if i < self.offset or i >= self.offset + self.length: - gi = self.gradInput[i] - if gi is None: - gi = input[i].new() - self.gradInput[i] = recursiveResizeAs(gi, input[i])[0] - recursiveFill(self.gradInput[i], 0) - - return self.gradInput - - def type(self, type=None, tensorCache=None): - if not type: - return self._type - clear(self, 'output', 'gradInput') - return super(NarrowTable, self).type(self, type, tensorCache) diff --git a/torch/legacy/nn/Normalize.py b/torch/legacy/nn/Normalize.py deleted file mode 100644 index 1704bdf32b318a..00000000000000 --- a/torch/legacy/nn/Normalize.py +++ /dev/null @@ -1,155 +0,0 @@ -import torch -from torch._six import inf -from .Module import Module -from .utils import clear - - -class Normalize(Module): - - def __init__(self, p, eps=1e-10): - super(Normalize, self).__init__() - assert p > 0 - self.p = p - self.eps = eps - - self._output = None - self.norm = None - 
self.buffer = None - self._indices = None - self.normp = None - self._gradInput = None - self.cross = None - self.buffer2 = None - - def updateOutput(self, input): - assert input.dim() == 2 - input_size = input.size() - - if self._output is None: - self._output = input.new() - if self.norm is None: - self.norm = input.new() - if self.buffer is None: - self.buffer = input.new() - - self._output.resize_as_(input) - - # specialization for the infinity norm - if self.p == inf: - if not self._indices: - self._indices = torch.cuda.FloatTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' \ - else torch.LongTensor() - - torch.abs(input, out=self.buffer) - torch.max(self._indices, self.buffer, 1, out=self.norm, keepdim=True) - self.norm.add_(self.eps) - else: - if self.normp is None: - self.normp = input.new() - if self.p % 2 != 0: - torch.abs(input, out=self.buffer).pow_(self.p) - else: - torch.pow(input, self.p, out=self.buffer) - - torch.sum(self.buffer, 1, out=self.normp, keepdim=True).add_(self.eps) - torch.pow(self.normp, 1. / self.p, out=self.norm) - - torch.div(input, self.norm.view(-1, 1).expand_as(input), out=self._output) - - self.output = self._output.view(input_size) - return self.output - - def updateGradInput(self, input, gradOutput): - assert input.dim() == 2 - assert gradOutput.dim() == 2 - - input_size = input.size() - n = input.size(0) # batch size - d = input.size(1) # dimensionality of vectors - - if self._gradInput is None: - self._gradInput = input.new() - if self.cross is None: - self.cross = input.new() - # compute diagonal term with gradOutput - self._gradInput.resize_(n, d) - if self.p == inf: - # specialization for the inf case - torch.mul(self.norm.view(n, 1, 1).expand(n, d, 1), gradOutput, out=self._gradInput) - self.buffer.resize_as_(input).zero_() - self.cross.resize_(n, 1) - torch.gather(input, 1, self._indices, out=self.cross) - self.cross.div_(self.norm) - self.buffer.scatter_(1, self._indices, self.cross) - else: - torch.mul(self.normp.view(n, 1).expand(n, d), gradOutput, out=self._gradInput) - # small optimizations for different p - # buffer = input*|input|^(p-2) - # for non-even p, need to add absolute value - if self.p % 2 != 0: - if self.p < 2: - # add eps to avoid possible division by 0 - torch.abs(input, out=self.buffer).add_(self.eps).pow_(self.p - 2).mul_(input) - else: - torch.abs(input, out=self.buffer).pow_(self.p - 2).mul_(input) - # special case for p == 2, pow(x, 0) = 1 - elif self.p == 2: - self.buffer.copy_(input) - else: - # p is even and > 2, pow(x, p) is always positive - torch.pow(input, self.p - 2, out=self.buffer).mul_(input) - - # compute cross term in two steps - self.cross.resize_(n, 1) - - # instead of having a huge temporary matrix (b1*b2), - # do the computations as b1*(b2*gradOutput).
This avoids redundant - # computation and also a huge buffer of size n*d^2 - if self.buffer2 is None: - self.buffer2 = input.new() # nxd - torch.mul(input, gradOutput, out=self.buffer2) - torch.sum(self.buffer2, 1, out=self.cross, keepdim=True) - - self.buffer.mul_(self.cross.expand_as(self.buffer)) - self._gradInput.add_(-1, self.buffer) - - # reuse cross buffer for normalization - if self.p == inf: - torch.mul(self.norm, self.norm, out=self.cross) - else: - torch.mul(self.normp, self.norm, out=self.cross) - - self._gradInput.div_(self.cross.expand(n, d)) - - self.gradInput = self._gradInput.view(input_size) - return self.gradInput - - def __repr__(self): - return super(Normalize, self).__repr__() + '({})'.format(self.p) - - def type(self, type, tensorCache=None): - if not type: - return self._type - # torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor. - if type == 'torch.cuda.FloatTensor': - super(Normalize, self).type(type, tensorCache) - else: - # self._indices must be a LongTensor. Setting it to nil temporarily avoids - # unnecessary memory allocations. - indices, self._indices = self._indices, None - super(Normalize, self).type(type, tensorCache) - self._indices = indices.long() if indices else None - - return self - - def clearState(self): - clear(self, [ - '_output', - '_indices', - '_gradInput', - 'buffer', - 'norm', - 'normp', - 'cross', - ]) - return super(Normalize, self).clearState() diff --git a/torch/legacy/nn/PReLU.py b/torch/legacy/nn/PReLU.py deleted file mode 100644 index 59586769a929d1..00000000000000 --- a/torch/legacy/nn/PReLU.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class PReLU(Module): - - def __init__(self, nOutputPlane=0): - super(PReLU, self).__init__() - # if no argument provided, use shared model (weight is scalar) - self.nOutputPlane = nOutputPlane - self.weight = torch.Tensor(nOutputPlane or 1).fill_(0.25) - self.gradWeight = torch.Tensor(nOutputPlane or 1) - - def updateOutput(self, input): - self._backend.PReLU_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.PReLU_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight - ) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self._backend.PReLU_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.gradWeight, - scale - ) - return self.gradWeight - - def clearState(self): - clear(self, 'gradWeightBuf', 'gradWeightBuf2') - return super(PReLU, self).clearState() diff --git a/torch/legacy/nn/Padding.py b/torch/legacy/nn/Padding.py deleted file mode 100644 index 0c5cdf0b0f674a..00000000000000 --- a/torch/legacy/nn/Padding.py +++ /dev/null @@ -1,74 +0,0 @@ -import torch -from .Module import Module - - -class Padding(Module): - # pad puts in [pad] amount of [value] over dimension [dim], starting at - # index [index] in that dimension. If pad<0, index counts from the left. - # If pad>0 index counts from the right index = 1 pads before index 1. - # index = 2 pads starting before index 2 and after index 1 in dimension [dim] - # When nInputDim is provided, inputs larger than that value will be considered batches - # where the actual dim to be padded will be dimension dim + 1. 
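# A hedged worked example of the convention described above: on a 1-D input
# of size 3, Padding(0, 2, value=9) appends the padding (output
# [x0, x1, x2, 9, 9]), Padding(0, -2, value=9) prepends it (output
# [9, 9, x0, x1, x2]), and a nonzero index splices the padded run into the
# interior instead.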
- - def __init__(self, dim, pad, value=0, index=0, nInputDim=0): - self.value = value - self.index = index - self.dim = dim - self.pad = pad - self.nInputDim = nInputDim - self.outputSize = torch.Size() - super(Padding, self).__init__() - - def updateOutput(self, input): - dim = self.dim - if hasattr(self, "nInputDim") and self.nInputDim > 0 and input.dim() != self.nInputDim: - dim = dim + 1 - - outputSize = list(input.size()) - outputSize[dim] += abs(self.pad) - self.outputSize = torch.Size(outputSize) - - self.output.resize_(self.outputSize) - self.output.fill_(self.value) - index = self.index - pad = self.pad - if pad > 0: - index = input.size(dim) - index - else: - pad = -pad - - if index == 0: - self.output.narrow(dim, pad, input.size(dim)).copy_(input) - elif index == input.size(dim): - self.output.narrow(dim, 0, input.size(dim)).copy_(input) - else: - self.output.narrow(dim, 0, index).copy_(input.narrow(dim, 0, index)) - self.output.narrow(dim, index + pad, input.size(dim) - - index).copy_(input.narrow(dim, index, input.size(dim) - index)) - - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_as_(input) - dim = self.dim - - if hasattr(self, "nInputDim") and self.nInputDim > 0 and input.dim() != self.nInputDim: - dim = dim + 1 - - index = self.index - pad = self.pad - if pad > 0: - index = input.size(dim) - index - else: - pad = -pad - - if index == 0: - self.gradInput.copy_(gradOutput.narrow(dim, pad, input.size(dim))) - elif index == input.size(dim): - self.gradInput.copy_(gradOutput.narrow(dim, 0, input.size(dim))) - else: - self.gradInput.narrow(dim, 0, index).copy_(gradOutput.narrow(dim, 0, index)) - self.gradInput.narrow(dim, index, input.size( - dim) - index).copy_(gradOutput.narrow(dim, index + pad, input.size(dim) - index)) - - return self.gradInput diff --git a/torch/legacy/nn/PairwiseDistance.py b/torch/legacy/nn/PairwiseDistance.py deleted file mode 100644 index 98c6268a1812a5..00000000000000 --- a/torch/legacy/nn/PairwiseDistance.py +++ /dev/null @@ -1,83 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class PairwiseDistance(Module): - - def __init__(self, p): - super(PairwiseDistance, self).__init__() - assert p % 1 == 0 - self.gradInput = [] - self.diff = torch.Tensor() - self.norm = p - - self.outExpand = None - self.grad = None - self.ones = None - - def updateOutput(self, input): - self.output.resize_(1) - assert input[0].dim() == 2 - - if self.diff is None: - self.diff = input[0].new() - - torch.add(input[0], -1, input[1], out=self.diff).abs_() - - self.output.resize_(input[0].size(0)) - self.output.zero_() - self.output.add_(self.diff.pow_(self.norm).sum(1, keepdim=False)) - self.output.pow_(1. 
/ self.norm) - - return self.output - - def updateGradInput(self, input, gradOutput): - assert input[0].dim() == 2 - - if len(self.gradInput) != 2: - self.gradInput[:] = [None, None] - - if self.gradInput[0] is None: - self.gradInput[0] = input[0].new() - self.gradInput[0].resize_(input[0].size()) - if self.gradInput[1] is None: - self.gradInput[1] = input[1].new() - self.gradInput[1].resize_(input[1].size()) - self.gradInput[0].copy_(input[0]) - self.gradInput[0].add_(-1, input[1]) - - if self.norm == 1: - self.gradInput[0].sign_() - else: - # Note: derivative of p-norm: - # d/dx_k(||x||_p) = (x_k * abs(x_k)^(p-2)) / (||x||_p)^(p-1) - if self.norm > 2: - self.gradInput[0].mul_(self.gradInput[0].abs().pow_(self.norm - 2)) - - if self.outExpand is None: - self.outExpand = self.output.new() - self.outExpand.resize_(self.output.size(0), 1) - self.outExpand.copy_(self.output.view(self.output.size(0), 1)) - self.outExpand.add_(1e-6) # Prevent divide by zero errors - self.outExpand.pow_(-(self.norm - 1)) - self.gradInput[0].mul_(self.outExpand.expand(self.gradInput[0].size(0), - self.gradInput[0].size(1))) - - if self.grad is None: - self.grad = gradOutput.new() - if self.ones is None: - self.ones = gradOutput.new() - - self.grad.resize_as_(input[0]).zero_() - self.ones.resize_(input[0].size(1)).fill_(1) - - self.grad.addr_(gradOutput, self.ones) - self.gradInput[0].mul_(self.grad) - - self.gradInput[1].zero_().add_(-1, self.gradInput[0]) - return self.gradInput - - def clearState(self): - clear(self, 'diff', 'outExpand', 'grad', 'ones') - return super(PairwiseDistance, self).clearState() diff --git a/torch/legacy/nn/Parallel.py b/torch/legacy/nn/Parallel.py deleted file mode 100644 index 6db1c060e38733..00000000000000 --- a/torch/legacy/nn/Parallel.py +++ /dev/null @@ -1,105 +0,0 @@ -import torch -from .Container import Container - - -class Parallel(Container): - - def __init__(self, inputDimension, outputDimension): - super(Parallel, self).__init__() - self.inputDimension = inputDimension - self.outputDimension = outputDimension - self.totalOutputSize = None - - def updateOutput(self, input): - nModule = input.size(self.inputDimension) - outputs = [] - - for i in range(nModule): - currentInput = input.select(self.inputDimension, i) - currentOutput = self.modules[i].updateOutput(currentInput) - outputs.append(currentOutput) - outputSize = currentOutput.size(self.outputDimension) - - if i == 0: - totalOutputSize = list(currentOutput.size()) - else: - totalOutputSize[self.outputDimension] += outputSize - - self.totalOutputSize = torch.Size(totalOutputSize) - self.output.resize_(self.totalOutputSize) - - offset = 0 - for i in range(nModule): - currentOutput = outputs[i] - outputSize = currentOutput.size(self.outputDimension) - self.output.narrow(self.outputDimension, offset, outputSize).copy_(currentOutput) - offset = offset + currentOutput.size(self.outputDimension) - - return self.output - - def updateGradInput(self, input, gradOutput): - nModule = input.size(self.inputDimension) - self.gradInput.resize_as_(input) - - offset = 0 - for i in range(nModule): - module = self.modules[i] - currentInput = input.select(self.inputDimension, i) - currentOutput = module.output - outputSize = currentOutput.size(self.outputDimension) - currentGradOutput = gradOutput.narrow(self.outputDimension, offset, outputSize) - - currentGradInput = module.updateGradInput(currentInput, currentGradOutput) - - self.gradInput.select(self.inputDimension, i).copy_(currentGradInput) - offset = offset + outputSize - - return 
self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - nModule = input.size(self.inputDimension) - - offset = 0 - for i in range(nModule): - module = self.modules[i] - currentOutput = module.output - outputSize = currentOutput.size(self.outputDimension) - - module.accGradParameters( - input.select(self.inputDimension, i), - gradOutput.narrow(self.outputDimension, offset, outputSize), - scale) - offset += outputSize - - def accUpdateGradParameters(self, input, gradOutput, lr): - nModule = input.size(self.inputDimension) - - offset = 0 - for i in range(nModule): - module = self.modules[i] - currentOutput = module.output - module.accupdateGradParameters( - input.select(self.inputDimension, i), - gradOutput.narrow(self.outputDimension, offset, currentOutput.size(self.outputDimension)), - lr) - offset = offset + currentOutput.size(self.outputDimension) - - def __repr__(self): - tab = ' ' - line = '\n' - next = ' |`-> ' - ext = ' | ' - extlast = ' ' - last = ' ... -> ' - res = torch.typename(self) - res += ' {' + line + tab + 'input' - for i in range(len(self.modules)): - if i == len(self.modules) - 1: - res += line + tab + next + '(' + str(i) + '): ' + \ - str(self.modules[i]).replace(line, line + tab + extlast) - else: - res += line + tab + next + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab + ext) - - res += line + tab + last + 'output' - res += line + '}' - return res diff --git a/torch/legacy/nn/ParallelCriterion.py b/torch/legacy/nn/ParallelCriterion.py deleted file mode 100644 index 7ecfd95c6b540a..00000000000000 --- a/torch/legacy/nn/ParallelCriterion.py +++ /dev/null @@ -1,39 +0,0 @@ -import torch -from .Criterion import Criterion -from .utils import recursiveResizeAs, recursiveFill, recursiveAdd - - -class ParallelCriterion(Criterion): - - def __init__(self, repeatTarget=False): - super(ParallelCriterion, self).__init__() - self.criterions = [] - self.weights = [] - self.gradInput = [] - self.repeatTarget = repeatTarget - - def add(self, criterion, weight=1): - self.criterions.append(criterion) - self.weights.append(weight) - return self - - def updateOutput(self, input, target): - self.output = 0 - for i, criterion in enumerate(self.criterions): - current_target = target if self.repeatTarget else target[i] - self.output += self.weights[i] * criterion.updateOutput(input[i], current_target) - - return self.output - - def updateGradInput(self, input, target): - self.gradInput = recursiveResizeAs(self.gradInput, input)[0] - recursiveFill(self.gradInput, 0) - for i, criterion in enumerate(self.criterions): - current_target = target if self.repeatTarget else target[i] - recursiveAdd(self.gradInput[i], self.weights[i], criterion.updateGradInput(input[i], current_target)) - - return self.gradInput - - def type(self, type=None, tensorCache=None): - self.gradInput = [] - return super(ParallelCriterion, self).type(type, tensorCache) diff --git a/torch/legacy/nn/ParallelTable.py b/torch/legacy/nn/ParallelTable.py deleted file mode 100644 index 41912a6a1029a5..00000000000000 --- a/torch/legacy/nn/ParallelTable.py +++ /dev/null @@ -1,60 +0,0 @@ -import torch -from .Container import Container - - -class ParallelTable(Container): - - def __init__(self, ): - super(ParallelTable, self).__init__() - self.modules = [] - self.output = [] - self.gradInput = [] - - def updateOutput(self, input): - for i in range(len(self.modules)): - tmp = self.modules[i].updateOutput(input[i]) - if len(self.output) <= i: - self.output.append(tmp) - else: - self.output[i] = tmp 
- - return self.output - - def updateGradInput(self, input, gradOutput): - for i, module in enumerate(self.modules): - tmp = module.updateGradInput(input[i], gradOutput[i]) - if len(self.gradInput) <= i: - self.gradInput.append(tmp) - else: - self.gradInput[i] = tmp - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - for i, module in enumerate(self.modules): - module.accGradParameters(input[i], gradOutput[i], scale) - - def accUpdateGradParameters(self, input, gradOutput, lr=1): - for i, module in enumerate(self.modules): - module.accUpdateGradParameters(input[i], gradOutput[i], lr) - - def __repr__(self): - tab = ' ' - line = '\n' - next = ' |`-> ' - ext = ' | ' - extlast = ' ' - last = ' ... -> ' - res = torch.typename(self) - res = res + ' {' + line + tab + 'input' - for i in range(len(self.modules)): - if i == len(self.modules) - 1: - res = res + line + tab + next + '(' + str(i) + '): ' + \ - str(self.modules[i]).replace(line, line + tab + extlast) - else: - res = res + line + tab + next + '(' + str(i) + '): ' + \ - str(self.modules[i]).replace(line, line + tab + ext) - - res = res + line + tab + last + 'output' - res = res + line + '}' - return res diff --git a/torch/legacy/nn/PartialLinear.py b/torch/legacy/nn/PartialLinear.py deleted file mode 100644 index 66aa8116fd66bd..00000000000000 --- a/torch/legacy/nn/PartialLinear.py +++ /dev/null @@ -1,115 +0,0 @@ -import torch -from .Module import Module -from .Identity import Identity -from .LookupTable import LookupTable -from .Sequential import Sequential -from .ParallelTable import ParallelTable -from .MM import MM - - -class PartialLinear(Module): - """ - PartialLinear is a Linear layer that allows the user to set a collection of - column indices. When the column indices are set, the layer will behave like a - Linear layer that only has those columns. Meanwhile, all parameters are - preserved, so resetting the PartialLinear layer will result in a module that - behaves just like a regular Linear layer. - - This module is useful, for instance, when you want to do forward-backward on - only a subset of a Linear layer during training but use the full Linear layer - at test time. - """ - - def __init__(self, inputsize, outputsize, bias=True): - super(PartialLinear, self).__init__() - - # define the layer as a small network: - pt = ParallelTable() - pt.add(Identity()).add(LookupTable(outputsize, inputsize)) - self.network = Sequential().add(pt).add(MM(False, True)) - if bias: - self.bias = torch.zeros(1, outputsize) - self.gradBias = torch.zeros(1, outputsize) - else: - self.bias = self.gradBias = None - - # set partition: - self.inputsize = inputsize - self.outputsize = outputsize - self.allcolumns = torch.arange(0, self.outputsize).long() - self.resetPartition() - self.addBuffer = None - self.buffer = None - - def setPartition(self, indices): - self.partition = indices.type(self.allcolumns.type()) - return self - - def resetPartition(self): - self.partition = self.allcolumns - return self - - def parameters(self): - return [self.network.get(0).get(1).weight, self.bias], \ - [self.network.get(0).get(1).gradWeight, self.gradBias] - # should return only the relevant partition?
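# A short usage sketch, hedged in that it relies only on the legacy API
# defined in this file:
#
#   layer = PartialLinear(5, 10)                     # 5 inputs, 10 outputs
#   layer.setPartition(torch.LongTensor([0, 3, 7]))  # expose 3 of 10 columns
#   out = layer.forward(torch.randn(4, 5))           # out has shape (4, 3)
#   layer.resetPartition()                           # back to the full layer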
- - def updateOutput(self, input): - self.output.set_(self.network.forward([input, self.partition])) - if self.bias is not None: - self.output.add_(torch.index_select(self.bias, 1, self.partition).expand_as(self.output)) - if self.addBuffer is None: - self.addBuffer = input.new() - if self.addBuffer.nelement() != input.size(0): - self.addBuffer.resize_(input.size(0)).fill_(1) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is not None: - self.network.updateGradInput([input, self.partition], gradOutput) - self.gradInput.set_(self.network.gradInput[0]) - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self.network.accGradParameters([input, self.partition], gradOutput, scale) - if self.bias is not None: - if self.buffer is None: - self.buffer = input.new() - self.buffer.resize_(gradOutput.size(1)) - torch.mv(gradOutput.t(), self.addBuffer, out=self.buffer).mul_(scale) - self.gradBias.index_add_( - 1, self.partition, self.buffer.view(1, self.buffer.nelement()) - ) - - def accUpdateGradParameters(self, input, gradOutput, lr): - gradWeight = self.network.get(0).get(1).gradWeight - gradBias = self.gradBias - self.network.get(0).get(1).gradWeight = self.network.get(0).get(1).weight - self.gradBias = self.bias - self.accGradParameters(input, gradOutput, -lr) - self.network.get(0).get(1).gradWeight = gradWeight - self.gradBias = gradBias - - def zeroGradParameters(self): - self.network.zeroGradParameters() - self.gradBias.zero_() - - def updateParameters(self, learningRate): - self.network.updateParameters(learningRate) - self.bias._add(-learningRate, self.gradBias) - - def type(self, type=None, tensorCache=None): - result = super(PartialLinear, self).type(type, tensorCache) - self.partition = self.partition.long() - self.allcolumns = self.allcolumns.long() - if type == 'torch.cuda.FloatTensor': - self.allcolumns = self.allcolumns.cuda() - self.partition = self.partition.cuda() - return result - - def __repr__(self): - return super(ParallelTable, self).__repr__() + \ - '({} -> {})'.format(self.inputsize, self.outputsize) + \ - ' without bias' if self.bias is None else '' diff --git a/torch/legacy/nn/Power.py b/torch/legacy/nn/Power.py deleted file mode 100644 index 20b23baefd64b0..00000000000000 --- a/torch/legacy/nn/Power.py +++ /dev/null @@ -1,20 +0,0 @@ -import torch -from .Module import Module - - -class Power(Module): - - def __init__(self, p): - super(Power, self).__init__() - self.pow = p - - def updateOutput(self, input): - self.output.resize_as_(input).copy_(input) - self.output.pow_(self.pow) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_as_(input).copy_(input) - self.gradInput.pow_(self.pow - 1) - self.gradInput.mul_(gradOutput).mul_(self.pow) - return self.gradInput diff --git a/torch/legacy/nn/RReLU.py b/torch/legacy/nn/RReLU.py deleted file mode 100644 index 237d927da7b690..00000000000000 --- a/torch/legacy/nn/RReLU.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class RReLU(Module): - - def __init__(self, lower=1. / 8, upper=1. 
/ 3, inplace=False): - super(RReLU, self).__init__() - self.lower = lower - self.upper = upper - self.inplace = inplace - - assert self.lower <= self.upper and self.lower >= 0 and self.upper >= 0 - self.noise = torch.Tensor() - self.train = True - - def updateOutput(self, input): - self._backend.RReLU_updateOutput( - self._backend.library_state, - input, - self.output, - self.noise, - self.lower, - self.upper, - self.train, - self.inplace, - torch.default_generator if not input.is_cuda else 0 - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.RReLU_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.noise, - self.lower, - self.upper, - self.train, - self.inplace - ) - return self.gradInput - - def __repr__(self): - return super(RReLU, self).__repr__() + '({:.4f}, {:.4f})'.format(self.lower, self.upper) - - def clearState(self): - clear(self, 'noise') - return super(RReLU, self).clearState() diff --git a/torch/legacy/nn/ReLU.py b/torch/legacy/nn/ReLU.py deleted file mode 100644 index 2674f47cf9c4a4..00000000000000 --- a/torch/legacy/nn/ReLU.py +++ /dev/null @@ -1,8 +0,0 @@ -import torch -from .Threshold import Threshold - - -class ReLU(Threshold): - - def __init__(self, inplace=False): - super(ReLU, self).__init__(0, 0, inplace) diff --git a/torch/legacy/nn/ReLU6.py b/torch/legacy/nn/ReLU6.py deleted file mode 100644 index cb8b59d2b5c7eb..00000000000000 --- a/torch/legacy/nn/ReLU6.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch -from .Module import Module - - -class ReLU6(Module): - - def __init__(self, inplace=False): - super(ReLU6, self).__init__() - self.inplace = inplace - - def updateOutput(self, input): - self._backend.HardTanh_updateOutput( - self._backend.library_state, - input, - self.output, - 0, 6, self.inplace - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.HardTanh_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - 0, 6, self.inplace - ) - return self.gradInput diff --git a/torch/legacy/nn/Replicate.py b/torch/legacy/nn/Replicate.py deleted file mode 100644 index 4eed0b52b6b041..00000000000000 --- a/torch/legacy/nn/Replicate.py +++ /dev/null @@ -1,33 +0,0 @@ -import torch -from .Module import Module - - -class Replicate(Module): - - def __init__(self, nf, dim=0): - super(Replicate, self).__init__() - self.nfeatures = nf - self.dim = dim - assert self.dim >= 0 - - def updateOutput(self, input): - assert self.dim < input.dim() - - size = list(input.size()) - size.insert(self.dim, self.nfeatures) - - stride = list(input.stride()) - stride.insert(self.dim, 0) - - self.output.set_(input.storage(), input.storage_offset(), - torch.Size(size), tuple(stride)) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_as_(input).zero_() - size = list(input.size()) - size.insert(self.dim, 1) - - gradInput = self.gradInput.view(*size) - torch.sum(gradOutput, self.dim, True, out=gradInput) - return self.gradInput diff --git a/torch/legacy/nn/Reshape.py b/torch/legacy/nn/Reshape.py deleted file mode 100644 index 23d5ad9b8e7043..00000000000000 --- a/torch/legacy/nn/Reshape.py +++ /dev/null @@ -1,53 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class Reshape(Module): - - def __init__(self, *args): - super(Reshape, self).__init__() - - if len(args) == 1 and isinstance(args[0], torch.Size): - self.size = args[0] - else: - self.size = torch.Size(args) - -
self.nelement = 1 - for s in self.size: - self.nelement *= s - - self._input = None - self._gradOutput = None - - def updateOutput(self, input): - if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input) - self._input.copy_(input) - input = self._input - - batchsize = [input.size(0)] + list(self.size) - self.output = input.view(torch.Size(batchsize)) - - return self.output - - def updateGradInput(self, input, gradOutput): - if not gradOutput.is_contiguous(): - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput) - self._gradOutput.copy_(gradOutput) - gradOutput = self._gradOutput - - self.gradInput = gradOutput.view_as(input) - return self.gradInput - - def __repr__(self): - return super(Reshape, self).__repr__() + \ - '({})'.format('x'.join(map(lambda x: str(x), self.size))) - - def clearState(self): - clear(self, '_input', '_gradOutput') - return super(Reshape, self).clearState() diff --git a/torch/legacy/nn/Select.py b/torch/legacy/nn/Select.py deleted file mode 100644 index 287cb000e4affd..00000000000000 --- a/torch/legacy/nn/Select.py +++ /dev/null @@ -1,23 +0,0 @@ -import torch -from .Module import Module - - -class Select(Module): - - def __init__(self, dimension, index): - super(Select, self).__init__() - self.dimension = dimension - self.index = index - - def updateOutput(self, input): - index = self.index if self.index >= 0 else input.size(self.dimension) + self.index - output = input.select(self.dimension, index) - self.output.resize_as_(output) - return self.output.copy_(output) - - def updateGradInput(self, input, gradOutput): - index = self.index if self.index >= 0 else input.size(self.dimension) + self.index - self.gradInput.resize_as_(input) - self.gradInput.zero_() - self.gradInput.select(self.dimension, index).copy_(gradOutput) - return self.gradInput diff --git a/torch/legacy/nn/SelectTable.py b/torch/legacy/nn/SelectTable.py deleted file mode 100644 index 9150c44c942f8b..00000000000000 --- a/torch/legacy/nn/SelectTable.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch -from .Module import Module -from .utils import recursiveCopy, clear - - -class SelectTable(Module): - - def __init__(self, index): - super(SelectTable, self).__init__() - self.index = index - self.gradInput = [] - - def updateOutput(self, input): - # handle negative indices - index = self.index if self.index >= 0 else input.size(self.dimension) + self.index - assert len(input) > index - self.output = input[index] - return self.output - - def _zeroTableCopy(self, l1, l2): - for i, v in enumerate(l2): - if isinstance(v, list): - if len(l1) > i: - l1[i] = self._zeroTableCopy(l1[i], l2[i]) - else: - l1.append(self._zeroTableCopy([], l2[i])) - else: - if i >= len(l1): - l1.append(v.new().resize_as_(v).zero_()) - else: - l1[i].resize_as_(v) - l1[i].zero_() - del l1[len(l2):] - return l1 - - def updateGradInput(self, input, gradOutput): - # make gradInput a zeroed copy of input - self._zeroTableCopy(self.gradInput, input) - # handle negative indices - index = self.index if self.index >= 0 else input.size(self.dimension) + self.index - # copy into gradInput[index] (necessary for variable sized inputs) - assert self.gradInput[index] is not None - recursiveCopy(self.gradInput[index], gradOutput) - return self.gradInput - - def type(self, type, tensorCache=None): - del self.gradInput[:] - if isinstance(self.output, list): - del self.output[:] - return super(SelectTable, self).type(type, tensorCache) - - 
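# A brief sketch of the contract above, hedged: forward returns the selected
# table entry, and backward rebuilds a full-size gradInput that is zero
# everywhere except at that entry.
#
#   st = SelectTable(1)
#   a, b = torch.randn(2), torch.randn(3)
#   out = st.forward([a, b])                        # out is b
#   gi = st.updateGradInput([a, b], torch.ones(3))
#   # gi[0] is zeros(2); gi[1] is ones(3)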
def __repr__(self): - return super(SelectTable, self).__repr__() + '({})'.format(self.index) - - def clearState(self): - clear(self, 'gradInput') diff --git a/torch/legacy/nn/Sequential.py b/torch/legacy/nn/Sequential.py deleted file mode 100644 index e3c4a0034ae2fa..00000000000000 --- a/torch/legacy/nn/Sequential.py +++ /dev/null @@ -1,86 +0,0 @@ -import torch -from .Container import Container - - -class Sequential(Container): - - def __len__(self): - return len(self.modules) - - def add(self, module): - if len(self.modules) == 0: - self.gradInput = module.gradInput - - self.modules.append(module) - self.output = module.output - return self - - def insert(self, module, index): - self.modules.insert(module, index) - self.output = self.modules[-1].output - self.gradInput = self.modules[0].gradInput - - def remove(self, index=-1): - del self.modules[index] - - if len(self.modules) > 0: - self.output = self.modules[-1].output - self.gradInput = self.modules[0].gradInput - else: - self.output = torch.Tensor() - self.gradInput = torch.Tensor() - - def updateOutput(self, input): - currentOutput = input - for i, module in enumerate(self.modules): - currentOutput = module.updateOutput(currentOutput) - self.output = currentOutput - return self.output - - def _iter_with_prev(self): - return zip(self.modules[-2::-1], self.modules[-1:0:-1]) - - def updateGradInput(self, input, gradOutput): - currentGradOutput = gradOutput - for prev, current in self._iter_with_prev(): - currentGradOutput = current.updateGradInput(prev.output, currentGradOutput) - self.gradInput = self.modules[0].updateGradInput(input, currentGradOutput) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - currentGradOutput = gradOutput - for prev, current in self._iter_with_prev(): - current.accGradParameters(prev.output, currentGradOutput, scale) - currentGradOutput = current.gradInput - self.modules[0].accGradParameters(input, currentGradOutput, scale) - - def backward(self, input, gradOutput, scale=1): - currentGradOutput = gradOutput - for prev, current in self._iter_with_prev(): - currentGradOutput = current.backward(prev.output, currentGradOutput, scale) - # currentModule.gradInput = currentGradOutput - self.gradInput = self.modules[0].backward(input, currentGradOutput, scale) - return self.gradInput - - def accUpdateGradParameters(self, input, gradOutput, lr): - currentGradOutput = gradOutput - for prev, current in self._iter_with_prev(): - current.accUpdateGradParameters(prev.output, currentGradOutput, lr) - currentGradOutput = current.gradInput - self.modules[0].accUpdateGradParameters(input, currentGradOutput, lr) - - def __repr__(self): - tab = ' ' - line = '\n' - next = ' -> ' - res = 'nn.Sequential' - res = res + ' {' + line + tab + '[input' - for i in range(len(self.modules)): - res = res + next + '(' + str(i) + ')' - - res = res + next + 'output]' - for i in range(len(self.modules)): - res = res + line + tab + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab) - - res = res + line + '}' - return res diff --git a/torch/legacy/nn/Sigmoid.py b/torch/legacy/nn/Sigmoid.py deleted file mode 100644 index 47b42ddc40dc1f..00000000000000 --- a/torch/legacy/nn/Sigmoid.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from .Module import Module - - -class Sigmoid(Module): - - def updateOutput(self, input): - self._backend.Sigmoid_updateOutput( - self._backend.library_state, - input, - self.output - ) - return self.output - - def updateGradInput(self, input, gradOutput): - 
self._backend.Sigmoid_updateGradInput( - self._backend.library_state, - gradOutput, - self.gradInput, - self.output - ) - return self.gradInput diff --git a/torch/legacy/nn/SmoothL1Criterion.py b/torch/legacy/nn/SmoothL1Criterion.py deleted file mode 100644 index 714d0b6ed0fe0b..00000000000000 --- a/torch/legacy/nn/SmoothL1Criterion.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class SmoothL1Criterion(Criterion): - - def __init__(self, sizeAverage=True): - super(SmoothL1Criterion, self).__init__() - self.sizeAverage = sizeAverage - self.output_tensor = None - - def updateOutput(self, input, target): - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.SmoothL1Criterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - implicit_gradOutput = torch.ones(1).type_as(input) - self._backend.SmoothL1Criterion_updateGradInput( - self._backend.library_state, - input, - target, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - return self.gradInput diff --git a/torch/legacy/nn/SoftMarginCriterion.py b/torch/legacy/nn/SoftMarginCriterion.py deleted file mode 100644 index 4bfa37173ce013..00000000000000 --- a/torch/legacy/nn/SoftMarginCriterion.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class SoftMarginCriterion(Criterion): - - def __init__(self, ): - super(SoftMarginCriterion, self).__init__() - self.sizeAverage = True - self.output_tensor = None - - def updateOutput(self, input, target): - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.SoftMarginCriterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - implicit_gradOutput = torch.ones(1).type_as(input) - self._backend.SoftMarginCriterion_updateGradInput( - self._backend.library_state, - input, - target, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - return self.gradInput diff --git a/torch/legacy/nn/SoftMax.py b/torch/legacy/nn/SoftMax.py deleted file mode 100644 index 9a2e2c7d94b05c..00000000000000 --- a/torch/legacy/nn/SoftMax.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -from .Module import Module - - -class SoftMax(Module): - - def __init__(self, dim=None): - super(SoftMax, self).__init__() - if dim is not None: - self.dim = dim - - def _get_dim(self, input): - return getattr(self, 'dim', 0 if input.dim() == 1 or input.dim() == 3 else 1) - - def updateOutput(self, input): - self.output = torch.softmax(input, self._get_dim(input)) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput = torch.softmax_backward_data( - gradOutput, - self.output, - self._get_dim(input), - input) - return self.gradInput diff --git a/torch/legacy/nn/SoftMin.py b/torch/legacy/nn/SoftMin.py deleted file mode 100644 index f85f30d4691e5d..00000000000000 --- a/torch/legacy/nn/SoftMin.py +++ /dev/null @@ -1,43 
+0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class SoftMin(Module): - - def __init__(self, dim=None): - super(SoftMin, self).__init__() - self.mininput = None - if dim is not None: - self.dim = dim - - def _get_dim(self, input): - return getattr(self, 'dim', 0 if input.dim() == 1 or input.dim() == 3 else 1) - - def updateOutput(self, input): - if self.mininput is None: - self.mininput = input.new() - self.mininput.resize_as_(input).copy_(input).mul_(-1) - self.output = torch.softmax( - self.mininput, - self._get_dim(input) - ) - return self.output - - def updateGradInput(self, input, gradOutput): - if self.mininput is None: - self.mininput = input.new() - self.mininput.resize_as_(input).copy_(input).mul_(-1) - self.gradInput = torch.softmax_backward_data( - gradOutput, - self.output, - self._get_dim(input), - self.mininput - ) - - self.gradInput.mul_(-1) - return self.gradInput - - def clearState(self): - clear(self, 'mininput') - return super(SoftMin, self).clearState() diff --git a/torch/legacy/nn/SoftPlus.py b/torch/legacy/nn/SoftPlus.py deleted file mode 100644 index 062600d698e74f..00000000000000 --- a/torch/legacy/nn/SoftPlus.py +++ /dev/null @@ -1,38 +0,0 @@ -import torch -from .Module import Module - - -class SoftPlus(Module): - - def __init__(self, beta=1, threshold=20): - super(SoftPlus, self).__init__() - self.beta = beta # Beta controls sharpness of transfer function - self.threshold = threshold # Avoid floating point issues with exp(x), x>20 - - def updateOutput(self, input): - # f(x) = 1/beta * log(1 + exp(beta * x)) - self._backend.SoftPlus_updateOutput( - self._backend.library_state, - input, - self.output, - self.beta, - self.threshold - ) - return self.output - - def updateGradInput(self, input, gradOutput): - # d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1) - # SINCE - # y = (1/k)*log(1+exp(k*x)) #> x = (1/k)*log(exp(k*y)-1) - # THEREFORE: - # d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y) - self._backend.SoftPlus_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.output, - self.beta, - self.threshold - ) - return self.gradInput diff --git a/torch/legacy/nn/SoftShrink.py b/torch/legacy/nn/SoftShrink.py deleted file mode 100644 index a3ac3166507058..00000000000000 --- a/torch/legacy/nn/SoftShrink.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch -from .Module import Module - - -class SoftShrink(Module): - - def __init__(self, lambd=0.5): - super(SoftShrink, self).__init__() - self.lambd = lambd - - def updateOutput(self, input): - self._backend.SoftShrink_updateOutput( - self._backend.library_state, - input, - self.output, - self.lambd - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.SoftShrink_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.lambd - ) - return self.gradInput diff --git a/torch/legacy/nn/SoftSign.py b/torch/legacy/nn/SoftSign.py deleted file mode 100644 index 9aa58c1f7be26c..00000000000000 --- a/torch/legacy/nn/SoftSign.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class SoftSign(Module): - - def __init__(self): - super(SoftSign, self).__init__() - self.temp = None - self.tempgrad = None - - def updateOutput(self, input): - if self.temp is None: - self.temp = input.new() - self.temp.resize_as_(input).copy_(input).abs_().add_(1) - self.output.resize_as_(input).copy_(input).div_(self.temp) - return self.output - - def 
updateGradInput(self, input, gradOutput): - if self.tempgrad is None: - self.tempgrad = input.new() - self.tempgrad.resize_as_(self.output).copy_(input).abs_().add_(1).mul_(self.tempgrad) - self.gradInput.resize_as_(input).copy_(gradOutput).div_(self.tempgrad) - return self.gradInput - - def clearState(self): - clear(self, 'temp', 'tempgrad') - return super(SoftSign, self).clearState() diff --git a/torch/legacy/nn/SpatialAdaptiveMaxPooling.py b/torch/legacy/nn/SpatialAdaptiveMaxPooling.py deleted file mode 100644 index b8ed87492cf3ff..00000000000000 --- a/torch/legacy/nn/SpatialAdaptiveMaxPooling.py +++ /dev/null @@ -1,40 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class SpatialAdaptiveMaxPooling(Module): - - def __init__(self, w, h): - super(SpatialAdaptiveMaxPooling, self).__init__() - self.w = w - self.h = h - self.indices = None - - def updateOutput(self, input): - if self.indices is None: - self.indices = input.new() - self.indices = self.indices.long() - self._backend.SpatialAdaptiveMaxPooling_updateOutput( - self._backend.library_state, - input, - self.output, - self.indices, - self.w, - self.h - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.SpatialAdaptiveMaxPooling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.indices - ) - return self.gradInput - - def clearState(self): - clear(self, 'indices') - return super(SpatialAdaptiveMaxPooling, self).clearState() diff --git a/torch/legacy/nn/SpatialAveragePooling.py b/torch/legacy/nn/SpatialAveragePooling.py deleted file mode 100644 index acf4c640830ed5..00000000000000 --- a/torch/legacy/nn/SpatialAveragePooling.py +++ /dev/null @@ -1,79 +0,0 @@ -import torch -from .Module import Module - - -class SpatialAveragePooling(Module): - - def __init__(self, kW, kH, dW=1, dH=1, padW=0, padH=0): - super(SpatialAveragePooling, self).__init__() - - self.kW = kW - self.kH = kH - self.dW = dW - self.dH = dH - self.padW = padW - self.padH = padH - self.ceil_mode = False - self.count_include_pad = True - self.divide = True - - def ceil(self): - self.ceil_mode = True - return self - - def floor(self): - self.ceil_mode = False - return self - - def setCountIncludePad(self): - self.count_include_pad = True - return self - - def setCountExcludePad(self): - self.count_include_pad = False - return self - - def updateOutput(self, input): - self._backend.SpatialAveragePooling_updateOutput( - self._backend.library_state, - input, - self.output, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.ceil_mode, - self.count_include_pad - ) - # for backward compatibility with saved models - # which are not supposed to have "divide" field - if not self.divide: - self.output.mul_(self.kW * self.kH) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is not None: - self._backend.SpatialAveragePooling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.ceil_mode, - self.count_include_pad - ) - # for backward compatibility - if not self.divide: - self.gradInput.mul_(self.kW * self.kH) - - return self.gradInput - - def __repr__(self): - s = super(SpatialAveragePooling, self).__repr__() - s += '({}x{}, {}, {}'.format(self.kW, self.kH, self.dW, self.dH) - if (self.padW or self.padH) and (self.padW != 0 or self.padH != 0): - s += ', {}, {}'.format(self.padW, self.padH) - s += ')' - 
return s diff --git a/torch/legacy/nn/SpatialBatchNormalization.py b/torch/legacy/nn/SpatialBatchNormalization.py deleted file mode 100644 index 725ebfffc66460..00000000000000 --- a/torch/legacy/nn/SpatialBatchNormalization.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -from .BatchNormalization import BatchNormalization - - -class SpatialBatchNormalization(BatchNormalization): - """ - This class implements Batch Normalization as described in the paper: - "Batch Normalization: Accelerating Deep Network Training - by Reducing Internal Covariate Shift" - by Sergey Ioffe, Christian Szegedy - - This implementation is useful for inputs coming from convolution layers. - For non-convolutional layers, see BatchNormalization.lua - - The operation implemented is: - (x - mean(x)) - y = --------------------- * gamma + beta - standard-deviation(x) - where gamma and beta are learnable parameters. - - The learning of gamma and beta is optional. - - Usage: - with learnable parameters: nn.SpatialBatchNormalization(N [, eps] [, momentum]) - where N = dimensionality of input - without learnable parameters: nn.SpatialBatchNormalization(N [, eps] [, momentum], False) - - eps is a small value added to the variance to avoid divide-by-zero. - Defaults to 1e-5 - - In training time, this layer keeps a running estimate of it's computed mean and std. - The running sum is kept with a default momentum of 0.1 (unless over-ridden) - In test time, this running mean/std is used to normalize. - """ - - # expected dimension of input - nDim = 4 diff --git a/torch/legacy/nn/SpatialClassNLLCriterion.py b/torch/legacy/nn/SpatialClassNLLCriterion.py deleted file mode 100644 index 8a7e15c8298149..00000000000000 --- a/torch/legacy/nn/SpatialClassNLLCriterion.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class SpatialClassNLLCriterion(Criterion): - - def __init__(self, weights=None, sizeAverage=True, ignore_index=-100): - assert weights is None or weights.dim() == 1 - super(SpatialClassNLLCriterion, self).__init__() - self.sizeAverage = sizeAverage - self.weights = weights - self.ignore_index = ignore_index - - self.output_tensor = torch.zeros(1) - self.total_weight_tensor = torch.ones(1) - - def updateOutput(self, input, target): - if not hasattr(self, 'ignore_index'): - self.ignore_index = -100 - self._backend.SpatialClassNLLCriterion_updateOutput( - self._backend.library_state, - input, - target, - self.output_tensor, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - self.weights, - self.total_weight_tensor, - self.ignore_index, - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - self.gradInput.resize_as_(input).zero_() - implicit_gradOutput = torch.ones(1).type_as(input) - self._backend.SpatialClassNLLCriterion_updateGradInput( - self._backend.library_state, - input, - target, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - self.weights, - self.total_weight_tensor, - self.ignore_index, - ) - return self.gradInput diff --git a/torch/legacy/nn/SpatialContrastiveNormalization.py b/torch/legacy/nn/SpatialContrastiveNormalization.py deleted file mode 100644 index a47f94667e71ab..00000000000000 --- a/torch/legacy/nn/SpatialContrastiveNormalization.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch -from .Module import Module -from .Sequential import Sequential -from 
.SpatialSubtractiveNormalization import SpatialSubtractiveNormalization -from .SpatialDivisiveNormalization import SpatialDivisiveNormalization - - -class SpatialContrastiveNormalization(Module): - - def __init__(self, nInputPlane=1, kernel=None, threshold=1e-4, thresval=1e-4): - super(SpatialContrastiveNormalization, self).__init__() - - # get args - self.nInputPlane = nInputPlane - if kernel is None: - self.kernel = torch.Tensor(9, 9).fill_(1) - else: - self.kernel = kernel - self.threshold = threshold - self.thresval = thresval or threshold - kdim = self.kernel.ndimension() - - # check args - if kdim != 2 and kdim != 1: - raise ValueError('SpatialContrastiveNormalization averaging kernel must be 2D or 1D') - - if self.kernel.size(0) % 2 == 0 or (kdim == 2 and (self.kernel.size(1) % 2) == 0): - raise ValueError('SpatialContrastiveNormalization averaging kernel must have ODD dimensions') - - # instantiate sub+div normalization - self.normalizer = Sequential() - self.normalizer.add(SpatialSubtractiveNormalization(self.nInputPlane, self.kernel)) - self.normalizer.add(SpatialDivisiveNormalization(self.nInputPlane, self.kernel, - self.threshold, self.thresval)) - - def updateOutput(self, input): - self.output = self.normalizer.forward(input) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput = self.normalizer.backward(input, gradOutput) - return self.gradInput diff --git a/torch/legacy/nn/SpatialConvolution.py b/torch/legacy/nn/SpatialConvolution.py deleted file mode 100644 index d5d81631284743..00000000000000 --- a/torch/legacy/nn/SpatialConvolution.py +++ /dev/null @@ -1,165 +0,0 @@ -import math -import torch -from .Module import Module -from .utils import clear - - -class SpatialConvolution(Module): - - def __init__(self, nInputPlane, nOutputPlane, kW, kH, dW=1, dH=1, padW=0, padH=None): - super(SpatialConvolution, self).__init__() - - self.nInputPlane = nInputPlane - self.nOutputPlane = nOutputPlane - self.kW = kW - self.kH = kH - - self.dW = dW - self.dH = dH - self.padW = padW - self.padH = padH if padH is not None else padW - - self.weight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) - self.bias = torch.Tensor(nOutputPlane) - self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW) - self.gradBias = torch.Tensor(nOutputPlane) - - self.reset() - self._input = None - self._gradOutput = None - self.finput = None - self.fgradInput = None - - def noBias(self): - self.bias = None - self.gradBias = None - return self - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. 
/ math.sqrt(self.kW * self.kH * self.nInputPlane) - - self.weight.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.uniform_(-stdv, stdv) - - def _makeContiguous(self, input, gradOutput=None): - if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input - - if gradOutput is not None: - if not gradOutput.is_contiguous(): - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - gradOutput = self._gradOutput - return input, gradOutput - - return input - - def _init(self): - if self.finput is None: - self.finput = self.weight.new() - if self.fgradInput is None: - self.fgradInput = self.weight.new() - - # function to re-view the weight layout in a way that would make the MM ops happy - def _viewWeight(self): - self.weight = self.weight.view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW) - if self.gradWeight is not None and self.gradWeight.dim() > 0: - self.gradWeight = self.gradWeight.view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW) - - def _unviewWeight(self): - self.weight = self.weight.view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW) - if self.gradWeight is not None and self.gradWeight.dim() > 0: - self.gradWeight = self.gradWeight.view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW) - - def updateOutput(self, input): - self._init() - self._viewWeight() - input = self._makeContiguous(input) - self._backend.SpatialConvolutionMM_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH - ) - self._unviewWeight() - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - self._init() - self._viewWeight() - input, gradOutput = self._makeContiguous(input, gradOutput) - self._backend.SpatialConvolutionMM_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH - ) - self._unviewWeight() - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self._init() - input, gradOutput = self._makeContiguous(input, gradOutput) - self._viewWeight() - self._backend.SpatialConvolutionMM_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - scale - ) - self._unviewWeight() - - def type(self, type=None, tensorCache={}): - if self.finput is not None: - self.finput = torch.Tensor() - if self.fgradInput is not None: - self.fgradInput = torch.Tensor() - return super(SpatialConvolution, self).type(type, tensorCache) - - def __repr__(self): - s = super(SpatialConvolution, self).__repr__() - s += '({} -> {}, {}x{}'.format(self.nInputPlane, self.nOutputPlane, self.kW, self.kH) - if self.dW != 1 or self.dH != 1 or self.padW != 0 or self.padH != 0: - s += ', {}, {}'.format(self.dW, self.dH) - - if self.padW != 0 or self.padH != 0: - s += ', {}, {}'.format(self.padW, self.padH) - - s += ')' - if self.bias is None: - s += ' without bias' - return s - - def clearState(self): - clear(self, 'finput', 'fgradInput', '_input', '_gradOutput') - return super(SpatialConvolution, self).clearState() diff --git 
a/torch/legacy/nn/SpatialConvolutionLocal.py b/torch/legacy/nn/SpatialConvolutionLocal.py deleted file mode 100644 index 0e0cbafde8f7b1..00000000000000 --- a/torch/legacy/nn/SpatialConvolutionLocal.py +++ /dev/null @@ -1,202 +0,0 @@ -import math -import torch -from .Module import Module -from .utils import clear - - -class SpatialConvolutionLocal(Module): - - def __init__(self, nInputPlane, nOutputPlane, iW, iH, kW, kH, dW=1, dH=1, padW=0, padH=None): - super(SpatialConvolutionLocal, self).__init__() - - self.nInputPlane = nInputPlane - self.nOutputPlane = nOutputPlane - self.kW = kW - self.kH = kH - self.iW = iW - self.iH = iH - - self.dW = dW - self.dH = dH - self.padW = padW - self.padH = padH if padH is not None else padW - self.oW = int(math.floor((self.padW * 2 + iW - self.kW) / self.dW)) + 1 - self.oH = int(math.floor((self.padH * 2 + iH - self.kH) / self.dH)) + 1 - assert 1 <= self.oW and 1 <= self.oH - - self.weight = torch.Tensor(self.oH, self.oW, nOutputPlane, nInputPlane, kH, kW) - self.bias = torch.Tensor(nOutputPlane, self.oH, self.oW) - self.gradWeight = torch.Tensor().resize_as_(self.weight) - self.gradBias = torch.Tensor().resize_as_(self.bias) - - self.reset() - self.finput = None - self.fgradInput = None - self._input = None - self._gradOutput = None - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. / math.sqrt(self.kW * self.kH * self.nInputPlane) - - self.weight.uniform_(-stdv, stdv) - self.bias.uniform_(-stdv, stdv) - - def _makeContiguous(self, input, gradOutput=None): - if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input - - if gradOutput is not None: - if not gradOutput.is_contiguous(): - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - gradOutput = self._gradOutput - return input, gradOutput - - return input - - def _viewWeight(self): - self.weight = self.weight.view(self.oH * self.oW, self.nOutputPlane, self.nInputPlane * self.kH * self.kW) - if self.gradWeight is not None and self.gradWeight.dim() > 0: - self.gradWeight = self.gradWeight.view( - self.oH * self.oW, self.nOutputPlane, self.nInputPlane * self.kH * self.kW) - - def _unviewWeight(self): - self.weight = self.weight.view(self.oH, self.oW, self.nOutputPlane, self.nInputPlane, self.kH, self.kW) - if self.gradWeight is not None and self.gradWeight.dim() > 0: - self.gradWeight = self.gradWeight.view( - self.oH, self.oW, self.nOutputPlane, self.nInputPlane, self.kH, self.kW) - - def _checkInputSize(self, input): - if input.ndimension() == 3: - if input.size(0) != self.nInputPlane or input.size(1) != self.iH or input.size(1) != self.iW: - raise RuntimeError( - 'Given input size: ({}x{}x{}) inconsistent with expected input size: ({}x{}x{}).'.format( - input.size(0), input.size(1), input.size(2), self.nInputPlane, self.iH, self.iW)) - elif input.ndimension() == 4: - if input.size(1) != self.nInputPlane or input.size(2) != self.iH or input.size(3) != self.iW: - raise RuntimeError( - 'Given input size: ({}x{}x{}x{}) inconsistent with expected input size: (*x{}x{}x{}).'.format( - input.size(0), input.size(1), input.size(2), input.size(3), self.nInputPlane, self.iH, self.iW)) - else: - raise RuntimeError('3D or 4D (batch mode) tensor expected') - - def _checkOutputSize(self, input, output): - if output.ndimension() != input.ndimension(): - raise RuntimeError('inconsistent dimension 
between output and input.') - - if output.ndimension() == 3: - if output.size(0) != self.nOutputPlane or output.size(1) != self.oH or output.size(2) != self.oW: - raise RuntimeError( - 'Given output size: ({}x{}x{}) inconsistent with expected output size: ({}x{}x{}).'.format( - output.size(0), output.size(1), output.size(2), self.nOutputPlane, self.oH, self.oW)) - elif output.ndimension() == 4: - if output.size(1) != self.nOutputPlane or output.size(2) != self.oH or output.size(3) != self.oW: - raise RuntimeError('Given output size: ({}x{}x{}x{}) inconsistent with expected output size: ' - '(batchsize x{}x{}x{}).'.format( - output.size(0), output.size(1), output.size(2), - output.size(3), self.nOutputPlane, self.oH, self.oW)) - else: - raise RuntimeError('3D or 4D(batch mode) tensor expected') - - def updateOutput(self, input): - if self.finput is None: - self.finput = input.new() - if self.fgradInput is None: - self.fgradInput = input.new() - self._checkInputSize(input) - self._viewWeight() - input = self._makeContiguous(input) - self._backend.SpatialConvolutionLocal_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.iW, self.iH, - self.oW, self.oH - ) - self._unviewWeight() - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - self._checkInputSize(input) - self._checkOutputSize(input, gradOutput) - - self._viewWeight() - input, gradOutput = self._makeContiguous(input, gradOutput) - self._backend.SpatialConvolutionLocal_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.iW, self.iH, - self.oW, self.oH - ) - self._unviewWeight() - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self._checkInputSize(input) - self._checkOutputSize(input, gradOutput) - input, gradOutput = self._makeContiguous(input, gradOutput) - self._viewWeight() - self._backend.SpatialConvolutionLocal_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.iW, self.iH, - self.oW, self.oH, - scale - ) - self._unviewWeight() - - def type(self, type=None, tensorCache=None): - if self.finput is not None: - self.finput = torch.Tensor() - if self.fgradInput is not None: - self.fgradInput = torch.Tensor() - return super(SpatialConvolutionLocal, self).type(type, tensorCache) - - def __tostring__(self, ): - s = super(SpatialConvolution, self).__repr__() - s += '({} -> {}, {}x{}, {}x{}'.format(self.nInputPlane, self.nOutputPlane, self.iW, self.iH, self.kW, self.kH) - if self.dW != 1 or self.dH != 1 or self.padW != 0 or self.padH != 0: - s += ', {}, {}'.format(self.dW, self.dH) - - if self.padW != 0 or self.padH != 0: - s += ', {}, {}'.format(self.padW, self.padH) - - s += ')' - return s - - def clearState(self): - clear(self, 'finput', 'fgradInput', '_input', '_gradOutput') - return super(SpatialConvolutionLocal, self).clearState() diff --git a/torch/legacy/nn/SpatialConvolutionMap.py b/torch/legacy/nn/SpatialConvolutionMap.py deleted file mode 100644 index e901140a528bb3..00000000000000 --- a/torch/legacy/nn/SpatialConvolutionMap.py +++ /dev/null @@ -1,136 +0,0 @@ -import random 
-import math -import torch -from .Module import Module - -# TODO fix THNN... - - -class SpatialConvolutionMap(Module): - - class maps(object): - - @staticmethod - def full(nin, nout): - ft = torch.Tensor(nin * nout, 2) - p = 0 - for j in range(nout): - for i in range(nin): - ft[p][0] = i - ft[p][1] = j - p += 1 - return ft - - @staticmethod - def oneToOne(nfeat): - ft = torch.Tensor(nfeat, 2) - for i in range(nfeat): - ft[i][0] = i - ft[i][1] = i - return ft - - @staticmethod - def random(nin, nout, nto): - nker = nto * nout - tbl = torch.Tensor(nker, 2) - fi = torch.randperm(nin) - frcntr = 0 - nfi = math.floor(nin / nto) # number of distinct nto chunks - totbl = tbl.select(1, 1) - frtbl = tbl.select(1, 0) - fitbl = fi.narrow(0, 0, (nfi * nto)) # part of fi that covers distinct chunks - ufrtbl = frtbl.unfold(0, nto, nto) - utotbl = totbl.unfold(0, nto, nto) - ufitbl = fitbl.unfold(0, nto, nto) - - # start fill_ing frtbl - for i in range(nout): # fro each unit in target map - ufrtbl.select(0, i).copy_(ufitbl.select(0, frcntr)) - frcntr += 1 - if frcntr - 1 == nfi: # reset fi - fi.copy_(torch.randperm(nin)) - frcntr = 1 - - for tocntr in range(utotbl.size(0)): - utotbl.select(0, tocntr).fill_(tocntr) - - return tbl - - def __init__(self, conMatrix, kW, kH, dW=1, dH=1): - super(SpatialConvolutionMap, self).__init__() - - self.kW = kW - self.kH = kH - self.dW = dW - self.dH = dH - self.connTable = conMatrix - self.nInputPlane = int(self.connTable.select(1, 0).max()) + 1 - self.nOutputPlane = int(self.connTable.select(1, 1).max()) + 1 - self.weight = torch.Tensor(self.connTable.size(0), kH, kW) - self.bias = torch.Tensor(self.nOutputPlane) - self.gradWeight = torch.Tensor(self.connTable.size(0), kH, kW) - self.gradBias = torch.Tensor(self.nOutputPlane) - - self.reset() - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - self.weight.uniform_(-stdv, stdv) - self.bias.uniform_(-stdv, stdv) - else: - ninp = torch.Tensor(self.nOutputPlane).zero_() - for i in range(self.connTable.size(0)): - idx = int(self.connTable[i, 1]) - ninp[idx] += 1 - for k in range(self.connTable.size(0)): - idx = int(self.connTable[k, 1]) - stdv = 1. / math.sqrt(self.kW * self.kH * ninp[idx]) - self.weight.select(0, k).uniform_(-stdv, stdv) - for k in range(self.bias.size(0)): - stdv = 1. 
/ math.sqrt(self.kW * self.kH * ninp[k]) - # TODO: torch.uniform - self.bias[k] = random.uniform(-stdv, stdv) - - def updateOutput(self, input): - self._backend.SpatialConvolutionMap_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.connTable, - self.nInputPlane, - self.nOutputPlane, - self.dW, self.dH - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.SpatialConvolutionMap_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.bias, - self.connTable, - self.nInputPlane, - self.nOutputPlane, - self.dW, self.dH - ) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self._backend.SpatialConvolutionMap_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.connTable, - self.nInputPlane, - self.nOutputPlane, - self.dW, self.dH, - scale - ) diff --git a/torch/legacy/nn/SpatialCrossMapLRN.py b/torch/legacy/nn/SpatialCrossMapLRN.py deleted file mode 100644 index 57bbe81fce20ac..00000000000000 --- a/torch/legacy/nn/SpatialCrossMapLRN.py +++ /dev/null @@ -1,128 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class SpatialCrossMapLRN(Module): - - def __init__(self, size, alpha=1e-4, beta=0.75, k=1): - super(SpatialCrossMapLRN, self).__init__() - - self.size = size - self.alpha = alpha - self.beta = beta - self.k = k - self.scale = None - self.paddedRatio = None - self.accumRatio = None - - def updateOutput(self, input): - assert input.dim() == 4 - - if self.scale is None: - self.scale = input.new() - if input.type() == 'torch.cuda.FloatTensor': - self._backend.SpatialCrossMapLRN_updateOutput( - self._backend.library_state, - input, - self.output, - self.scale, - self.size, - self.alpha, - self.beta, - self.k - ) - else: - batchSize = input.size(0) - channels = input.size(1) - inputHeight = input.size(2) - inputWidth = input.size(3) - - self.output.resize_as_(input) - self.scale.resize_as_(input) - - # use output storage as temporary buffer - inputSquare = self.output - torch.pow(input, 2, out=inputSquare) - - prePad = int((self.size - 1) / 2 + 1) - prePadCrop = channels if prePad > channels else prePad - - scaleFirst = self.scale.select(1, 0) - scaleFirst.zero_() - # compute first feature map normalization - for c in range(prePadCrop): - scaleFirst.add_(inputSquare.select(1, c)) - - # reuse computations for next feature maps normalization - # by adding the next feature map and removing the previous - for c in range(1, channels): - scalePrevious = self.scale.select(1, c - 1) - scaleCurrent = self.scale.select(1, c) - scaleCurrent.copy_(scalePrevious) - if c < channels - prePad + 1: - squareNext = inputSquare.select(1, c + prePad - 1) - scaleCurrent.add_(1, squareNext) - - if c > prePad: - squarePrevious = inputSquare.select(1, c - prePad) - scaleCurrent.add_(-1, squarePrevious) - - self.scale.mul_(self.alpha / self.size).add_(self.k) - - torch.pow(self.scale, -self.beta, out=self.output) - self.output.mul_(input) - - return self.output - - def updateGradInput(self, input, gradOutput): - assert input.dim() == 4 - - if input.type() == 'torch.cuda.FloatTensor': - self._backend.SpatialCrossMapLRN_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.scale, - self.output, - self.size, - self.alpha, - self.beta, - self.k - ) - else: - batchSize = input.size(0) - channels = input.size(1) - 
inputHeight = input.size(2) - inputWidth = input.size(3) - - if self.paddedRatio is None: - self.paddedRatio = input.new() - if self.accumRatio is None: - self.accumRatio = input.new() - self.paddedRatio.resize_(channels + self.size - 1, inputHeight, inputWidth) - self.accumRatio.resize_(inputHeight, inputWidth) - - cacheRatioValue = 2 * self.alpha * self.beta / self.size - inversePrePad = int(self.size - (self.size - 1) / 2) - - self.gradInput.resize_as_(input) - torch.pow(self.scale, -self.beta, out=self.gradInput).mul_(gradOutput) - - self.paddedRatio.zero_() - paddedRatioCenter = self.paddedRatio.narrow(0, inversePrePad, channels) - for n in range(batchSize): - torch.mul(gradOutput[n], self.output[n], out=paddedRatioCenter) - paddedRatioCenter.div_(self.scale[n]) - torch.sum(self.paddedRatio.narrow(0, 0, self.size - 1), 0, keepdim=False, out=self.accumRatio) - for c in range(channels): - self.accumRatio.add_(self.paddedRatio[c + self.size - 1]) - self.gradInput[n][c].addcmul_(-cacheRatioValue, input[n][c], self.accumRatio) - self.accumRatio.add_(-1, self.paddedRatio[c]) - - return self.gradInput - - def clearState(self): - clear(self, 'scale', 'paddedRatio', 'accumRatio') - return super(SpatialCrossMapLRN, self).clearState() diff --git a/torch/legacy/nn/SpatialDilatedConvolution.py b/torch/legacy/nn/SpatialDilatedConvolution.py deleted file mode 100644 index 73056c89666c8a..00000000000000 --- a/torch/legacy/nn/SpatialDilatedConvolution.py +++ /dev/null @@ -1,88 +0,0 @@ -import torch -from .SpatialConvolution import SpatialConvolution - - -class SpatialDilatedConvolution(SpatialConvolution): - - def __init__(self, nInputPlane, nOutputPlane, kW, kH, dW=1, dH=1, padW=0, padH=None, dilationH=1, dilationW=None): - super(SpatialDilatedConvolution, self).__init__(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH) - - self.dilationH = dilationH - self.dilationW = dilationW if dilationW is not None else dilationH - - def updateOutput(self, input): - if self.finput is None: - self.finput = self.weight.new() - if self.fgradInput is None: - self.fgradInput = self.weight.new() - input = self._makeContiguous(input) - self._backend.SpatialDilatedConvolution_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.dilationH, self.dilationW - ) - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - input, gradOutput = self._makeContiguous(input, gradOutput) - if self.fgradInput is None: - self.fgradInput = self.weight.new() - self._backend.SpatialDilatedConvolution_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.finput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.dilationH, self.dilationW - ) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - input, gradOutput = self._makeContiguous(input, gradOutput) - if self.fgradInput is None: - self.fgradInput = self.weight.new() - self._backend.SpatialDilatedConvolution_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.dilationH, self.dilationW, - scale - ) - - def __repr__(self): - s = super(SpatialConvolution, self).__repr__() - s += '({} -> {}, 
{}x{}'.format(self.nInputPlane, self.nOutputPlane, self.kW, self.kH) - if self.dW != 1 or self.dH != 1 or self.padW != 0 or self.padH != 0: - s += ', {}, {}'.format(self.dW, self.dH) - - if self.padW != 0 or self.padH != 0: - s += ', {}, {}'.format(self.padW, self.padH) - - s += ', {}, {}'.format(self.dilationW, self.dilationH) - - s += ')' - if self.bias is None: - s += ' without bias' - return s diff --git a/torch/legacy/nn/SpatialDivisiveNormalization.py b/torch/legacy/nn/SpatialDivisiveNormalization.py deleted file mode 100644 index 6d7f6dce69bc28..00000000000000 --- a/torch/legacy/nn/SpatialDivisiveNormalization.py +++ /dev/null @@ -1,145 +0,0 @@ -import math -import torch -from .Module import Module -from .Sequential import Sequential -from .SpatialZeroPadding import SpatialZeroPadding -from .SpatialConvolution import SpatialConvolution -from .SpatialConvolutionMap import SpatialConvolutionMap -from .Replicate import Replicate -from .Square import Square -from .Sqrt import Sqrt -from .CDivTable import CDivTable -from .Threshold import Threshold -from .utils import clear - - -class SpatialDivisiveNormalization(Module): - - def __init__(self, nInputPlane=1, kernel=None, threshold=1e-4, thresval=None): - super(SpatialDivisiveNormalization, self).__init__() - - # get args - self.nInputPlane = nInputPlane - if kernel is None: - kernel = torch.Tensor(9, 9).fill_(1) - self.kernel = kernel - self.threshold = threshold - self.thresval = thresval if thresval is not None else threshold - kdim = self.kernel.ndimension() - - # check args - if kdim != 2 and kdim != 1: - raise ValueError('SpatialDivisiveNormalization averaging kernel must be 2D or 1D') - - if (self.kernel.size(0) % 2) == 0 or (kdim == 2 and (self.kernel.size(1) % 2) == 0): - raise ValueError('SpatialDivisiveNormalization averaging kernel must have ODD dimensions') - - # padding values - padH = int(math.floor(self.kernel.size(0) / 2)) - padW = padH - if kdim == 2: - padW = int(math.floor(self.kernel.size(1) / 2)) - - # create convolutional mean estimator - self.meanestimator = Sequential() - self.meanestimator.add(SpatialZeroPadding(padW, padW, padH, padH)) - if kdim == 2: - self.meanestimator.add(SpatialConvolution(self.nInputPlane, 1, self.kernel.size(1), self.kernel.size(0))) - else: - self.meanestimator.add(SpatialConvolutionMap( - SpatialConvolutionMap.maps.oneToOne(self.nInputPlane), self.kernel.size(0), 1)) - self.meanestimator.add(SpatialConvolution(self.nInputPlane, 1, 1, self.kernel.size(0))) - - self.meanestimator.add(Replicate(self.nInputPlane, 1)) - - # create convolutional std estimator - self.stdestimator = Sequential() - self.stdestimator.add(Square()) - self.stdestimator.add(SpatialZeroPadding(padW, padW, padH, padH)) - if kdim == 2: - self.stdestimator.add(SpatialConvolution(self.nInputPlane, 1, self.kernel.size(1), self.kernel.size(0))) - else: - self.stdestimator.add(SpatialConvolutionMap( - SpatialConvolutionMap.maps.oneToOne(self.nInputPlane), self.kernel.size(0), 1)) - self.stdestimator.add(SpatialConvolution(self.nInputPlane, 1, 1, self.kernel.size(0))) - - self.stdestimator.add(Replicate(self.nInputPlane, 1)) - self.stdestimator.add(Sqrt()) - - # set kernel and bias - if kdim == 2: - self.kernel.div_(self.kernel.sum() * self.nInputPlane) - for i in range(self.nInputPlane): - self.meanestimator.modules[1].weight[0][i] = self.kernel - self.stdestimator.modules[2].weight[0][i] = self.kernel - - self.meanestimator.modules[1].bias.zero_() - self.stdestimator.modules[2].bias.zero_() - else: - 
self.kernel.div_(self.kernel.sum() * math.sqrt(self.nInputPlane)) - for i in range(self.nInputPlane): - self.meanestimator.modules[1].weight[i].copy_(self.kernel) - self.meanestimator.modules[2].weight[0][i].copy_(self.kernel) - self.stdestimator.modules[2].weight[i].copy_(self.kernel) - self.stdestimator.modules[3].weight[0][i].copy_(self.kernel) - - self.meanestimator.modules[1].bias.zero_() - self.meanestimator.modules[2].bias.zero_() - self.stdestimator.modules[2].bias.zero_() - self.stdestimator.modules[3].bias.zero_() - - # other operation - self.normalizer = CDivTable() - self.divider = CDivTable() - self.thresholder = Threshold(self.threshold, self.thresval) - - # coefficient array, to adjust side effects - self.coef = torch.Tensor(1, 1, 1) - - self.ones = None - self._coef = None - - def updateOutput(self, input): - self.localstds = self.stdestimator.updateOutput(input) - - # compute side coefficients - dim = input.dim() - if (self.localstds.dim() != self.coef.dim() or - (input.size(dim - 1) != self.coef.size(dim - 1)) or - (input.size(dim - 2) != self.coef.size(dim - 2))): - if self.ones is None: - self.ones = input.new() - self.ones.resize_as_(input[0:1]).fill_(1) - coef = self.meanestimator.updateOutput(self.ones).squeeze(0) - if self._coef is None: - self._coef = input.new() - self._coef.resize_as_(coef).copy_(coef) # make contiguous for view - self.coef = self._coef.view(1, *self._coef.size()).expand_as(self.localstds) - - # normalize std dev - self.adjustedstds = self.divider.updateOutput([self.localstds, self.coef.contiguous().view_as(self.localstds)]) - self.thresholdedstds = self.thresholder.updateOutput(self.adjustedstds) - self.output = self.normalizer.updateOutput([input, self.thresholdedstds.contiguous().view_as(input)]) - - return self.output - - def updateGradInput(self, input, gradOutput): - # resize grad - self.gradInput.resize_as_(input).zero_() - - # backprop through all modules - gradnorm = (self.normalizer.updateGradInput( - [input, self.thresholdedstds.contiguous().view_as(input)], gradOutput)) - gradadj = self.thresholder.updateGradInput(self.adjustedstds, gradnorm[1]) - graddiv = (self.divider.updateGradInput( - [self.localstds, self.coef.contiguous().view_as(self.localstds)], gradadj)) - self.gradInput.add_(self.stdestimator.updateGradInput(input, graddiv[0])) - self.gradInput.add_(gradnorm[0]) - - return self.gradInput - - def clearState(self): - clear(self, 'ones', '_coef') - self.meanestimator.clearState() - self.stdestimator.clearState() - return super(SpatialDivisiveNormalization, self).clearState() diff --git a/torch/legacy/nn/SpatialDropout.py b/torch/legacy/nn/SpatialDropout.py deleted file mode 100644 index 0cf62af0d02c8e..00000000000000 --- a/torch/legacy/nn/SpatialDropout.py +++ /dev/null @@ -1,49 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class SpatialDropout(Module): - - def __init__(self, p=0.5): - super(SpatialDropout, self).__init__() - self.p = p - self.train = True - self.noise = torch.Tensor() - - def updateOutput(self, input): - self.output.resize_as_(input).copy_(input) - if self.train: - if input.dim() == 4: - self.noise.resize_(input.size(0), input.size(1), 1, 1) - else: - raise RuntimeError('Input must be 4D (nbatch, nfeat, h, w)') - - self.noise.bernoulli_(1 - self.p) - # We expand the random dropouts to the entire feature map because the - # features are likely correlated across the map and so the dropout - # should also be correlated. 
- self.output.mul_(self.noise.expand_as(input)) - else: - self.output.mul_(1 - self.p) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.train: - self.gradInput.resize_as_(gradOutput).copy_(gradOutput) - self.gradInput.mul_(self.noise.expand_as(input)) # simply mask the gradients with the noise vector - else: - raise RuntimeError('backprop only defined while training') - - return self.gradInput - - def setp(self, p): - self.p = p - - def __repr__(self): - return super(SpatialDropout, self).__repr__() - - def clearState(self): - clear(self, 'noise') - return super(SpatialDropout, self).clearState() diff --git a/torch/legacy/nn/SpatialFractionalMaxPooling.py b/torch/legacy/nn/SpatialFractionalMaxPooling.py deleted file mode 100644 index de0eeadbc8578b..00000000000000 --- a/torch/legacy/nn/SpatialFractionalMaxPooling.py +++ /dev/null @@ -1,135 +0,0 @@ -import math -import torch -from .Module import Module - - -class SpatialFractionalMaxPooling(Module): - # Usage: - # nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH) - # the output should be the exact size (outH x outW) - # nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH) - # the output should be the size (floor(inH x ratioH) x floor(inW x ratioW)) - # ratios are numbers between (0, 1) exclusive - - def __init__(self, poolSizeW, poolSizeH, arg1, arg2): - super(SpatialFractionalMaxPooling, self).__init__() - assert poolSizeW >= 2 - assert poolSizeH >= 2 - - # Pool size (how wide the pooling for each output unit is) - self.poolSizeW = poolSizeW - self.poolSizeH = poolSizeH - - # Random samples are drawn for all - # batch * plane * (height, width; i.e., 2) points. This determines - # the 2d "pseudorandom" overlapping pooling regions for each - # (batch element x input plane). A new set of random samples is - # drawn every updateOutput call, unless we disable it via - # .fixPoolingRegions(). - self.randomSamples = None - - # Flag to disable re-generation of random samples for producing - # a new pooling. 
For testing purposes - self.newRandomPool = False - - self.indices = None - - if arg1 >= 1 and arg2 >= 1: - # Desired output size: the input tensor will determine the reduction - # ratio - self.outW = arg1 - self.outH = arg2 - self.ratioW = self.ratioH = None - else: - # Reduction ratio specified per each input - # This is the reduction ratio that we use - self.ratioW = arg1 - self.ratioH = arg2 - self.outW = self.outH = None - - # The reduction ratio must be between 0 and 1 - assert self.ratioW > 0 and self.ratioW < 1 - assert self.ratioH > 0 and self.ratioH < 1 - - def _getBufferSize(self, input): - assert input.ndimension() == 4 - batchSize = input.size(0) - planeSize = input.size(1) - - return torch.Size([batchSize, planeSize, 2]) - - def _initSampleBuffer(self, input): - sampleBufferSize = self._getBufferSize(input) - - if self.randomSamples is None: - self.randomSamples = input.new().resize_(sampleBufferSize).uniform_() - elif self.randomSamples.size(0) != sampleBufferSize[0] or self.randomSamples.size(1) != sampleBufferSize[1]: - self.randomSamples.resize_(sampleBufferSize).uniform_() - elif not self.newRandomPool: - # Create new pooling windows, since this is a subsequent call - self.randomSamples.uniform_() - - def _getOutputSizes(self, input): - outW = self.outW - outH = self.outH - if self.ratioW is not None and self.ratioH is not None: - assert input.ndimension() == 4 - outW = int(math.floor(input.size(3) * self.ratioW)) - outH = int(math.floor(input.size(2) * self.ratioH)) - - # Neither can be smaller than 1 - assert outW > 0 - assert outH > 0 - else: - assert outW is not None and outH is not None - - return outW, outH - - # Call this to turn off regeneration of random pooling regions each - # updateOutput call. - def fixPoolingRegions(self, val=True): - self.newRandomPool = val - return self - - def updateOutput(self, input): - if self.indices is None: - self.indices = input.new() - self.indices = self.indices.long() - self._initSampleBuffer(input) - outW, outH = self._getOutputSizes(input) - - self._backend.SpatialFractionalMaxPooling_updateOutput( - self._backend.library_state, - input, - self.output, - outW, outH, self.poolSizeW, self.poolSizeH, - self.indices, self.randomSamples) - return self.output - - def updateGradInput(self, input, gradOutput): - assert self.randomSamples is not None - outW, outH = self._getOutputSizes(input) - - self._backend.SpatialFractionalMaxPooling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - outW, outH, self.poolSizeW, self.poolSizeH, - self.indices) - return self.gradInput - - # backward compat - def empty(self): - self.clearState() - - def clearState(self): - self.indices = None - self.randomSamples = None - return super(SpatialFractionalMaxPooling, self).clearState() - - def __repr__(self): - return super(SpatialFractionalMaxPooling, self).__repr__() + \ - '({}x{}, {}, {})'.format(self.outW or self.ratioW, - self.outH or self.ratioH, - self.poolSizeW, self.poolSizeH) diff --git a/torch/legacy/nn/SpatialFullConvolution.py b/torch/legacy/nn/SpatialFullConvolution.py deleted file mode 100644 index d68ea1439c0cb0..00000000000000 --- a/torch/legacy/nn/SpatialFullConvolution.py +++ /dev/null @@ -1,219 +0,0 @@ -import math -import torch -from .Module import Module -from .utils import clear - - -class SpatialFullConvolution(Module): - - def __init__(self, nInputPlane, nOutputPlane, kW, kH, dW=1, dH=1, padW=0, padH=None, adjW=0, adjH=0): - super(SpatialFullConvolution, self).__init__() - - 
self.nInputPlane = nInputPlane - self.nOutputPlane = nOutputPlane - self.kW = kW - self.kH = kH - self.dW = dW - self.dH = dH - self.padW = padW - self.padH = padH if padH is not None else padW - self.adjW = adjW - self.adjH = adjH - - if self.adjW > self.dW - 1 or self.adjH > self.dH - 1: - raise ValueError('adjW and adjH must be smaller than self.dW - 1 and self.dH - 1 respectively') - - self.weight = torch.Tensor(nInputPlane, nOutputPlane, kH, kW) - self.gradWeight = torch.Tensor(nInputPlane, nOutputPlane, kH, kW) - self.bias = torch.Tensor(self.nOutputPlane) - self.gradBias = torch.Tensor(self.nOutputPlane) - - self.ones = torch.Tensor() - self.finput = None - self.fgradInput = None - self.zeroScalar = None - self._input = None - self._gradOutput = None - - self.reset() - - def noBias(self): - self.bias = None - self.gradBias = None - return self - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - nInputPlane = self.nInputPlane - kH = self.kH - kW = self.kW - stdv = 1 / math.sqrt(kW * kH * nInputPlane) - - self.weight.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.uniform_(-stdv, stdv) - - def _makeContiguous(self, input, gradOutput=None): - if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input - - if gradOutput is not None: - if not gradOutput.is_contiguous(): - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - gradOutput = self._gradOutput - return input, gradOutput - - return input - - def _calculateAdj(self, targetSize, ker, pad, stride): - return (targetSize + 2 * pad - ker) % stride - - def updateOutput(self, input): - inputTensor = input - adjW, adjH = self.adjW, self.adjH - - # The input can be a table where the second element indicates the target - # output size, in which case the adj factors are computed automatically - if isinstance(input, list): - inputTensor = input[0] - targetTensor = input[1] - tDims = targetTensor.dim() - tH = targetTensor.size(tDims - 2) - tW = targetTensor.size(tDims - 1) - adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) - adjH = self._calculateAdj(tH, self.kH, self.padH, self.dH) - if not hasattr(self, 'finput') or self.finput is None: - self.finput = input[0].new() - if not hasattr(self, 'fgradInput') or self.fgradInput is None: - self.fgradInput = input[0].new() - else: - if not hasattr(self, 'finput') or self.finput is None: - self.finput = input.new() - if not hasattr(self, 'fgradInput') or self.fgradInput is None: - self.fgradInput = input.new() - - inputTensor = self._makeContiguous(inputTensor) - self._backend.SpatialFullConvolution_updateOutput( - self._backend.library_state, - inputTensor, - self.output, - self.weight, - self.bias, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - adjW, adjH - ) - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - inputTensor = input - adjW, adjH = self.adjW, self.adjH - - # The input can be a table where the second element indicates the target - # output size, in which case the adj factors are computed automatically - if isinstance(input, list): - inputTensor = input[0] - targetTensor = input[1] - tDims = targetTensor.dim() - tH = targetTensor.size(tDims - 2) - tW = targetTensor.size(tDims - 1) - adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) - adjH 
= self._calculateAdj(tH, self.kH, self.padH, self.dH) - # Momentarily extract the gradInput tensor - if isinstance(self.gradInput, list): - self.gradInput = self.gradInput[0] - - inputTensor, gradOutput = self._makeContiguous(inputTensor, gradOutput) - self._backend.SpatialFullConvolution_updateGradInput( - self._backend.library_state, - inputTensor, - gradOutput, - self.gradInput, - self.weight, - self.finput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - adjW, adjH - ) - - if isinstance(input, list): - # Create a zero tensor to be expanded and used as gradInput[1]. - if self.zeroScalar is None: - self.zeroScalar = input[1].new(1).zero_() - self.ones.resize_(input[1].dim()).fill_(1) - zeroTensor = self.zeroScalar.view_as(self.ones).expand_as(input[1]) - self.gradInput = [self.gradInput, zeroTensor] - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - inputTensor = input - adjW, adjH = self.adjW, self.adjH - - # The input can be a table where the second element indicates the target - # output size, in which case the adj factors are computed automatically - if isinstance(inputTensor, list): - inputTensor = input[0] - targetTensor = input[1] - tDims = targetTensor.dim() - tH = targetTensor.size(tDims - 2) - tW = targetTensor.size(tDims - 1) - adjW = calculateAdj(tW, self.kW, self.padW, self.dW) - adjH = calculateAdj(tH, self.kH, self.padH, self.dH) - - inputTensor, gradOutput = self._makeContiguous(inputTensor, gradOutput) - self._backend.SpatialFullConvolution_accGradParameters( - self._backend.library_state, - inputTensor, - gradOutput, - self.gradWeight, - self.gradBias, - self.finput, - self.fgradInput, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - adjW, adjH, - scale - ) - - def type(self, type=None, tensorCache=None): - if hasattr(self, 'finput') and self.finput is not None: - self.finput = torch.Tensor() - if hasattr(self, 'fgradInput') and self.fgradInput is not None: - self.fgradInput = torch.Tensor() - return super(SpatialFullConvolution, self).type(type, tensorCache) - - def __repr__(self): - s = super(SpatialFullConvolution, self).__repr__() - s += '({} -> {}, {}x{}'.format(self.nInputPlane, self.nOutputPlane, self.kW, self.kH) - if self.dW != 1 or self.dH != 1 or self.padW != 0 or self.padH != 0: - s += ', {}, {}'.format(self.dW, self.dH) - - if (self.padW or self.padH) and (self.padW != 0 or self.padH != 0): - s += ', {}, {}'.format(self.padW, self.padH) - - if (self.adjW or self.adjH) and (self.adjW != 0 or self.adjH != 0): - s += ', {}, {}'.format(self.adjW, self.adjH) - - s += ')' - if self.bias is None: - s += ' without bias' - return s - - def clearState(self): - clear(self, 'finput', 'fgradInput', '_input', '_gradOutput') - return super(SpatialFullConvolution, self).clearState() diff --git a/torch/legacy/nn/SpatialFullConvolutionMap.py b/torch/legacy/nn/SpatialFullConvolutionMap.py deleted file mode 100644 index b4981f3fd577f4..00000000000000 --- a/torch/legacy/nn/SpatialFullConvolutionMap.py +++ /dev/null @@ -1,88 +0,0 @@ -import random -import math -import torch -from .Module import Module - - -class SpatialFullConvolutionMap(Module): - - def __init__(self, conMatrix, kW, kH, dW=1, dH=1): - super(SpatialFullConvolutionMap, self).__init__() - - self.kW = kW - self.kH = kH - self.dW = dW - self.dH = dH - self.connTable = conMatrix - self.nInputPlane = int(self.connTable.select(1, 0).max()) + 1 - self.nOutputPlane = int(self.connTable.select(1, 1).max()) + 1 - - self.weight = 
torch.Tensor(self.connTable.size(0), kH, kW) - self.gradWeight = torch.Tensor(self.connTable.size(0), kH, kW) - - self.bias = torch.Tensor(self.nOutputPlane) - self.gradBias = torch.Tensor(self.nOutputPlane) - - self.reset() - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - self.weight.uniform_(-stdv, stdv) - self.bias.uniform_(-stdv, stdv) - else: - ninp = torch.Tensor(self.nOutputPlane).zero_() - for i in range(self.connTable.size(0)): - idx = int(self.connTable[i][1]) - ninp[idx] += 1 - for k in range(self.connTable.size(0)): - idx = int(self.connTable[k][1]) - stdv = 1. / math.sqrt(self.kW * self.kH * ninp[idx]) - self.weight[k].uniform_(-stdv, stdv) - for k in range(self.bias.size(0)): - stdv = 1. / math.sqrt(self.kW * self.kH * ninp[k]) - # TODO: torch.uniform - self.bias[k] = random.uniform(-stdv, stdv) - - def updateOutput(self, input): - self._backend.SpatialFullConvolutionMap_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.connTable, - self.nInputPlane, - self.nOutputPlane, - self.dW, self.dH - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.SpatialFullConvolutionMap_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.bias, - self.connTable, - self.nInputPlane, - self.nOutputPlane, - self.dW, self.dH - ) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self._backend.SpatialFullConvolutionMap_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.connTable, - self.nInputPlane, - self.nOutputPlane, - self.dW, self.dH, - scale - ) diff --git a/torch/legacy/nn/SpatialLPPooling.py b/torch/legacy/nn/SpatialLPPooling.py deleted file mode 100644 index cf84593da175bf..00000000000000 --- a/torch/legacy/nn/SpatialLPPooling.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch -from .Module import Module -from .Sequential import Sequential -from .Square import Square -from .Power import Power -from .SpatialAveragePooling import SpatialAveragePooling -from .MulConstant import MulConstant -from .Sqrt import Sqrt - - -class SpatialLPPooling(Sequential): - - def __init__(self, nInputPlane, pnorm, kW, kH, dW=None, dH=None): - super(SpatialLPPooling, self).__init__() - - dW = dW or kW - dH = dH or kH - - self.kW = kW - self.kH = kH - self.dW = dW - self.dH = dH - - if pnorm == 2: - self.add(Square()) - else: - self.add(Power(pnorm)) - - self.add(SpatialAveragePooling(kW, kH, dW, dH)) - self.add(MulConstant(kW * kH)) - if pnorm == 2: - self.add(Sqrt()) - else: - self.add(Power(1. / pnorm)) - - # the module is a Sequential: by default, it'll try to learn the parameters - # of the sub sampler: we avoid that by redefining its methods. 
- def reset(self, stdev=None): - pass - - def accGradParameters(self, input, gradOutput): - pass - - def accUpdateGradParameters(self, input, gradOutput, lr): - pass - - def zeroGradParameters(self): - pass - - def updateParameters(self, learningRate): - pass diff --git a/torch/legacy/nn/SpatialMaxPooling.py b/torch/legacy/nn/SpatialMaxPooling.py deleted file mode 100644 index 2b2051fe39a3ec..00000000000000 --- a/torch/legacy/nn/SpatialMaxPooling.py +++ /dev/null @@ -1,79 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class SpatialMaxPooling(Module): - - def __init__(self, kW, kH, dW=None, dH=None, padW=0, padH=0): - super(SpatialMaxPooling, self).__init__() - - dW = dW or kW - dH = dH or kH - - self.kW = kW - self.kH = kH - self.dW = dW - self.dH = dH - - self.padW = padW - self.padH = padH - - self.ceil_mode = False - self.indices = torch.LongTensor() - - def ceil(self): - self.ceil_mode = True - return self - - def floor(self): - self.ceil_mode = False - return self - - def updateOutput(self, input): - if not hasattr(self, 'indices') or self.indices is None: - self.indices = input.new() - self.indices = self.indices.long() - - dims = input.dim() - self.iheight = input.size(dims - 2) - self.iwidth = input.size(dims - 1) - - self._backend.SpatialMaxPooling_updateOutput( - self._backend.library_state, - input, - self.output, - self.indices, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.ceil_mode - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.SpatialMaxPooling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.indices, - self.kW, self.kH, - self.dW, self.dH, - self.padW, self.padH, - self.ceil_mode - ) - return self.gradInput - - def __repr__(self): - s = super(SpatialMaxPooling, self).__repr__() - s += '({}x{}, {}, {}'.format(self.kW, self.kH, self.dW, self.dH) - if (self.padW or self.padH) and (self.padW != 0 or self.padH != 0): - s += ', {}, {}'.format(self.padW, self.padH) - s += ')' - - return s - - def clearState(self): - clear(self, 'indices') - return super(SpatialMaxPooling, self).clearState() diff --git a/torch/legacy/nn/SpatialMaxUnpooling.py b/torch/legacy/nn/SpatialMaxUnpooling.py deleted file mode 100644 index 477ef43124c2f8..00000000000000 --- a/torch/legacy/nn/SpatialMaxUnpooling.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -from .Module import Module -from .SpatialMaxPooling import SpatialMaxPooling - - -class SpatialMaxUnpooling(Module): - - def __init__(self, poolingModule): - super(SpatialMaxUnpooling, self).__init__() - assert isinstance(poolingModule, SpatialMaxPooling) - assert poolingModule.kH == poolingModule.dH - assert poolingModule.kW == poolingModule.dW - self.pooling = poolingModule - - def _setParams(self): - self.indices = self.pooling.indices - self.oheight = self.pooling.iheight - self.owidth = self.pooling.iwidth - - def updateOutput(self, input): - self._setParams() - self._backend.SpatialMaxUnpooling_updateOutput( - self._backend.library_state, - input, - self.output, - self.indices, - self.owidth, self.oheight - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._setParams() - self._backend.SpatialMaxUnpooling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.indices, - self.owidth, self.oheight - ) - return self.gradInput - - def __repr__(self): - return 'nn.SpatialMaxUnpooling associated to ' + self.pooling.__repr__() diff --git 
a/torch/legacy/nn/SpatialReflectionPadding.py b/torch/legacy/nn/SpatialReflectionPadding.py deleted file mode 100644 index b8f3d15ba39846..00000000000000 --- a/torch/legacy/nn/SpatialReflectionPadding.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -from .Module import Module - - -class SpatialReflectionPadding(Module): - - def __init__(self, pad_l, pad_r=None, pad_t=None, pad_b=None): - super(SpatialReflectionPadding, self).__init__() - self.pad_l = pad_l - self.pad_r = pad_r if pad_r is not None else pad_l - self.pad_t = pad_t if pad_t is not None else pad_l - self.pad_b = pad_b if pad_b is not None else pad_l - - def updateOutput(self, input): - assert input.dim() == 4 - self._backend.SpatialReflectionPadding_updateOutput( - self._backend.library_state, - input, - self.output, - self.pad_l, self.pad_r, self.pad_t, self.pad_b - ) - - return self.output - - def updateGradInput(self, input, gradOutput): - assert input.dim() == 4 and gradOutput.dim() == 4 - assert input.size(0) == gradOutput.size(0) and \ - input.size(1) == gradOutput.size(1) and \ - input.size(2) + self.pad_t + self.pad_b == gradOutput.size(2) and \ - input.size(3) + self.pad_l + self.pad_r == gradOutput.size(3) - - self._backend.SpatialReflectionPadding_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.pad_l, self.pad_r, self.pad_t, self.pad_b - ) - return self.gradInput - - def __repr__(self): - s = super(SpatialReflectionPadding, self).__repr__() - s += '({}, {}, {}, {})'.format(self.pad_l, self.pad_r, self.pad_t, self.pad_b) - return s diff --git a/torch/legacy/nn/SpatialReplicationPadding.py b/torch/legacy/nn/SpatialReplicationPadding.py deleted file mode 100644 index 67a79a965f072b..00000000000000 --- a/torch/legacy/nn/SpatialReplicationPadding.py +++ /dev/null @@ -1,45 +0,0 @@ -import torch -from .Module import Module - - -class SpatialReplicationPadding(Module): - - def __init__(self, pad_l, pad_r=None, pad_t=None, pad_b=None): - super(SpatialReplicationPadding, self).__init__() - self.pad_l = pad_l - self.pad_r = pad_r if pad_r is not None else pad_l - self.pad_t = pad_t if pad_t is not None else pad_l - self.pad_b = pad_b if pad_b is not None else pad_l - - def updateOutput(self, input): - assert input.dim() == 4 - self._backend.SpatialReplicationPadding_updateOutput( - self._backend.library_state, - input, - self.output, - self.pad_l, self.pad_r, self.pad_t, self.pad_b - ) - - return self.output - - def updateGradInput(self, input, gradOutput): - assert input.dim() == 4 and gradOutput.dim() == 4 - assert input.size(0) == gradOutput.size(0) and \ - input.size(1) == gradOutput.size(1) and \ - input.size(2) + self.pad_t + self.pad_b == gradOutput.size(2) and \ - input.size(3) + self.pad_l + self.pad_r == gradOutput.size(3) - - self._backend.SpatialReplicationPadding_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.pad_l, self.pad_r, self.pad_t, self.pad_b - ) - - return self.gradInput - - def __repr__(self): - s = super(SpatialReplicationPadding, self).__repr__() - s += '({}, {}, {}, {})'.format(self.pad_l, self.pad_r, self.pad_t, self.pad_b) - return s diff --git a/torch/legacy/nn/SpatialSoftMax.py b/torch/legacy/nn/SpatialSoftMax.py deleted file mode 100644 index f14c5637b91ecf..00000000000000 --- a/torch/legacy/nn/SpatialSoftMax.py +++ /dev/null @@ -1,21 +0,0 @@ -import torch -from .Module import Module - - -class SpatialSoftMax(Module): - - def updateOutput(self, input): - self.output = torch.softmax( - input, - 0 if 
input.dim() == 1 or input.dim() == 3 else 1 - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput = torch.softmax_backward_data( - gradOutput, - self.output, - 0 if input.dim() == 1 or input.dim() == 3 else 1, - input - ) - return self.gradInput diff --git a/torch/legacy/nn/SpatialSubSampling.py b/torch/legacy/nn/SpatialSubSampling.py deleted file mode 100644 index 2429800f075bb1..00000000000000 --- a/torch/legacy/nn/SpatialSubSampling.py +++ /dev/null @@ -1,70 +0,0 @@ -import math -import torch -from .Module import Module - - -class SpatialSubSampling(Module): - - def __init__(self, nInputPlane, kW, kH, dW=1, dH=1): - super(SpatialSubSampling, self).__init__() - - self.nInputPlane = nInputPlane - self.kW = kW - self.kH = kH - self.dW = dW - self.dH = dH - - self.weight = torch.Tensor(nInputPlane) - self.bias = torch.Tensor(nInputPlane) - self.gradWeight = torch.Tensor(nInputPlane) - self.gradBias = torch.Tensor(nInputPlane) - - self.reset() - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. / math.sqrt(self.kW * self.kH) - - self.weight.uniform_(-stdv, stdv) - self.bias.uniform_(-stdv, stdv) - - def updateOutput(self, input): - self._backend.SpatialSubSampling_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.kW, self.kH, - self.dW, self.dH - ) - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - self._backend.SpatialSubSampling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.kW, self.kH, - self.dW, self.dH - ) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self._backend.SpatialSubSampling_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.kW, self.kH, - self.dW, self.dH, - scale - ) diff --git a/torch/legacy/nn/SpatialSubtractiveNormalization.py b/torch/legacy/nn/SpatialSubtractiveNormalization.py deleted file mode 100644 index 3b8fed3fb956e7..00000000000000 --- a/torch/legacy/nn/SpatialSubtractiveNormalization.py +++ /dev/null @@ -1,122 +0,0 @@ -import math -import torch -from .Module import Module -from .Sequential import Sequential -from .SpatialZeroPadding import SpatialZeroPadding -from .SpatialConvolution import SpatialConvolution -from .SpatialConvolutionMap import SpatialConvolutionMap -from .Replicate import Replicate -from .CSubTable import CSubTable -from .CDivTable import CDivTable -from .utils import clear -import warnings - - -class SpatialSubtractiveNormalization(Module): - - def __init__(self, nInputPlane=1, kernel=None): - super(SpatialSubtractiveNormalization, self).__init__() - - # get args - self.nInputPlane = nInputPlane - if kernel is None: - kernel = torch.Tensor(9, 9).fill_(1) - self.kernel = kernel - kdim = self.kernel.ndimension() - - # check args - if kdim != 2 and kdim != 1: - raise ValueError('SpatialSubtractiveNormalization averaging kernel must be 2D or 1D') - - if (self.kernel.size(0) % 2) == 0 or (kdim == 2 and (self.kernel.size(1) % 2) == 0): - raise ValueError('SpatialSubtractiveNormalization averaging kernel must have ODD dimensions') - - # normalize kernel - self.kernel.div_(self.kernel.sum() * self.nInputPlane) - - # padding values - padH = int(math.floor(self.kernel.size(0) / 2)) - padW = padH - if kdim == 2: - padW = int(math.floor(self.kernel.size(1) / 2)) - - # create 
convolutional mean extractor - self.meanestimator = Sequential() - self.meanestimator.add(SpatialZeroPadding(padW, padW, padH, padH)) - if kdim == 2: - self.meanestimator.add(SpatialConvolution(self.nInputPlane, 1, self.kernel.size(1), self.kernel.size(0))) - else: - # TODO: map - self.meanestimator.add(SpatialConvolutionMap( - SpatialConvolutionMap.maps.oneToOne(self.nInputPlane), self.kernel.size(0), 1)) - self.meanestimator.add(SpatialConvolution(self.nInputPlane, 1, 1, self.kernel.size(0))) - - self.meanestimator.add(Replicate(self.nInputPlane, 0)) - - # set kernel and bias - if kdim == 2: - for i in range(self.nInputPlane): - self.meanestimator.modules[1].weight[0][i] = self.kernel - self.meanestimator.modules[1].bias.zero_() - else: - for i in range(self.nInputPlane): - self.meanestimator.modules[1].weight[i] = self.kernel.unsqueeze(0) - self.meanestimator.modules[2].weight[0][i] = self.kernel.unsqueeze(1) - - self.meanestimator.modules[1].bias.zero_() - self.meanestimator.modules[2].bias.zero_() - - # other operation - self.subtractor = CSubTable() - self.divider = CDivTable() - - # coefficient array, to adjust side effects - self.coef = torch.Tensor(1, 1, 1) - - self.ones = None - self._coef = None - - def updateOutput(self, input): - # compute side coefficients - dim = input.dim() - if (input.dim() + 1 != self.coef.dim() or - (input.size(dim - 1) != self.coef.size(dim - 1)) or - (input.size(dim - 2) != self.coef.size(dim - 2))): - if self.ones is None: - self.ones = input.new() - if self._coef is None: - self._coef = self.coef.new() - - self.ones.resize_as_(input[0:1]).fill_(1) - coef = self.meanestimator.updateOutput(self.ones).squeeze(0) - self._coef.resize_as_(coef).copy_(coef) # make contiguous for view - size = list(coef.size()) - size = [input.size(0)] + size - self.coef = self._coef.view(1, *self._coef.size()).expand(*size) - - # compute mean - self.localsums = self.meanestimator.updateOutput(input) - self.adjustedsums = (self.divider.updateOutput( - [self.localsums, self.coef.contiguous().view_as(self.localsums)])) - self.output = self.subtractor.updateOutput([input, self.adjustedsums.contiguous().view_as(input)]) - - return self.output - - def updateGradInput(self, input, gradOutput): - # resize grad - self.gradInput.resize_as_(input).zero_() - - # backprop through all modules - gradsub = self.subtractor.updateGradInput([input, self.adjustedsums.contiguous().view_as(input)], gradOutput) - graddiv = (self.divider.updateGradInput( - [self.localsums, self.coef.contiguous().view_as(self.localsums)], gradsub[1])) - size = self.meanestimator.updateGradInput(input, graddiv[0]).size() - self.gradInput.add_(self.meanestimator.updateGradInput(input, graddiv[0])) - self.gradInput.add_(gradsub[0]) - - return self.gradInput - - def clearState(self): - clear(self, 'ones', '_coef') - self.meanestimator.clearState() - return super(SpatialSubtractiveNormalization, self).clearState() diff --git a/torch/legacy/nn/SpatialUpSamplingNearest.py b/torch/legacy/nn/SpatialUpSamplingNearest.py deleted file mode 100644 index 1388f80e247fcf..00000000000000 --- a/torch/legacy/nn/SpatialUpSamplingNearest.py +++ /dev/null @@ -1,61 +0,0 @@ -import torch -from .Module import Module - - -class SpatialUpSamplingNearest(Module): - """ - Applies a 2D up-sampling over an input image composed of several input planes. - - The upsampling is.ne using the simple nearest neighbor technique. - - The Y and X dimensions are assumed to be the last 2 tensor dimensions. 
For - instance, if the tensor is 4D,: dim 3 is the y dimension and dim 4 is the x. - - owidth = width*scale_factor - oheight = height*scale_factor - """ - - def __init__(self, scale): - super(SpatialUpSamplingNearest, self).__init__() - - self.scale_factor = scale - if self.scale_factor < 1: - raise ValueError('scale_factor must be greater than 1') - if self.scale_factor % 1 != 0: - raise ValueError('scale_factor must be integer') - - def updateOutput(self, input): - assert input.dim() == 4 - - # Copy the input size - xdim = input.dim() - 1 - ydim = input.dim() - 2 - outputSize = list(input.size()) - outputSize[ydim] = outputSize[ydim] * self.scale_factor - outputSize[xdim] = outputSize[xdim] * self.scale_factor - self.output.resize_(*outputSize) - self._backend.SpatialUpSamplingNearest_updateOutput( - self._backend.library_state, - input, - self.output, - outputSize[ydim], - outputSize[xdim] - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self.gradInput.resize_as_(input) - assert input.dim() == 4 - input_size = input.size() - self._backend.SpatialUpSamplingNearest_updateGradInput( - self._backend.library_state, - gradOutput, - self.gradInput, - input_size[0], - input_size[1], - input_size[2], - input_size[3], - gradOutput.shape[2], - gradOutput.shape[3] - ) - return self.gradInput diff --git a/torch/legacy/nn/SpatialZeroPadding.py b/torch/legacy/nn/SpatialZeroPadding.py deleted file mode 100644 index 430feb43bbbe13..00000000000000 --- a/torch/legacy/nn/SpatialZeroPadding.py +++ /dev/null @@ -1,81 +0,0 @@ -import torch -from .Module import Module - - -class SpatialZeroPadding(Module): - - def __init__(self, pad_l, pad_r=None, pad_t=None, pad_b=None): - super(SpatialZeroPadding, self).__init__() - self.pad_l = pad_l - self.pad_r = pad_r if pad_r is not None else pad_l - self.pad_t = pad_t if pad_t is not None else pad_l - self.pad_b = pad_b if pad_b is not None else pad_l - - def updateOutput(self, input): - assert input.dim() == 4 - - # sizes - h = input.size(2) + self.pad_t + self.pad_b - w = input.size(3) + self.pad_l + self.pad_r - if w < 1 or h < 1: - raise RuntimeError('input is too small (feature map size: {}x{})'.format(h, w)) - self.output.resize_(input.size(0), input.size(1), h, w) - self.output.zero_() - # crop input if necessary - c_input = input - if self.pad_t < 0: - c_input = c_input.narrow(2, 0 - self.pad_t, c_input.size(2) + self.pad_t) - if self.pad_b < 0: - c_input = c_input.narrow(2, 0, c_input.size(2) + self.pad_b) - if self.pad_l < 0: - c_input = c_input.narrow(3, 0 - self.pad_l, c_input.size(3) + self.pad_l) - if self.pad_r < 0: - c_input = c_input.narrow(3, 0, c_input.size(3) + self.pad_r) - # crop output if necessary - c_output = self.output - if self.pad_t > 0: - c_output = c_output.narrow(2, 0 + self.pad_t, c_output.size(2) - self.pad_t) - if self.pad_b > 0: - c_output = c_output.narrow(2, 0, c_output.size(2) - self.pad_b) - if self.pad_l > 0: - c_output = c_output.narrow(3, 0 + self.pad_l, c_output.size(3) - self.pad_l) - if self.pad_r > 0: - c_output = c_output.narrow(3, 0, c_output.size(3) - self.pad_r) - # copy input to output - c_output.copy_(c_input) - - return self.output - - def updateGradInput(self, input, gradOutput): - assert input.dim() == 4 - - self.gradInput.resize_as_(input).zero_() - # crop gradInput if necessary - cg_input = self.gradInput - if self.pad_t < 0: - cg_input = cg_input.narrow(2, 0 - self.pad_t, cg_input.size(2) + self.pad_t) - if self.pad_b < 0: - cg_input = cg_input.narrow(2, 0, cg_input.size(2) + 
self.pad_b) - if self.pad_l < 0: - cg_input = cg_input.narrow(3, 0 - self.pad_l, cg_input.size(3) + self.pad_l) - if self.pad_r < 0: - cg_input = cg_input.narrow(3, 0, cg_input.size(3) + self.pad_r) - # crop gradOutput if necessary - cg_output = gradOutput - if self.pad_t > 0: - cg_output = cg_output.narrow(2, 0 + self.pad_t, cg_output.size(2) - self.pad_t) - if self.pad_b > 0: - cg_output = cg_output.narrow(2, 0, cg_output.size(2) - self.pad_b) - if self.pad_l > 0: - cg_output = cg_output.narrow(3, 0 + self.pad_l, cg_output.size(3) - self.pad_l) - if self.pad_r > 0: - cg_output = cg_output.narrow(3, 0, cg_output.size(3) - self.pad_r) - # copy gradOutput to gradInput - cg_input.copy_(cg_output) - - return self.gradInput - - def __tostring__(self, ): - s = super(SpatialZeroPadding, self).__repr__() - s += '({}, {}, {}, {})'.foramat(self.pad_l, self.pad_r, self.pad_t, self.pad_b) - return s diff --git a/torch/legacy/nn/SplitTable.py b/torch/legacy/nn/SplitTable.py deleted file mode 100644 index c93079d5747b1b..00000000000000 --- a/torch/legacy/nn/SplitTable.py +++ /dev/null @@ -1,39 +0,0 @@ -import torch -from .Module import Module - - -class SplitTable(Module): - - def __init__(self, dimension): - super(SplitTable, self).__init__() - self.dimension = dimension - - def _getPositiveDimension(self, input): - dimension = self.dimension - if dimension < 0: - dimension = input.dim() + dimension - - return dimension - - def updateOutput(self, input): - dimension = self._getPositiveDimension(input) - slices = input.size(dimension) - - currentOutput = [] - for i in range(slices): - currentOutput.append(input.select(dimension, i)) - - self.output = currentOutput - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - dimension = self._getPositiveDimension(input) - slices = input.size(dimension) - self.gradInput.resize_as_(input) - - for i in range(slices): - self.gradInput.select(dimension, i).copy_(gradOutput[i]) - - return self.gradInput diff --git a/torch/legacy/nn/Sqrt.py b/torch/legacy/nn/Sqrt.py deleted file mode 100644 index e046594be2f1a7..00000000000000 --- a/torch/legacy/nn/Sqrt.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -from .Module import Module - - -class Sqrt(Module): - - def __init__(self, b=0, eps=0): - super(Sqrt, self).__init__() - self.eps = b - self.eps = eps - - def updateOutput(self, input): - self._backend.Sqrt_updateOutput( - self._backend.library_state, - input, - self.output, - self.eps - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.Sqrt_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.output - ) - return self.gradInput diff --git a/torch/legacy/nn/Square.py b/torch/legacy/nn/Square.py deleted file mode 100644 index 9ebaa371eda580..00000000000000 --- a/torch/legacy/nn/Square.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from .Module import Module - - -class Square(Module): - - def updateOutput(self, input): - self._backend.Square_updateOutput( - self._backend.library_state, - input, - self.output - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.Square_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput - ) - return self.gradInput diff --git a/torch/legacy/nn/Squeeze.py b/torch/legacy/nn/Squeeze.py deleted file mode 100644 index 782b8f5cd90b08..00000000000000 --- a/torch/legacy/nn/Squeeze.py +++ /dev/null @@ -1,19 +0,0 @@ -import torch -from 
.Module import Module - - -class Squeeze(Module): - - def __init__(self, dim=None): - super(Squeeze, self).__init__() - self.dim = dim - - def updateOutput(self, input): - dim = self.dim - self.output.set_(input.squeeze(dim) if dim is not None else input.squeeze()) - return self.output - - def updateGradInput(self, input, gradOutput): - assert input.nelement() == gradOutput.nelement() - self.gradInput.set_(gradOutput.contiguous().view_as(input)) - return self.gradInput diff --git a/torch/legacy/nn/Sum.py b/torch/legacy/nn/Sum.py deleted file mode 100644 index 31fd988460ab73..00000000000000 --- a/torch/legacy/nn/Sum.py +++ /dev/null @@ -1,54 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class Sum(Module): - - def __init__(self, dimension=0, sizeAverage=False): - super(Sum, self).__init__() - self.dimension = dimension - self.sizeAverage = sizeAverage - self._gradOutput = None - - def _getPositiveDimension(self, input): - dimension = self.dimension - if dimension < 0: - dimension = input.dim() + dimension - return dimension - - def updateOutput(self, input): - dimension = self._getPositiveDimension(input) - - torch.sum(input, dimension, out=self.output, keepdim=True) - if self.sizeAverage: - self.output.div_(input.size(dimension)) - if self.output.dim() > 1: - self.output.set_(self.output.select(dimension, 0)) - - return self.output - - def updateGradInput(self, input, gradOutput): - dimension = self._getPositiveDimension(input) - # zero-strides don't work with MKL/BLAS, so - # don't set self.gradInput to zero-stride tensor. - # Instead, do a deepcopy. - size = list(input.size()) - size[dimension] = 1 - if not gradOutput.is_contiguous(): - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - gradOutput = self._gradOutput - - gradOutput = gradOutput.view(*size) - self.gradInput.resize_as_(input) - self.gradInput.copy_(gradOutput.expand_as(input)) - if self.sizeAverage: - self.gradInput.div_(input.size(dimension)) - - return self.gradInput - - def clearState(self): - clear(self, '_gradOutput') - return super(Sum, self).clearState() diff --git a/torch/legacy/nn/Tanh.py b/torch/legacy/nn/Tanh.py deleted file mode 100644 index 872cf2f5e7aa87..00000000000000 --- a/torch/legacy/nn/Tanh.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from .Module import Module - - -class Tanh(Module): - - def updateOutput(self, input): - self._backend.Tanh_updateOutput( - self._backend.library_state, - input, - self.output - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.Tanh_updateGradInput( - self._backend.library_state, - gradOutput, - self.gradInput, - self.output - ) - return self.gradInput diff --git a/torch/legacy/nn/TanhShrink.py b/torch/legacy/nn/TanhShrink.py deleted file mode 100644 index 36ef0f1689b11a..00000000000000 --- a/torch/legacy/nn/TanhShrink.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from .Module import Module -from .Tanh import Tanh - - -class TanhShrink(Module): - - def __init__(self): - super(TanhShrink, self).__init__() - self.tanh = Tanh() - - def updateOutput(self, input): - th = self.tanh.updateOutput(input) - self.output.resize_as_(input).copy_(input) - self.output.add_(-1, th) - return self.output - - def updateGradInput(self, input, gradOutput): - dth = self.tanh.updateGradInput(input, gradOutput) - self.gradInput.resize_as_(input).copy_(gradOutput) - self.gradInput.add_(-1, dth) - return self.gradInput diff --git 
a/torch/legacy/nn/TemporalConvolution.py b/torch/legacy/nn/TemporalConvolution.py deleted file mode 100644 index 4ac04f264eb0f9..00000000000000 --- a/torch/legacy/nn/TemporalConvolution.py +++ /dev/null @@ -1,70 +0,0 @@ -import math -import torch -from .Module import Module - - -class TemporalConvolution(Module): - - def __init__(self, inputFrameSize, outputFrameSize, kW, dW=1): - super(TemporalConvolution, self).__init__() - - self.inputFrameSize = inputFrameSize - self.outputFrameSize = outputFrameSize - self.kW = kW - self.dW = dW - - self.weight = torch.Tensor(outputFrameSize, inputFrameSize * kW) - self.bias = torch.Tensor(outputFrameSize) - self.gradWeight = torch.Tensor(outputFrameSize, inputFrameSize * kW) - self.gradBias = torch.Tensor(outputFrameSize) - - self.reset() - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. / math.sqrt(self.kW * self.inputFrameSize) - - self.weight.uniform_(-stdv, stdv) - self.bias.uniform_(-stdv, stdv) - - def updateOutput(self, input): - self._backend.TemporalConvolution_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.kW, - self.dW, - self.inputFrameSize, - self.outputFrameSize - ) - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - self._backend.TemporalConvolution_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.kW, - self.dW - ) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self._backend.TemporalConvolution_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.kW, - self.dW, - scale - ) diff --git a/torch/legacy/nn/TemporalMaxPooling.py b/torch/legacy/nn/TemporalMaxPooling.py deleted file mode 100644 index d3088ca4f38789..00000000000000 --- a/torch/legacy/nn/TemporalMaxPooling.py +++ /dev/null @@ -1,43 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class TemporalMaxPooling(Module): - - def __init__(self, kW, dW=None): - super(TemporalMaxPooling, self).__init__() - self.kW = kW - self.dW = dW or kW - self.indices = None - - def updateOutput(self, input): - if self.indices is None: - self.indices = input.new() - self._backend.TemporalMaxPooling_updateOutput( - self._backend.library_state, - input, - self.output, - self.indices, - self.kW, - self.dW - ) - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - self._backend.TemporalMaxPooling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.indices, - self.kW, - self.dW - ) - return self.gradInput - - def clearState(self): - clear(self, 'indices') - return super(TemporalMaxPooling, self).clearState() diff --git a/torch/legacy/nn/TemporalSubSampling.py b/torch/legacy/nn/TemporalSubSampling.py deleted file mode 100644 index 823070bbccff8f..00000000000000 --- a/torch/legacy/nn/TemporalSubSampling.py +++ /dev/null @@ -1,68 +0,0 @@ -import math -import torch -from .Module import Module - - -class TemporalSubSampling(Module): - - def __init__(self, inputFrameSize, kW, dW=1): - super(TemporalSubSampling, self).__init__() - - self.inputFrameSize = inputFrameSize - self.kW = kW - self.dW = dW - - self.weight = torch.Tensor(inputFrameSize) - self.bias = torch.Tensor(inputFrameSize) - self.gradWeight = torch.Tensor(inputFrameSize) - self.gradBias = 
torch.Tensor(inputFrameSize) - - self.reset() - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. / math.sqrt(self.kW) - - self.weight.uniform_(-stdv, stdv) - self.bias.uniform_(-stdv, stdv) - - def updateOutput(self, input): - self._backend.TemporalSubSampling_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.kW, - self.dW, - self.inputFrameSize - ) - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - self._backend.TemporalSubSampling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.kW, - self.dW - ) - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - self._backend.TemporalSubSampling_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.kW, - self.dW, - scale - ) diff --git a/torch/legacy/nn/Threshold.py b/torch/legacy/nn/Threshold.py deleted file mode 100644 index f151d023e83bf1..00000000000000 --- a/torch/legacy/nn/Threshold.py +++ /dev/null @@ -1,45 +0,0 @@ -import torch -from .Module import Module - - -class Threshold(Module): - - def __init__(self, threshold=0, value=0, inplace=False): - super(Threshold, self).__init__() - self.threshold = threshold - self.value = value - - # default for inplace is False - self.inplace = inplace - self.validateParameters() - - def updateOutput(self, input): - self.validateParameters() - self._backend.Threshold_updateOutput( - self._backend.library_state, - input, - self.output, - self.threshold, - self.value, - self.inplace - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self.validateParameters() - self._backend.Threshold_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.threshold, - self.value, - self.inplace - ) - return self.gradInput - - def validateParameters(self): - if self.inplace: - if self.value > self.threshold: - raise RuntimeError('in-place processing requires value ({}) to not ' - 'exceed threshold ({})'.format(self.value, self.threshold)) diff --git a/torch/legacy/nn/Transpose.py b/torch/legacy/nn/Transpose.py deleted file mode 100644 index 4478c251e0202b..00000000000000 --- a/torch/legacy/nn/Transpose.py +++ /dev/null @@ -1,24 +0,0 @@ -import torch -from .Module import Module - - -class Transpose(Module): - # transpose dimensions: - # n = nn.Transpose({1, 4}, {1, 3}) - # will transpose dims 1 and 4,: 1 and 3... 
- - def __init__(self, *args): - super(Transpose, self).__init__() - self.permutations = args - - def updateOutput(self, input): - for perm in self.permutations: - input = input.transpose(*perm) - self.output.resize_as_(input).copy_(input) - return self.output - - def updateGradInput(self, input, gradOutput): - for perm in self.permutations[::-1]: - gradOutput = gradOutput.transpose(*perm) - self.gradInput.resize_as_(gradOutput).copy_(gradOutput) - return self.gradInput diff --git a/torch/legacy/nn/Unsqueeze.py b/torch/legacy/nn/Unsqueeze.py deleted file mode 100644 index 21aaff890d97a3..00000000000000 --- a/torch/legacy/nn/Unsqueeze.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from .Module import Module -from .utils import addSingletondimension - - -class Unsqueeze(Module): - - def __init__(self, dim): - super(Unsqueeze, self).__init__() - self.dim = dim - - def updateOutput(self, input): - addSingletondimension(self.output, input, self.dim) - return self.output - - def updateGradInput(self, input, gradOutput): - assert input.nelement() == gradOutput.nelement() - self.gradInput = gradOutput.contiguous().view(input.size()) - return self.gradInput - - def __repr__(self): - return super(Unsqueeze, self).__repr__() + '({})'.format(self.dim) diff --git a/torch/legacy/nn/View.py b/torch/legacy/nn/View.py deleted file mode 100644 index 0d5f7ace279c37..00000000000000 --- a/torch/legacy/nn/View.py +++ /dev/null @@ -1,43 +0,0 @@ -import torch -from .Module import Module - - -class View(Module): - - def resetSize(self, *args): - if len(args) == 1 and isinstance(args[0], torch.Size): - self.size = args[0] - else: - self.size = torch.Size(args) - - self.numElements = 1 - inferdim = False - for i in range(len(self.size)): - szi = self.size[i] - if szi >= 0: - self.numElements = self.numElements * self.size[i] - else: - assert szi == -1 - assert not inferdim - inferdim = True - - return self - - def __init__(self, *args): - super(View, self).__init__() - self.resetSize(*args) - - def updateOutput(self, input): - if self.output is None: - self.output = input.new() - self.output = input.view(self.size) - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - self.gradInput = gradOutput.new() - self.gradInput = gradOutput.contiguous().view(input.size()) - return self.gradInput - - def __repr__(self): - return super(View, self).__repr__() + '({})'.format(', '.join(map(str, self.size))) diff --git a/torch/legacy/nn/VolumetricAveragePooling.py b/torch/legacy/nn/VolumetricAveragePooling.py deleted file mode 100644 index 3e6dd153149649..00000000000000 --- a/torch/legacy/nn/VolumetricAveragePooling.py +++ /dev/null @@ -1,63 +0,0 @@ -import torch -from .Module import Module - - -class VolumetricAveragePooling(Module): - - def __init__(self, kT, kW, kH, dT=None, dW=None, dH=None, - padT=0, padW=0, padH=0, - ceil_mode=False, count_include_pad=True): - super(VolumetricAveragePooling, self).__init__() - self.kT = kT - self.kH = kH - self.kW = kW - self.dT = dT or kT - self.dW = dW or kW - self.dH = dH or kH - self.padT = padT - self.padW = padW - self.padH = padH - self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad - - def __setstate__(self, d): - self.__dict__.update(d) - self.__dict__.setdefault('padT', 0) - self.__dict__.setdefault('padH', 0) - self.__dict__.setdefault('padW', 0) - self.__dict__.setdefault('ceil_mode', False) - self.__dict__.setdefault('count_include_pad', True) - - def updateOutput(self, input): - 
self._backend.VolumetricAveragePooling_updateOutput( - self._backend.library_state, - input, - self.output, - self.kT, self.kW, self.kH, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH, - self.ceil_mode, self.count_include_pad - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.VolumetricAveragePooling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.kT, self.kW, self.kH, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH, - self.ceil_mode, self.count_include_pad - ) - return self.gradInput - - def __repr__(self): - s = super(VolumetricAveragePooling, self).__repr__() - s += '({}x{}x{}, {}x{}x{}, {}x{}x{}, {}, {}'.format( - self.kT, self.kW, self.kH, self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH, - self.ceil_mode, self.count_include_pad) - s += ')' - return s diff --git a/torch/legacy/nn/VolumetricBatchNormalization.py b/torch/legacy/nn/VolumetricBatchNormalization.py deleted file mode 100644 index 61bab4c6efd703..00000000000000 --- a/torch/legacy/nn/VolumetricBatchNormalization.py +++ /dev/null @@ -1,7 +0,0 @@ -import torch -from .Module import Module -from .BatchNormalization import BatchNormalization - - -class VolumetricBatchNormalization(BatchNormalization): - nDim = 5 diff --git a/torch/legacy/nn/VolumetricConvolution.py b/torch/legacy/nn/VolumetricConvolution.py deleted file mode 100644 index 8e506a1d93865b..00000000000000 --- a/torch/legacy/nn/VolumetricConvolution.py +++ /dev/null @@ -1,192 +0,0 @@ -import math -import torch -from .Module import Module -from .utils import clear - - -class VolumetricConvolution(Module): - - def __init__(self, nInputPlane, nOutputPlane, kT, kW, kH, dT=1, dW=1, dH=1, padT=0, padW=None, padH=None): - super(VolumetricConvolution, self).__init__() - - self.nInputPlane = nInputPlane - self.nOutputPlane = nOutputPlane - self.kT = kT - self.kW = kW - self.kH = kH - self.dT = dT - self.dW = dW - self.dH = dH - self.padT = padT - self.padW = padW if padW is not None else self.padT - self.padH = padH if padH is not None else self.padW - - self.weight = torch.Tensor(nOutputPlane, nInputPlane, kT, kH, kW) - self.bias = torch.Tensor(nOutputPlane) - self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kT, kH, kW) - self.gradBias = torch.Tensor(nOutputPlane) - self.reset() - - self.finput = None - self.fgradInput = None - self._input = None - self._gradOutput = None - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. 
/ math.sqrt(self.kT * self.kW * self.kH * self.nInputPlane) - - self.weight.uniform_(-stdv, stdv) - self.bias.uniform_(-stdv, stdv) - - def _makeContiguous(self, input, gradOutput=None): - if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input - - if gradOutput is not None: - if not gradOutput.is_contiguous(): - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - gradOutput = self._gradOutput - return input, gradOutput - - return input - - # function to re-view the weight layout in a way that would make the MM ops happy - def _viewWeight(self): - self.weight = self.weight.view(self.nOutputPlane, self.nInputPlane * self.kT * self.kH * self.kW) - if self.gradWeight is not None and self.gradWeight.dim() > 0: - self.gradWeight = self.gradWeight.view(self.nOutputPlane, self.nInputPlane * self.kT * self.kH * self.kW) - - def _unviewWeight(self): - self.weight = self.weight.view(self.nOutputPlane, self.nInputPlane, self.kT, self.kH, self.kW) - if self.gradWeight is not None and self.gradWeight.dim() > 0: - self.gradWeight = self.gradWeight.view(self.nOutputPlane, self.nInputPlane, self.kT, self.kH, self.kW) - - def updateOutput(self, input): - if self.finput is None: - self.finput = input.new() - if self.fgradInput is None: - self.fgradInput = input.new() - if input.type() == 'torch.cuda.FloatTensor': - self._backend.VolumetricConvolution_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.finput, - self.fgradInput, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH - ) - else: - self._viewWeight() - input = self._makeContiguous(input) - self._backend.VolumetricConvolutionMM_updateOutput( - self._backend.library_state, - input, - self.output, - self.weight, - self.bias, - self.finput, - self.fgradInput, - self.kT, self.kW, self.kH, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH - ) - self._unviewWeight() - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - if input.type() == 'torch.cuda.FloatTensor': - self._backend.VolumetricConvolution_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.finput, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH - ) - else: - self._viewWeight() - input, gradOutput = self._makeContiguous(input, gradOutput) - self._backend.VolumetricConvolutionMM_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.weight, - self.finput, - self.fgradInput, - self.kT, self.kW, self.kH, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH - ) - self._unviewWeight() - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - if input.type() == 'torch.cuda.FloatTensor': - self._backend.VolumetricConvolution_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.finput, - self.fgradInput, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH, - scale - ) - else: - input, gradOutput = self._makeContiguous(input, gradOutput) - self._viewWeight() - self._backend.VolumetricConvolutionMM_accGradParameters( - self._backend.library_state, - input, - gradOutput, - self.gradWeight, - self.gradBias, - self.finput, - self.fgradInput, - self.kT, self.kW, self.kH, - 
self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH, - scale - ) - self._unviewWeight() - - def type(self, type, tensorCache=None): - clear(self, 'finput', 'fgradInput') - return super(VolumetricConvolution, self).type(type, tensorCache) - - def clearState(self, ): - clear(self, 'finput', 'fgradInput', '_input', '_gradOutput') - return super(VolumetricConvolution, self).clearState() - - def __repr__(self): - s = super(VolumetricConvolution, self).__repr__() - s += '({} -> {}, {}x{}x{}'.format(self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH) - if self.dT != 1 or self.dW != 1 or self.dH != 1 or \ - self.padT != 0 or self.padW != 0 or self.padH != 0: - s += ', {}, {}, {}'.format(self.dT, self.dW, self.dH) - - if self.padT != 0 or self.padW != 0 or self.padH != 0: - s += ', {}, {}, {}'.format(self.padT, self.padW, self.padH) - - s += ')' - return s diff --git a/torch/legacy/nn/VolumetricDropout.py b/torch/legacy/nn/VolumetricDropout.py deleted file mode 100644 index c1e68be29215e9..00000000000000 --- a/torch/legacy/nn/VolumetricDropout.py +++ /dev/null @@ -1,47 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class VolumetricDropout(Module): - - def __init__(self, p=0.5): - super(VolumetricDropout, self).__init__() - self.p = p - self.train = True - self.noise = torch.Tensor() - - def updateOutput(self, input): - self.output.resize_as_(input).copy_(input) - if self.train: - assert input.dim() == 5 - self.noise.resize_(input.size(0), input.size(1), 1, 1, 1) - - self.noise.bernoulli_(1 - self.p) - # We expand the random dropouts to the entire feature map because the - # features are likely correlated across the map and so the dropout - # should also be correlated. - self.output.mul_(self.noise.expand_as(input)) - else: - self.output.mul_(1 - self.p) - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.train: - self.gradInput.resize_as_(gradOutput).copy_(gradOutput) - self.gradInput.mul_(self.noise.expand_as(input)) # simply mask the gradients with the noise vector - else: - raise RuntimeError('backprop only defined while training') - - return self.gradInput - - def setp(self, p): - self.p = p - - def __repr__(self): - return super(VolumetricDropout, self).__repr__() + '({:.4f})'.format(self.p) - - def clearState(self): - clear(self, 'noise') - return super(VolumetricDropout, self).clearState() diff --git a/torch/legacy/nn/VolumetricFullConvolution.py b/torch/legacy/nn/VolumetricFullConvolution.py deleted file mode 100644 index 3236a7ede019bf..00000000000000 --- a/torch/legacy/nn/VolumetricFullConvolution.py +++ /dev/null @@ -1,213 +0,0 @@ -import math -import torch -from .Module import Module - - -class VolumetricFullConvolution(Module): - - def __init__(self, nInputPlane, nOutputPlane, - kT, kW, kH, # kernel size - dT=1, dW=1, dH=1, # stride - padT=0, padW=0, padH=0, # padding - adjT=0, adjW=0, adjH=0): # extra output adjustment - super(VolumetricFullConvolution, self).__init__() - - self.nInputPlane = nInputPlane - self.nOutputPlane = nOutputPlane - self.kW = kW - self.kH = kH - self.kT = kT - self.dW = dW - self.dH = dH - self.dT = dT - self.padW = padW - self.padH = padH - self.padT = padT - self.adjW = adjW - self.adjH = adjH - self.adjT = adjT - - if self.adjW > self.dW - 1 or self.adjH > self.dH - 1 or self.adjT > self.dT - 1: - raise RuntimeError('adjW, adjH and adjT must be smaller than self.dW - 1, ' - ' self.dH - 1 and self.dT - 1 respectively') - - self.weight = torch.Tensor(nInputPlane, 
nOutputPlane, kT, kH, kW) - self.gradWeight = torch.Tensor(nInputPlane, nOutputPlane, kT, kH, kW) - self.bias = torch.Tensor(self.nOutputPlane) - self.gradBias = torch.Tensor(self.nOutputPlane) - - self.ones = torch.Tensor() - self.finput = torch.Tensor() - self.fgradInput = torch.Tensor() - self._input = None - self._gradOutput = None - - self.reset() - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - nInputPlane = self.nInputPlane - kT = self.kT - kH = self.kH - kW = self.kW - stdv = 1. / math.sqrt(kW * kH * kT * nInputPlane) - - self.weight.uniform_(-stdv, stdv) - self.bias.uniform_(-stdv, stdv) - - def _makeContiguous(self, input, gradOutput=None): - if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input - - if gradOutput is not None: - if not gradOutput.is_contiguous(): - if self._gradOutput is None: - self._gradOutput = gradOutput.new() - self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) - gradOutput = self._gradOutput - return input, gradOutput - - return input - - def _calculateAdj(targetSize, ker, pad, stride): - return (targetSize + 2 * pad - ker) % stride - - def updateOutput(self, input): - inputTensor = input - adjT, adjW, adjH = self.adjT, self.adjW, self.adjH - - # The input can be a table where the second element indicates the target - # output size, in which case the adj factors are computed automatically - if isinstance(input, list): - inputTensor = input[0] - targetTensor = input[1] - tDims = targetTensor.dim() - tT = targetTensor.size(tDims - 3) - tH = targetTensor.size(tDims - 2) - tW = targetTensor.size(tDims - 1) - adjT = self._calculateAdj(tT, self.kT, self.padT, self.dT) - adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) - adjH = self._calculateAdj(tH, self.kH, self.padH, self.dH) - - inputTensor = self._makeContiguous(inputTensor) - self._backend.VolumetricFullConvolution_updateOutput( - self._backend.library_state, - inputTensor, - self.output, - self.weight, - self.bias, - self.finput, - self.fgradInput, - self.kT, self.kW, self.kH, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH, - adjT, adjW, adjH - ) - - return self.output - - def updateGradInput(self, input, gradOutput): - inputTensor = input - adjT, adjW, adjH = self.adjT, self.adjW, self.adjH - - # The input can be a table where the second element indicates the target - # output size, in which case the adj factors are computed automatically - if isinstance(input, list): - inputTensor = input[0] - targetTensor = input[1] - tDims = targetTensor.dim() - tT = targetTensor.size(tDims - 3) - tH = targetTensor.size(tDims - 2) - tW = targetTensor.size(tDims - 1) - adjT = self._calculateAdj(tT, self.kT, self.padT, self.dT) - adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) - adjH = self._calculateAdj(tH, self.kH, self.padH, self.dH) - # Momentarily extract the gradInput tensor - if isinstance(self.gradInput, list): - self.gradInput = self.gradInput[0] - - inputTensor, gradOutput = self._makeContiguous(inputTensor, gradOutput) - self._backend.VolumetricFullConvolution_updateGradInput( - self._backend.library_state, - inputTensor, - gradOutput, - self.gradInput, - self.weight, - self.finput, - self.fgradInput, - self.kT, self.kW, self.kH, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH, - adjT, adjW, adjH - ) - - if isinstance(input, list): - # Create a zero tensor to be expanded and used as gradInput[1]. 
- if self.zeroScalar is None: - self.zeroScalar = input[1].new(1).zero_() - self.ones.resize_(input[1].dim()).fill_(1) - zeroTensor = self.zeroScalar.view(self.ones.tolist()).expand_as(input[1]) - self.gradInput = [self.gradInput, zeroTensor] - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - inputTensor = input - adjT, adjW, adjH = self.adjT, self.adjW, self.adjH - - # The input can be a table where the second element indicates the target - # output size, in which case the adj factors are computed automatically - if isinstance(input, list): - inputTensor = input[0] - targetTensor = input[1] - tDims = targetTensor.dim() - tT = targetTensor.size(tDims - 3) - tH = targetTensor.size(tDims - 2) - tW = targetTensor.size(tDims - 1) - adjT = self._calculateAdj(tT, self.kT, self.padT, self.dT) - adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) - adjH = self._calculateAdj(tH, self.kH, self.padH, self.dH) - - inputTensor, gradOutput = self._makeContiguous(inputTensor, gradOutput) - self._backend.VolumetricFullConvolution_accGradParameters( - self._backend.library_state, - inputTensor, - gradOutput, - self.gradWeight, - self.gradBias, - self.finput, - self.fgradInput, - self.kT, self.kW, self.kH, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH, - adjT, adjW, adjH, - scale - ) - - def type(self, type, tensorCache=None): - self.finput = torch.Tensor() - self.fgradInput = torch.Tensor() - return super(VolumetricFullConvolution, self).type(type, tensorCache) - - def __repr__(self): - s = super(VolumetricFullConvolution, self).__repr__() - s += '({} -> {}, {}x{}x{}'.format(self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH) - if self.dT != 1 or self.dW != 1 or self.dH != 1 or \ - self.padT != 0 or self.padW != 0 or self.padH != 0 or \ - self.adjT != 0 or self.adjW != 0 or self.adjH != 0: - s += ', {}, {}, {}'.format(self.dT, self.dW, self.dH) - - if self.padT != 0 or self.padW != 0 or self.padH != 0 or \ - self.adjT != 0 or self.adjW != 0 or self.adjH != 0: - s += ', {}, {}, {}'.format(self.padT, self.padW, self.padH) - - if self.adjT != 0 or self.adjW != 0 or self.adjH != 0: - s += ', {}, {}, {}'.format(self.adjT, self.adjW, self.adjH) - - s += ')' - return s diff --git a/torch/legacy/nn/VolumetricMaxPooling.py b/torch/legacy/nn/VolumetricMaxPooling.py deleted file mode 100644 index 823ab058466414..00000000000000 --- a/torch/legacy/nn/VolumetricMaxPooling.py +++ /dev/null @@ -1,78 +0,0 @@ -import torch -from .Module import Module -from .utils import clear - - -class VolumetricMaxPooling(Module): - - def __init__(self, kT, kW, kH, dT=None, dW=None, dH=None, padT=0, padW=0, padH=0): - super(VolumetricMaxPooling, self).__init__() - - self.kT = kT - self.kH = kH - self.kW = kW - self.dT = dT or kT - self.dW = dW or kW - self.dH = dH or kH - - self.padT = padT - self.padW = padW - self.padH = padH - - self.ceil_mode = False - self.indices = torch.LongTensor() - - def ceil(self): - self.ceil_mode = True - return self - - def floor(self): - self.ceil_mode = False - return self - - def updateOutput(self, input): - dims = input.dim() - self.itime = input.size(dims - 3) - self.iheight = input.size(dims - 2) - self.iwidth = input.size(dims - 1) - - if self.indices is None: - self.indices = input.new() - self.indices = self.indices.long() - self._backend.VolumetricMaxPooling_updateOutput( - self._backend.library_state, - input, - self.output, - self.indices, - self.kT, self.kW, self.kH, - self.dT, self.dW, self.dH, - self.padT, self.padW, 
self.padH, - self.ceil_mode - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._backend.VolumetricMaxPooling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.indices, - self.kT, self.kW, self.kH, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH, - self.ceil_mode - ) - return self.gradInput - - def clearState(self): - clear(self, 'indices') - return super(VolumetricMaxPooling, self).clearState() - - def __repr__(self): - s = super(VolumetricMaxPooling, self).__repr__() - s += '({}x{}x{}, {}, {}, {}'.format(self.kT, self.kW, self.kH, self.dT, self.dW, self.dH) - if self.padT != 0 or self.padW != 0 or self.padH != 0: - s += ', {}, {}, {}'.format(self.padT, self.padW, self.padH) - s += ')' - return s diff --git a/torch/legacy/nn/VolumetricMaxUnpooling.py b/torch/legacy/nn/VolumetricMaxUnpooling.py deleted file mode 100644 index 4de3b52607f1fc..00000000000000 --- a/torch/legacy/nn/VolumetricMaxUnpooling.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch -from .Module import Module -from .VolumetricMaxPooling import VolumetricMaxPooling - - -class VolumetricMaxUnpooling(Module): - - def __init__(self, poolingModule): - super(VolumetricMaxUnpooling, self).__init__() - assert isinstance(poolingModule, VolumetricMaxPooling) - assert poolingModule.kT == poolingModule.dT - assert poolingModule.kH == poolingModule.dH - assert poolingModule.kW == poolingModule.dW - self.pooling = poolingModule - - def _setParams(self): - self.indices = self.pooling.indices - self.otime = self.pooling.itime - self.oheight = self.pooling.iheight - self.owidth = self.pooling.iwidth - self.dT = self.pooling.dT - self.dH = self.pooling.dH - self.dW = self.pooling.dW - self.padT = self.pooling.padT - self.padH = self.pooling.padH - self.padW = self.pooling.padW - - def updateOutput(self, input): - self._setParams() - self._backend.VolumetricMaxUnpooling_updateOutput( - self._backend.library_state, - input, - self.output, - self.indices, - self.otime, self.owidth, self.oheight, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH - ) - return self.output - - def updateGradInput(self, input, gradOutput): - self._setParams() - self._backend.VolumetricMaxUnpooling_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.indices, - self.otime, self.owidth, self.oheight, - self.dT, self.dW, self.dH, - self.padT, self.padW, self.padH - ) - return self.gradInput - - def __repr__(self): - return 'nn.VolumetricMaxUnpooling associated to ' + self.pooling.__repr__() diff --git a/torch/legacy/nn/VolumetricReplicationPadding.py b/torch/legacy/nn/VolumetricReplicationPadding.py deleted file mode 100644 index 16cc7a1c097d7c..00000000000000 --- a/torch/legacy/nn/VolumetricReplicationPadding.py +++ /dev/null @@ -1,55 +0,0 @@ -import torch -from .Module import Module - - -class VolumetricReplicationPadding(Module): - - def __init__(self, pleft, pright=None, ptop=None, pbottom=None, pfront=None, pback=None): - super(VolumetricReplicationPadding, self).__init__() - self.pleft = pleft - self.pright = pright or pleft - self.ptop = ptop or pleft - self.pbottom = pbottom or pleft - self.pfront = pfront or pleft - self.pback = pback or pleft - - def updateOutput(self, input): - assert input.dim() == 5 - self._backend.VolumetricReplicationPadding_updateOutput( - self._backend.library_state, - input, - self.output, - self.pleft, self.pright, - self.ptop, self.pbottom, - self.pfront, self.pback - ) - - return 
self.output - - def updateGradInput(self, input, gradOutput): - assert input.dim() == 5 and gradOutput.dim() == 5 - assert input.size(0) == gradOutput.size(0) - assert input.size(1) == gradOutput.size(1) - assert input.size(2) + self.pfront + self.pback == gradOutput.size(2) - assert input.size(3) + self.ptop + self.pbottom == gradOutput.size(3) - assert input.size(4) + self.pleft + self.pright == gradOutput.size(4) - - self._backend.VolumetricReplicationPadding_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput, - self.pleft, self.pright, - self.ptop, self.pbottom, - self.pfront, self.pback - ) - - return self.gradInput - - def __repr__(self): - s = super(VolumetricReplicationPadding, self).__repr__() - s += '({}, {}, {}, {}, {}, {})'.format(self.pleft, self.pright, - self.ptop, self.pbottom, - self.pfront, self.pback - ) - return s diff --git a/torch/legacy/nn/WeightedEuclidean.py b/torch/legacy/nn/WeightedEuclidean.py deleted file mode 100644 index d8cfaf37209f08..00000000000000 --- a/torch/legacy/nn/WeightedEuclidean.py +++ /dev/null @@ -1,260 +0,0 @@ -import math -import torch -from .Module import Module - - -class WeightedEuclidean(Module): - - def __init__(self, inputSize, outputSize): - super(WeightedEuclidean, self).__init__() - - self.weight = torch.Tensor(inputSize, outputSize) - self.gradWeight = torch.Tensor(inputSize, outputSize) - - # each template (output dim) has its own diagonal covariance matrix - self.diagCov = torch.Tensor(inputSize, outputSize) - self.gradDiagCov = torch.Tensor(inputSize, outputSize) - - self.reset() - self._diagCov = self.output.new() - - # TODO: confirm - self.fastBackward = False - - self._input = None - self._weight = None - self._expand = None - self._expand2 = None - self._expand3 = None - self._repeat = None - self._repeat2 = None - self._repeat3 = None - self._div = None - self._output = None - self._expand4 = None - self._gradOutput = None - self._sum = None - - def reset(self, stdv=None): - if stdv is not None: - stdv = stdv * math.sqrt(3) - else: - stdv = 1. 
/ math.sqrt(self.weight.size(1)) - - self.weight.uniform_(-stdv, stdv) - self.diagCov.fill_(1) - - def _view(self, res, src, *args): - if src.is_contiguous(): - res.set_(src.view(*args)) - else: - res.set_(src.contiguous().view(*args)) - - def updateOutput(self, input): - # lazy-initialize - if self._diagCov is None: - self._diagCov = self.output.new() - - if self._input is None: - self._input = input.new() - if self._weight is None: - self._weight = self.weight.new() - if self._expand is None: - self._expand = self.output.new() - if self._expand2 is None: - self._expand2 = self.output.new() - if self._expand3 is None: - self._expand3 = self.output.new() - if self._repeat is None: - self._repeat = self.output.new() - if self._repeat2 is None: - self._repeat2 = self.output.new() - if self._repeat3 is None: - self._repeat3 = self.output.new() - - inputSize, outputSize = self.weight.size(0), self.weight.size(1) - - # y_j = || c_j * (w_j - x) || - if input.dim() == 1: - self._view(self._input, input, inputSize, 1) - self._expand.expand_as(self._input, self.weight) - self._repeat.resize_as_(self._expand).copy_(self._expand) - self._repeat.add_(-1, self.weight) - self._repeat.mul_(self.diagCov) - torch.norm(self._repeat, 2, 0, True, out=self.output) - self.output.resize_(outputSize) - elif input.dim() == 2: - batchSize = input.size(0) - - self._view(self._input, input, batchSize, inputSize, 1) - self._expand = self._input.expand(batchSize, inputSize, outputSize) - # make the expanded tensor contiguous (requires lots of memory) - self._repeat.resize_as_(self._expand).copy_(self._expand) - - self._weight = self.weight.view(1, inputSize, outputSize) - self._expand2 = self._weight.expand_as(self._repeat) - - self._diagCov = self.diagCov.view(1, inputSize, outputSize) - self._expand3 = self._diagCov.expand_as(self._repeat) - if input.type() == 'torch.cuda.FloatTensor': - # TODO: this can be fixed with a custom allocator - # requires lots of memory, but minimizes cudaMallocs and loops - self._repeat2.resize_as_(self._expand2).copy_(self._expand2) - self._repeat.add_(-1, self._repeat2) - self._repeat3.resize_as_(self._expand3).copy_(self._expand3) - self._repeat.mul_(self._repeat3) - else: - self._repeat.add_(-1, self._expand2) - self._repeat.mul_(self._expand3) - - torch.norm(self._repeat, 2, 1, True, out=self.output) - self.output.resize_(batchSize, outputSize) - else: - raise RuntimeError("1D or 2D input expected") - - return self.output - - def updateGradInput(self, input, gradOutput): - if self.gradInput is None: - return - - if self._div is None: - self._div = input.new() - if self._output is None: - self._output = self.output.new() - if self._expand4 is None: - self._expand4 = input.new() - if self._gradOutput is None: - self._gradOutput = input.new() - - if not self.fastBackward: - self.updateOutput(input) - - inputSize, outputSize = self.weight.size(0), self.weight.size(1) - - """ - dy_j -2 * c_j * c_j * (w_j - x) c_j * c_j * (x - w_j) - ---- = -------------------------- = --------------------- - dx 2 || c_j * (w_j - x) || y_j - """ - - # to prevent div by zero (NaN) bugs - self._output.resize_as_(self.output).copy_(self.output).add_(1e-7) - self._view(self._gradOutput, gradOutput, gradOutput.size()) - torch.div(gradOutput, self._output, out=self._div) - if input.dim() == 1: - self._div.resize_(1, outputSize) - self._expand4 = self._div.expand_as(self.weight) - - if torch.type(input) == 'torch.cuda.FloatTensor': - self._repeat2.resize_as_(self._expand4).copy_(self._expand4) - 
self._repeat2.mul_(self._repeat) - else: - self._repeat2.mul_(self._repeat, self._expand4) - - self._repeat2.mul_(self.diagCov) - torch.sum(self._repeat2, 1, True, out=self.gradInput) - self.gradInput.resize_as_(input) - elif input.dim() == 2: - batchSize = input.size(0) - - self._div.resize_(batchSize, 1, outputSize) - self._expand4 = self._div.expand(batchSize, inputSize, outputSize) - - if input.type() == 'torch.cuda.FloatTensor': - self._repeat2.resize_as_(self._expand4).copy_(self._expand4) - self._repeat2.mul_(self._repeat) - self._repeat2.mul_(self._repeat3) - else: - torch.mul(self._repeat, self._expand4, out=self._repeat2) - self._repeat2.mul_(self._expand3) - - torch.sum(self._repeat2, 2, True, out=self.gradInput) - self.gradInput.resize_as_(input) - else: - raise RuntimeError("1D or 2D input expected") - - return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): - inputSize, outputSize = self.weight.size(0), self.weight.size(1) - - """ - dy_j 2 * c_j * c_j * (w_j - x) c_j * c_j * (w_j - x) - ---- = -------------------------- = --------------------- - dw_j 2 || c_j * (w_j - x) || y_j - - dy_j 2 * c_j * (w_j - x)^2 c_j * (w_j - x)^2 - ---- = ----------------------- = ----------------- - dc_j 2 || c_j * (w_j - x) || y_j - #""" - # assumes a preceding call to updateGradInput - if input.dim() == 1: - self.gradWeight.add_(-scale, self._repeat2) - - self._repeat.div_(self.diagCov) - self._repeat.mul_(self._repeat) - self._repeat.mul_(self.diagCov) - - if torch.type(input) == 'torch.cuda.FloatTensor': - self._repeat2.resize_as_(self._expand4).copy_(self._expand4) - self._repeat2.mul_(self._repeat) - else: - torch.mul(self._repeat, self._expand4, out=self._repeat2) - - self.gradDiagCov.add_(self._repeat2) - elif input.dim() == 2: - if self._sum is None: - self._sum = input.new() - torch.sum(self._repeat2, 0, True, out=self._sum) - self._sum.resize_(inputSize, outputSize) - self.gradWeight.add_(-scale, self._sum) - - if input.type() == 'torch.cuda.FloatTensor': - # requires lots of memory, but minimizes cudaMallocs and loops - self._repeat.div_(self._repeat3) - self._repeat.mul_(self._repeat) - self._repeat.mul_(self._repeat3) - self._repeat2.resize_as_(self._expand4).copy_(self._expand4) - self._repeat.mul_(self._repeat2) - else: - self._repeat.div_(self._expand3) - self._repeat.mul_(self._repeat) - self._repeat.mul_(self._expand3) - self._repeat.mul_(self._expand4) - - torch.sum(self._repeat, 0, True, out=self._sum) - self._sum.resize_(inputSize, outputSize) - self.gradDiagCov.add_(scale, self._sum) - else: - raise RuntimeError("1D or 2D input expected") - - def type(self, type=None, tensorCache=None): - if type: - # prevent premature memory allocations - self._input = None - self._output = None - self._gradOutput = None - self._weight = None - self._div = None - self._sum = None - self._expand = None - self._expand2 = None - self._expand3 = None - self._expand4 = None - self._repeat = None - self._repeat2 = None - self._repeat3 = None - return super(WeightedEuclidean, self).type(type, tensorCache) - - def parameters(self): - return [self.weight, self.diagCov], [self.gradWeight, self.gradDiagCov] - - def accUpdateGradParameters(self, input, gradOutput, lr): - gradWeight = self.gradWeight - gradDiagCov = self.gradDiagCov - self.gradWeight = self.weight - self.gradDiagCov = self.diagCov - self.accGradParameters(input, gradOutput, -lr) - self.gradWeight = gradWeight - self.gradDiagCov = gradDiagCov diff --git a/torch/legacy/nn/WeightedMSECriterion.py 
b/torch/legacy/nn/WeightedMSECriterion.py deleted file mode 100644 index 2f0da29077d508..00000000000000 --- a/torch/legacy/nn/WeightedMSECriterion.py +++ /dev/null @@ -1,55 +0,0 @@ -import torch -from torch.nn.functional import _Reduction -from .Criterion import Criterion - - -class WeightedMSECriterion(Criterion): - - def __init__(self, weight, sizeAverage=True): - super(WeightedMSECriterion, self).__init__() - self.weight = weight.clone() - self.buffer = None - self.output_tensor = None - self.sizeAverage = sizeAverage - - def updateOutput(self, input, target): - if self.buffer is None: - self.buffer = input.new() - self.buffer.resize_as_(input).copy_(target) - if input.dim() - 1 == self.weight.dim(): - for i in range(input.size(0)): - self.buffer[i].mul_(self.weight) - else: - self.buffer.mul_(self.weight) - - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.MSECriterion_updateOutput( - self._backend.library_state, - input, - self.buffer, - self.output_tensor, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - self.output = self.output_tensor[0].item() - return self.output - - def updateGradInput(self, input, target): - self.buffer.resize_as_(input).copy_(target) - if input.dim() - 1 == self.weight.dim(): - for i in range(input.size(0)): - self.buffer[i].mul_(self.weight) - else: - self.buffer.mul_(self.weight) - - implicit_gradOutput = torch.Tensor([1]).type(input.type()) - - self._backend.MSECriterion_updateGradInput( - self._backend.library_state, - input, - self.buffer, - implicit_gradOutput, - self.gradInput, - _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False), - ) - return self.gradInput diff --git a/torch/legacy/nn/__init__.py b/torch/legacy/nn/__init__.py deleted file mode 100644 index 7e2507ac0a53b4..00000000000000 --- a/torch/legacy/nn/__init__.py +++ /dev/null @@ -1,152 +0,0 @@ -from torch._thnn import type2backend - -from . 
import utils - -from .Module import Module -from .Criterion import Criterion -from .Container import Container - -from .Sequential import Sequential -from .Parallel import Parallel -from .Concat import Concat -from .DepthConcat import DepthConcat -from .ConcatTable import ConcatTable -from .JoinTable import JoinTable -from .ParallelTable import ParallelTable - -from .Abs import Abs -from .AbsCriterion import AbsCriterion -from .Add import Add -from .AddConstant import AddConstant -from .BCECriterion import BCECriterion -from .BatchNormalization import BatchNormalization -from .Bilinear import Bilinear -from .CAddTable import CAddTable -from .CDivTable import CDivTable -from .CMul import CMul -from .CMulTable import CMulTable -from .CSubTable import CSubTable -from .ClassNLLCriterion import ClassNLLCriterion -from .Contiguous import Contiguous -from .Copy import Copy -from .Cosine import Cosine -from .CosineDistance import CosineDistance -from .CosineEmbeddingCriterion import CosineEmbeddingCriterion -from .CriterionTable import CriterionTable -from .CrossEntropyCriterion import CrossEntropyCriterion -from .DistKLDivCriterion import DistKLDivCriterion -from .DotProduct import DotProduct -from .Dropout import Dropout -from .ELU import ELU -from .Euclidean import Euclidean -from .Exp import Exp -from .FlattenTable import FlattenTable -from .GradientReversal import GradientReversal -from .HardShrink import HardShrink -from .HardTanh import HardTanh -from .HingeEmbeddingCriterion import HingeEmbeddingCriterion -from .Identity import Identity -from .Index import Index -from .L1Cost import L1Cost -from .L1HingeEmbeddingCriterion import L1HingeEmbeddingCriterion -from .L1Penalty import L1Penalty -from .LeakyReLU import LeakyReLU -from .Linear import Linear -from .Log import Log -from .LogSigmoid import LogSigmoid -from .LogSoftMax import LogSoftMax -from .LookupTable import LookupTable -from .MM import MM -from .MSECriterion import MSECriterion -from .MarginCriterion import MarginCriterion -from .MarginRankingCriterion import MarginRankingCriterion -from .MaskedSelect import MaskedSelect -from .Max import Max -from .Min import Min -from .MixtureTable import MixtureTable -from .Mul import Mul -from .MulConstant import MulConstant -from .MultiCriterion import MultiCriterion -from .MV import MV -from .MultiLabelMarginCriterion import MultiLabelMarginCriterion -from .MultiLabelSoftMarginCriterion import MultiLabelSoftMarginCriterion -from .MultiMarginCriterion import MultiMarginCriterion -from .Narrow import Narrow -from .NarrowTable import NarrowTable -from .Normalize import Normalize -from .PReLU import PReLU -from .Padding import Padding -from .PairwiseDistance import PairwiseDistance -from .ParallelCriterion import ParallelCriterion -from .PartialLinear import PartialLinear -from .Power import Power -from .RReLU import RReLU # TODO implement -from .ReLU6 import ReLU6 -from .Replicate import Replicate -from .Reshape import Reshape -from .Select import Select -from .SelectTable import SelectTable -from .Sigmoid import Sigmoid -from .SmoothL1Criterion import SmoothL1Criterion -from .SoftMarginCriterion import SoftMarginCriterion -from .SoftMax import SoftMax -from .SoftMin import SoftMin -from .SoftPlus import SoftPlus -from .SoftShrink import SoftShrink -from .SoftSign import SoftSign -from .SpatialAdaptiveMaxPooling import SpatialAdaptiveMaxPooling -from .SpatialAveragePooling import SpatialAveragePooling -from .SpatialBatchNormalization import SpatialBatchNormalization -from 
.SpatialClassNLLCriterion import SpatialClassNLLCriterion -from .SpatialContrastiveNormalization import SpatialContrastiveNormalization -from .SpatialConvolution import SpatialConvolution -from .SpatialConvolutionLocal import SpatialConvolutionLocal -from .SpatialConvolutionMap import SpatialConvolutionMap -from .SpatialCrossMapLRN import SpatialCrossMapLRN -from .SpatialDilatedConvolution import SpatialDilatedConvolution -from .SpatialDivisiveNormalization import SpatialDivisiveNormalization -from .SpatialDropout import SpatialDropout -from .SpatialFractionalMaxPooling import SpatialFractionalMaxPooling -from .SpatialFullConvolution import SpatialFullConvolution -from .SpatialFullConvolutionMap import SpatialFullConvolutionMap -from .SpatialLPPooling import SpatialLPPooling -from .SpatialMaxPooling import SpatialMaxPooling -from .SpatialMaxUnpooling import SpatialMaxUnpooling -from .SpatialReflectionPadding import SpatialReflectionPadding -from .SpatialReplicationPadding import SpatialReplicationPadding -from .SpatialSoftMax import SpatialSoftMax -from .SpatialSubSampling import SpatialSubSampling -from .SpatialSubtractiveNormalization import SpatialSubtractiveNormalization -from .SpatialUpSamplingNearest import SpatialUpSamplingNearest -from .SpatialZeroPadding import SpatialZeroPadding -from .SplitTable import SplitTable -from .Sqrt import Sqrt -from .Square import Square -from .Squeeze import Squeeze -from .Sum import Sum -from .Tanh import Tanh -from .TanhShrink import TanhShrink -from .Threshold import Threshold -from .Transpose import Transpose -from .Unsqueeze import Unsqueeze -from .View import View -from .WeightedEuclidean import WeightedEuclidean -from .WeightedMSECriterion import WeightedMSECriterion - -from .TemporalConvolution import TemporalConvolution -from .TemporalMaxPooling import TemporalMaxPooling -from .TemporalSubSampling import TemporalSubSampling - -from .VolumetricAveragePooling import VolumetricAveragePooling -from .VolumetricBatchNormalization import VolumetricBatchNormalization -from .VolumetricConvolution import VolumetricConvolution -from .VolumetricDropout import VolumetricDropout -from .VolumetricFullConvolution import VolumetricFullConvolution -from .VolumetricMaxPooling import VolumetricMaxPooling -from .VolumetricMaxUnpooling import VolumetricMaxUnpooling -from .VolumetricReplicationPadding import VolumetricReplicationPadding - -from .Clamp import Clamp -from .ClassSimplexCriterion import ClassSimplexCriterion -from .ReLU import ReLU -from .Mean import Mean diff --git a/torch/legacy/nn/convert.vim b/torch/legacy/nn/convert.vim deleted file mode 100644 index c83d953eaa6158..00000000000000 --- a/torch/legacy/nn/convert.vim +++ /dev/null @@ -1,52 +0,0 @@ -"Slightly adjust indentation -%s/^ / /g - -" # -> len -%s/#\(\S*\) /len(\1)/g - -" for loops -%s/for\( \)\{-\}\(\S*\)\( \)\{-\}=\( \)\{-\}\(\S*\),\( \)\{-\}\(\S*\)\( \)\{-\}do/for \2 in range(\5, \7+1)/g - -" Change comments -%s/--\[\[/"""/g -%s/]]/"""/g -%s/--/#/g - -" Add spacing between commas -%s/\(\S\),\(\S\)/\1, \2/g - -%s/local //g -%s/ then/:/g -%s/ do/:/g -%s/end//g -%s/elseif/elif/g -%s/else/else:/g -%s/true/True/g -%s/false/False/g -%s/\~=/!=/g -%s/math\.min/min/g -%s/math\.max/max/g -%s/math\.abs/abs/g - - -%s/__init/__init__/g - -" Rewrite function declarations -%s/function \w*:\(\w*\)/ def \1/g -%s/def \(.*\)$/def \1:/g - -" class declaration -%s/\(\w*\), parent = torch\.class.*$/import torch\rfrom torch.legacy import nn\r\rclass \1(nn.Module):/g - -%s/input\.THNN/self._backend/g 
-%s/\(self\.backend\w*$\)/\1\r self._backend.library_state,/g -%s/def \(\w*\)(/def \1(self, /g - -%s/__init__(self)/__init__()/g - -%s/:\(\S\)/.\1/g - -%s/\.cdata()//g -%s/THNN\.optionalTensor(\(.*\))/\1/g - -%s/parent\./super(##, self)./g diff --git a/torch/legacy/nn/utils.py b/torch/legacy/nn/utils.py deleted file mode 100644 index 1dc300891b4666..00000000000000 --- a/torch/legacy/nn/utils.py +++ /dev/null @@ -1,153 +0,0 @@ -import torch - -# tensorCache maintains a list of all tensors and storages that have been -# converted (recursively) by calls to recursiveType() and type(). -# It caches conversions in order to preserve sharing semantics -# i.e. if two tensors share a common storage, then type conversion -# should preserve that. -# -# You can preserve sharing semantics across multiple networks by -# passing tensorCache between the calls to type, e.g. -# -# > tensorCache = {} -# > net1:type('torch.cuda.FloatTensor', tensorCache) -# > net2:type('torch.cuda.FloatTensor', tensorCache) -# > nn.utils.recursiveType(anotherTensor, 'torch.cuda.FloatTensor', tensorCache) - - -def recursiveType(param, type, tensorCache={}): - from .Criterion import Criterion - from .Module import Module - if isinstance(param, list): - for i, p in enumerate(param): - param[i] = recursiveType(p, type, tensorCache) - elif isinstance(param, Module) or isinstance(param, Criterion): - param.type(type, tensorCache) - elif isinstance(param, torch.Tensor): - if param.type() != type: - key = param._cdata - if key in tensorCache: - newparam = tensorCache[key] - else: - newparam = torch.Tensor().type(type) - storageType = type.replace('Tensor', 'Storage') - param_storage = param.storage() - if param_storage: - storage_key = param_storage._cdata - if storage_key not in tensorCache: - tensorCache[storage_key] = torch._import_dotted_name( - storageType)(param_storage.size()).copy_(param_storage) - newparam.set_( - tensorCache[storage_key], - param.storage_offset(), - param.size(), - param.stride() - ) - tensorCache[key] = newparam - param = newparam - return param - - -def recursiveResizeAs(t1, t2): - if isinstance(t2, list): - t1 = t1 if isinstance(t1, list) else [t1] - if len(t1) < len(t2): - t1 += [None] * (len(t2) - len(t1)) - for i, _ in enumerate(t2): - t1[i], t2[i] = recursiveResizeAs(t1[i], t2[i]) - t1 = t1[:len(t2)] - elif isinstance(t2, torch.Tensor): - t1 = t1 if isinstance(t1, torch.Tensor) else t2.new() - t1.resize_as_(t2) - else: - raise RuntimeError("Expecting nested tensors or tables. Got " + - type(t1).__name__ + " and " + type(t2).__name__ + "instead") - return t1, t2 - - -def recursiveFill(t2, val): - if isinstance(t2, list): - t2 = [recursiveFill(x, val) for x in t2] - elif isinstance(t2, torch.Tensor): - t2.fill_(val) - else: - raise RuntimeError("expecting tensor or table thereof. Got " + - type(t2).__name__ + " instead") - return t2 - - -def recursiveAdd(t1, val=1, t2=None): - if t2 is None: - t2 = val - val = 1 - if isinstance(t2, list): - t1 = t1 if isinstance(t1, list) else [t1] - for i, _ in enumerate(t2): - t1[i], t2[i] = recursiveAdd(t1[i], val, t2[i]) - elif isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor): - t1.add_(val, t2) - else: - raise RuntimeError("expecting nested tensors or tables. 
Got " + - type(t1).__name__ + " and " + type(t2).__name__ + " instead") - return t1, t2 - - -def recursiveCopy(t1, t2): - if isinstance(t2, list): - t1 = t1 if isinstance(t1, list) else [t1] - for i, _ in enumerate(t2): - t1[i], t2[i] = recursiveCopy(t1[i], t2[i]) - elif isinstance(t2, torch.Tensor): - t1 = t1 if isinstance(t1, torch.Tensor) else t2.new() - t1.resize_as_(t2).copy_(t2) - else: - raise RuntimeError("expecting nested tensors or tables. Got " + - type(t1).__name__ + " and " + type(t2).__name__ + " instead") - return t1, t2 - - -def addSingletondimension(*args): - view = None - if len(args) < 3: - t, dim = args - return t.unsqueeze(dim) - else: - view, t, dim = args - assert isinstance(view, torch.Tensor) - view.set_(t) - return view.unsqueeze_(dim) - - -def contiguousView(output, input, *args): - if output is None: - output = input.new() - if input.is_contiguous(): - output.set_(input.view(*args)) - else: - output.resize_as_(input) - output.copy_(input) - output.set_(output.view(*args)) - return output - -# go over specified fields and clear them. accepts -# nn.clearState(self, ['_buffer', '_buffer2']) and -# nn.clearState(self, '_buffer', '_buffer2') - - -def clear(self, *args): - if len(args) == 1 and isinstance(args[0], list): - args = args[0] - - def _clear(f): - if not hasattr(self, f): - return - attr = getattr(self, f) - if isinstance(attr, torch.Tensor): - attr.set_() - elif isinstance(attr, list): - del attr[:] - else: - setattr(self, f, None) - for key in args: - _clear(key) - return self diff --git a/torch/legacy/optim/__init__.py b/torch/legacy/optim/__init__.py deleted file mode 100644 index 7e07b4564b1715..00000000000000 --- a/torch/legacy/optim/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from .adadelta import adadelta -from .adagrad import adagrad -from .adam import adam -from .adamax import adamax -from .asgd import asgd -from .cg import cg -from .nag import nag -from .rmsprop import rmsprop -from .rprop import rprop -from .sgd import sgd -from .lbfgs import lbfgs diff --git a/torch/legacy/optim/adadelta.py b/torch/legacy/optim/adadelta.py deleted file mode 100644 index 86f264bf23a3a5..00000000000000 --- a/torch/legacy/optim/adadelta.py +++ /dev/null @@ -1,55 +0,0 @@ - -def adadelta(opfunc, x, config, state=None): - """ADADELTA implementation http://arxiv.org/abs/1212.5701 - - ARGUMENTS: - - `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX - - `x` : the initial point - - `config` : a table of hyper-parameters - - `config['rho']` : interpolation parameter - - `config['eps']` : for numerical stability - - `config['weightDecay']` : weight decay - - `state` : a table describing the state of the optimizer; after each - call the state is modified - - `state['paramVariance']` : vector of temporal variances of parameters - - `state['accDelta']` : vector of accummulated delta of gradients - RETURNS: - - `x` : the new x vector - - `f(x)` : the value of optimized function, evaluated before the update - """ - # (0) get/update state - if config is None and state is None: - raise ValueError("adadelta requires a dictionary to retain state between iterations") - state = state if state is not None else config - rho = config.get('rho', 0.9) - eps = config.get('eps', 1e-6) - wd = config.get('weightDecay', 0) - state['evalCounter'] = state.get('evalCounter', 0) - - # (1) evaluate f(x) and df/dx - fx, dfdx = opfunc(x) - - # (2) weight decay - if wd != 0: - dfdx.add_(wd, x) - - # (3) parameter update - if 'paramVariance' not 
in state: - state['paramVariance'] = x.new().resize_as_(dfdx).zero_() - state['paramStd'] = x.new().resize_as_(dfdx).zero_() - state['delta'] = x.new().resize_as_(dfdx).zero_() - state['accDelta'] = x.new().resize_as_(dfdx).zero_() - - state['paramVariance'].mul_(rho).addcmul_(1 - rho, dfdx, dfdx) - state['paramStd'].resize_as_(state['paramVariance']).copy_(state['paramVariance']).add_(eps).sqrt_() - state['delta'].resize_as_(state['paramVariance']).copy_( - state['accDelta']).add_(eps).sqrt_().div_(state['paramStd']).mul_(dfdx) - x.add_(-1, state['delta']) - state['accDelta'].mul_(rho).addcmul_(1 - rho, state['delta'], state['delta']) - - # (4) update evaluation counter - state['evalCounter'] += 1 - - # return x*, f(x) before optimization - return x, fx diff --git a/torch/legacy/optim/adagrad.py b/torch/legacy/optim/adagrad.py deleted file mode 100644 index 42f2b9111bf48f..00000000000000 --- a/torch/legacy/optim/adagrad.py +++ /dev/null @@ -1,51 +0,0 @@ - -def adagrad(opfunc, x, config, state=None): - """ADAGRAD implementation - - ARGS: - - `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX - - `x` : the initial point - - `state` : a table describing the state of the optimizer; after each - call the state is modified - - `state['learningRate']` : learning rate - - `state['paramVariance']` : vector of temporal variances of parameters - - `state['weightDecay']` : scalar that controls weight decay - RETURN: - - `x` : the new x vector - - `f(x)` : the value of optimized function, evaluated before the update - - """ - # (0) get/update state - if config is None and state is None: - raise ValueError("adagrad requires a dictionary to retain state between iterations") - state = state if state is not None else config - lr = config.get('learningRate', 1e-3) - lrd = config.get('learningRateDecay', 0) - wd = config.get('weightDecay', 0) - state['evalCounter'] = state.get('evalCounter', 0) - - # (1) evaluate f(x) and df/dx - fx, dfdx = opfunc(x) - - # (2) weight decay with a single parameter - if wd != 0: - dfdx.add_(wd, x) - - # (3) learning rate decay (annealing) - clr = lr / (1 + state['evalCounter'] * lrd) - - # (4) parameter update with single or individual learning rates - if 'paramVariance' not in state: - state['paramVariance'] = x.new().resize_as_(dfdx).zero_() - state['paramStd'] = x.new().resize_as_(dfdx) - - state['paramVariance'].addcmul_(1, dfdx, dfdx) - state['paramStd'].resize_as_(state['paramVariance']).copy_(state['paramVariance']).sqrt_() - x.addcdiv_(-clr, dfdx, state['paramStd'].add_(1e-10)) - - # (5) update evaluation counter - state['evalCounter'] += 1 - - # return x*, f(x) before optimization - return x, fx diff --git a/torch/legacy/optim/adam.py b/torch/legacy/optim/adam.py deleted file mode 100644 index f0d6b856b1eb1b..00000000000000 --- a/torch/legacy/optim/adam.py +++ /dev/null @@ -1,68 +0,0 @@ -import math - - -def adam(opfunc, x, config, state=None): - """ An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf - - ARGS: - - - 'opfunc' : a function that takes a single input (X), the point - of a evaluation, and returns f(X) and df/dX - - 'x' : the initial point - - 'config` : a table with configuration parameters for the optimizer - - 'config.learningRate' : learning rate - - 'config.beta1' : first moment coefficient - - 'config.beta2' : second moment coefficient - - 'config.epsilon' : for numerical stability - - 'config.weightDecay' : weight decay - - 'state' : a table describing the state of the optimizer; 
after each - call the state is modified - - RETURN: - - `x` : the new x vector - - `f(x)` : the value of optimized function, evaluated before the update - - """ - # (0) get/update state - if config is None and state is None: - raise ValueError("adam requires a dictionary to retain state between iterations") - state = state if state is not None else config - lr = config.get('learningRate', 0.001) - beta1 = config.get('beta1', 0.9) - beta2 = config.get('beta2', 0.999) - epsilon = config.get('epsilon', 1e-8) - wd = config.get('weightDecay', 0) - - # (1) evaluate f(x) and df/dx - fx, dfdx = opfunc(x) - - # (2) weight decay - if wd != 0: - dfdx.add_(wd, x) - - # Initialization - if 't' not in state: - state['t'] = 0 - # Exponential moving average of gradient values - state['m'] = x.new().resize_as_(dfdx).zero_() - # Exponential moving average of squared gradient values - state['v'] = x.new().resize_as_(dfdx).zero_() - # A tmp tensor to hold the sqrt(v) + epsilon - state['denom'] = x.new().resize_as_(dfdx).zero_() - - state['t'] += 1 - - # Decay the first and second moment running average coefficient - state['m'].mul_(beta1).add_(1 - beta1, dfdx) - state['v'].mul_(beta2).addcmul_(1 - beta2, dfdx, dfdx) - - state['denom'].copy_(state['v']).sqrt_().add_(epsilon) - - biasCorrection1 = 1 - beta1 ** state['t'] - biasCorrection2 = 1 - beta2 ** state['t'] - stepSize = lr * math.sqrt(biasCorrection2) / biasCorrection1 - # (3) update x - x.addcdiv_(-stepSize, state['m'], state['denom']) - - # return x*, f(x) before optimization - return x, fx diff --git a/torch/legacy/optim/adamax.py b/torch/legacy/optim/adamax.py deleted file mode 100644 index 916991b77801a5..00000000000000 --- a/torch/legacy/optim/adamax.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch - - -def adamax(opfunc, x, config, state=None): - """ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf - - ARGS: - - - 'opfunc' : a function that takes a single input (X), the point - of a evaluation, and returns f(X) and df/dX - - 'x' : the initial point - - 'config` : a table with configuration parameters for the optimizer - - 'config.learningRate' : learning rate - - 'config.beta1' : first moment coefficient - - 'config.beta2' : second moment coefficient - - 'config.epsilon' : for numerical stability - - 'config.weightDecay' : weight decay - - 'state' : a table describing the state of the optimizer; - after each call the state is modified. - - RETURN: - - `x` : the new x vector - - `f(x)` : the value of optimized function, evaluated before the update - - """ - # (0) get/update state - if config is None and state is None: - raise ValueError("adamax requires a dictionary to retain state between iterations") - state = state if state is not None else config - lr = config.get('learningRate', 0.002) - beta1 = config.get('beta1', 0.9) - beta2 = config.get('beta2', 0.999) - epsilon = config.get('epsilon', 1e-38) - wd = config.get('weightDecay', 0) - - # (1) evaluate f(x) and df/dx - fx, dfdx = opfunc(x) - - # (2) weight decay - if wd != 0: - dfdx.add_(wd, x) - - # Initialization - if 't' not in state: - state['t'] = 0 - # Exponential moving average of gradient values - state['m'] = x.new().resize_as_(dfdx).zero_() - # Exponential moving average of the infinity norm - state['u'] = x.new().resize_as_(dfdx).zero_() - # A tmp tensor to hold the input to max() - state['max'] = x.new(*((2,) + dfdx.size())).zero_() - - state['t'] += 1 - - # Update biased first moment estimate. 
- state['m'].mul_(beta1).add_(1 - beta1, dfdx) - # Update the exponentially weighted infinity norm. - state['max'][0].copy_(state['u']).mul_(beta2) - state['max'][1].copy_(dfdx).abs_().add_(epsilon) - torch.max(state['max'], 0, keepdim=False, out=(state['u'], state['u'].new().long())) - - biasCorrection1 = 1 - beta1 ** state['t'] - stepSize = lr / biasCorrection1 - # (2) update x - x.addcdiv_(-stepSize, state['m'], state['u']) - - # return x*, f(x) before optimization - return x, fx diff --git a/torch/legacy/optim/asgd.py b/torch/legacy/optim/asgd.py deleted file mode 100644 index edaa62d5f2ab18..00000000000000 --- a/torch/legacy/optim/asgd.py +++ /dev/null @@ -1,75 +0,0 @@ -import math - - -def asgd(opfunc, x, config, state=None): - """ An implementation of ASGD - - ASGD: - - x := (1 - lambda eta_t) x - eta_t df/dx(z,x) - a := a + mu_t [ x - a ] - - eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75 - mu_t = 1/max(1,t-t0) - - implements ASGD algorithm as in L.Bottou's sgd-2.0 - - ARGS: - - - `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX - - `x` : the initial point - - `state` : a table describing the state of the optimizer; after each - call the state is modified - - `state['eta0']` : learning rate - - `state['lambda']` : decay term - - `state['alpha']` : power for eta update - - `state['t0']` : point at which to start averaging - - RETURN: - - `x` : the new x vector - - `f(x)` : the function, evaluated before the update - - `ax` : the averaged x vector - - (Clement Farabet, 2012) - """ - # (0) get/update state - if config is None and state is None: - raise ValueError("asgd requires a dictionary to retain state between iterations") - state = state if state is not None else config - config['eta0'] = config.get('eta0', 1e-4) - config['lambda'] = config.get('lambda', 1e-4) - config['alpha'] = config.get('alpha', 0.75) - config['t0'] = config.get('t0', 1e6) - - # (hidden state) - state['eta_t'] = state.get('eta_t', config['eta0']) - state['mu_t'] = state.get('mu_t', 1) - state['t'] = state.get('t', 0) - - # (1) evaluate f(x) and df/dx - fx, dfdx = opfunc(x) - - # (2) decay term - x.mul_(1 - config['lambda'] * state['eta_t']) - - # (3) update x - x.add_(-state['eta_t'], dfdx) - - # (4) averaging - state['ax'] = state.get('ax', x.new().resize_as_(x).zero_()) - state['tmp'] = state.get('tmp', state['ax'].new().resize_as_(state['ax'])) - if state['mu_t'] != 1: - state['tmp'].copy_(x) - state['tmp'].add_(-1, state['ax']).mul_(state['mu_t']) - state['ax'].add_(state['tmp']) - else: - state['ax'].copy_(x) - - # (5) update eta_t and mu_t - state['t'] += 1 - state['eta_t'] = config['eta0'] / math.pow((1 + config['lambda'] * config['eta0'] * state['t']), config['alpha']) - state['mu_t'] = 1 / max(1, state['t'] - config['t0']) - - # return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...) - return x, fx, state['ax'] diff --git a/torch/legacy/optim/cg.py b/torch/legacy/optim/cg.py deleted file mode 100644 index 7880489edd6f8d..00000000000000 --- a/torch/legacy/optim/cg.py +++ /dev/null @@ -1,217 +0,0 @@ -import math - -INFINITY = float('inf') -NAN = float('nan') - - -def sqrt_nothrow(x): - return math.sqrt(x) if x >= 0 else NAN - - -def cg(opfunc, x, config, state=None): - """ - - This cg implementation is a rewrite of minimize.m written by Carl - E. Rasmussen. It is supposed to produce exactly same results (give - or take numerical accuracy due to some changed order of - operations). You can compare the result on rosenbrock with minimize.m. 
- http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html - - [x fx c] = minimize([0 0]', 'rosenbrock', -25) - - Note that we limit the number of function evaluations only, it seems much - more important in practical use. - - ARGS: - - - `opfunc` : a function that takes a single input, the point of evaluation. - - `x` : the initial point - - `state` : a table of parameters and temporary allocations. - - `state['maxEval']` : max number of function evaluations - - `state['maxIter']` : max number of iterations - - `state['df0']` : if you pass torch.Tensor they will be used for temp storage - - `state['df1']` : if you pass torch.Tensor they will be used for temp storage - - `state['df2']` : if you pass torch.Tensor they will be used for temp storage - - `state['df3']` : if you pass torch.Tensor they will be used for temp storage - - `state['s']` : if you pass torch.Tensor they will be used for temp storage - - `state['x0']` : if you pass torch.Tensor they will be used for temp storage - - RETURN: - - `x*` : the new x vector, at the optimal point - - `f` : a table of all function values where - `f[1]` is the value of the function before any optimization and - `f[#f]` is the final fully optimized value, at x* - - (Koray Kavukcuoglu, 2012) - """ - # parameters - if config is None and state is None: - raise ValueError("cg requires a dictionary to retain state between iterations") - state = state if state is not None else config - rho = config.get('rho', 0.01) - sig = config.get('sig', 0.5) - _int = config.get('int', 0.1) - ext = config.get('ext', 3.0) - maxIter = config.get('maxIter', 20) - ratio = config.get('ratio', 100) - maxEval = config.get('maxEval', maxIter * 1.25) - red = 1 - - i = 0 - ls_failed = 0 - fx = [] - - # we need three points for the interpolation/extrapolation stuff - z1, z2, z3 = 0, 0, 0 - d1, d2, d3 = 0, 0, 0 - f1, f2, f3 = 0, 0, 0 - - df1 = state.get('df1', x.new()) - df2 = state.get('df2', x.new()) - df3 = state.get('df3', x.new()) - - df1.resize_as_(x) - df2.resize_as_(x) - df3.resize_as_(x) - - # search direction - s = state.get('s', x.new()) - s.resize_as_(x) - - # we need a temp storage for X - x0 = state.get('x0', x.new()) - f0 = 0 - df0 = state.get('df0', x.new()) - x0.resize_as_(x) - df0.resize_as_(x) - - # evaluate at initial point - f1, tdf = opfunc(x) - fx.append(f1) - df1.copy_(tdf) - i = i + 1 - - # initial search direction - s.copy_(df1).mul_(-1) - - d1 = -s.dot(s) # slope - z1 = red / (1 - d1) # initial step - - while i < abs(maxEval): - x0.copy_(x) - f0 = f1 - df0.copy_(df1) - - x.add_(z1, s) - f2, tdf = opfunc(x) - df2.copy_(tdf) - i = i + 1 - d2 = df2.dot(s) - f3, d3, z3 = f1, d1, -z1 # init point 3 equal to point 1 - m = min(maxIter, maxEval - i) - success = 0 - limit = -1 - - while True: - while (f2 > f1 + z1 * rho * d1 or d2 > -sig * d1) and m > 0: - limit = z1 - if f2 > f1: - z2 = z3 - (0.5 * d3 * z3 * z3) / (d3 * z3 + f2 - f3) - else: - A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3) - B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2) - z2 = (sqrt_nothrow(B * B - A * d2 * z3 * z3) - B) / A - - if z2 != z2 or z2 == INFINITY or z2 == -INFINITY: - z2 = z3 / 2 - - z2 = max(min(z2, _int * z3), (1 - _int) * z3) - z1 = z1 + z2 - x.add_(z2, s) - f2, tdf = opfunc(x) - df2.copy_(tdf) - i = i + 1 - m = m - 1 - d2 = df2.dot(s) - z3 = z3 - z2 - - if f2 > f1 + z1 * rho * d1 or d2 > -sig * d1: - break - elif d2 > sig * d1: - success = 1 - break - elif m == 0: - break - - A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3) - B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2) - _denom = (B + sqrt_nothrow(B 
* B - A * d2 * z3 * z3)) - z2 = -d2 * z3 * z3 / _denom if _denom != 0 else NAN - - if z2 != z2 or z2 == INFINITY or z2 == -INFINITY or z2 < 0: - if limit < -0.5: - z2 = z1 * (ext - 1) - else: - z2 = (limit - z1) / 2 - elif (limit > -0.5) and (z2 + z1) > limit: - z2 = (limit - z1) / 2 - elif limit < -0.5 and (z2 + z1) > z1 * ext: - z2 = z1 * (ext - 1) - elif z2 < -z3 * _int: - z2 = -z3 * _int - elif limit > -0.5 and z2 < (limit - z1) * (1 - _int): - z2 = (limit - z1) * (1 - _int) - - f3 = f2 - d3 = d2 - z3 = -z2 - z1 = z1 + z2 - x.add_(z2, s) - - f2, tdf = opfunc(x) - df2.copy_(tdf) - i = i + 1 - m = m - 1 - d2 = df2.dot(s) - - if success == 1: - f1 = f2 - fx.append(f1) - ss = (df2.dot(df2) - df2.dot(df1)) / df1.dot(df1) - s.mul_(ss) - s.add_(-1, df2) - tmp = df1.clone() - df1.copy_(df2) - df2.copy_(tmp) - d2 = df1.dot(s) - if d2 > 0: - s.copy_(df1) - s.mul_(-1) - d2 = -s.dot(s) - - z1 = z1 * min(ratio, d1 / (d2 - 1e-320)) - d1 = d2 - ls_failed = 0 - else: - x.copy_(x0) - f1 = f0 - df1.copy_(df0) - if ls_failed or i > maxEval: - break - - tmp = df1.clone() - df1.copy_(df2) - df2.copy_(tmp) - s.copy_(df1) - s.mul_(-1) - d1 = -s.dot(s) - z1 = 1 / (1 - d1) - ls_failed = 1 - - state['df0'] = df0 - state['df1'] = df1 - state['df2'] = df2 - state['df3'] = df3 - state['x0'] = x0 - state['s'] = s - return x, fx, i diff --git a/torch/legacy/optim/lbfgs.py b/torch/legacy/optim/lbfgs.py deleted file mode 100644 index 838db031743dc5..00000000000000 --- a/torch/legacy/optim/lbfgs.py +++ /dev/null @@ -1,254 +0,0 @@ -import torch - - -def lbfgs(opfunc, x, config, state=None): - """ - An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt) - This implementation of L-BFGS relies on a user-provided line - search function (state.lineSearch). If this function is not - provided, then a simple learningRate is used to produce fixed - size steps. Fixed size steps are much less costly than line - searches, and can be useful for stochastic problems. - The learning rate is used even when a line search is provided. - This is also useful for large-scale stochastic problems, where - opfunc is a noisy approximation of f(x). In that case, the learning - rate allows a reduction of confidence in the step size. 
- - Args: - - `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX - - `x` : the initial point - - `state` : a table describing the state of the optimizer; after each - call the state is modified - - `state.maxIter` : Maximum number of iterations allowed - - `state.maxEval` : Maximum number of function evaluations - - `state.tolFun` : Termination tolerance on the first-order optimality - - `state.tolX` : Termination tol on progress in terms of func/param changes - - `state.lineSearch` : A line search function - - `state.learningRate` : If no line search provided, then a fixed step size is used - - Returns: - - `x*` : the new `x` vector, at the optimal point - - `f` : a table of all function values: - `f[1]` is the value of the function before any optimization and - `f[#f]` is the final fully optimized value, at `x*` - - (Clement Farabet, 2012) - """ - - # (0) get/update state - if config is None and state is None: - raise ValueError("lbfgs requires a dictionary to retain state between iterations") - state = state if state is not None else config - maxIter = config.get('maxIter', 20) - maxEval = config.get('maxEval', maxIter * 1.25) - tolFun = config.get('tolFun', 1e-5) - tolX = config.get('tolX', 1e-9) - nCorrection = config.get('nCorrection', 100) - lineSearch = config.get('lineSearch') - lineSearchOptions = config.get('lineSearchOptions') - learningRate = config.get('learningRate', 1) - isverbose = config.get('verbose', False) - - state.setdefault('funcEval', 0) - state.setdefault('nIter', 0) - - # verbose function - if isverbose: - def verbose(*args): - args = ('',) + args - print(args) - else: - def verbose(*args): - pass - - # evaluate initial f(x) and df/dx - f, g = opfunc(x) - f_hist = [f] - currentFuncEval = 1 - state['funcEval'] += 1 - p = g.size(0) - - # check optimality of initial point - if 'tmp1' not in state: - state['tmp1'] = g.new(g.size()).zero_() - tmp1 = state['tmp1'] - tmp1.copy_(g).abs_() - if tmp1.sum() <= tolFun: - verbose('optimality condition below tolFun') - return x, f_hist - - if 'dir_bufs' not in state: - # reusable buffers for y's and s's, and their histories - verbose('creating recyclable direction/step/history buffers') - state['dir_bufs'] = list(g.new(nCorrection + 1, p).split(1)) - state['stp_bufs'] = list(g.new(nCorrection + 1, p).split(1)) - for i in range(len(state['dir_bufs'])): - state['dir_bufs'][i] = state['dir_bufs'][i].squeeze(0) - state['stp_bufs'][i] = state['stp_bufs'][i].squeeze(0) - - # variables cached in state (for tracing) - d = state.get('d') - t = state.get('t') - old_dirs = state.get('old_dirs') - old_stps = state.get('old_stps') - Hdiag = state.get('Hdiag') - g_old = state.get('g_old') - f_old = state.get('f_old') - - # optimize for a max of maxIter iterations - nIter = 0 - while nIter < maxIter: - # keep track of nb of iterations - nIter += 1 - state['nIter'] += 1 - - ############################################################ - # compute gradient descent direction - ############################################################ - if state['nIter'] == 1: - d = g.neg() - old_dirs = [] - old_stps = [] - Hdiag = 1 - else: - # do lbfgs update (update memory) - y = state['dir_bufs'].pop() - s = state['stp_bufs'].pop() - torch.add(g, g_old, alpha=-1, out=y) - torch.mul(d, t, out=s) - ys = y.dot(s) # y*s - if ys > 1e-10: - # updating memory - if len(old_dirs) == nCorrection: - # shift history by one (limited-memory) - state['dir_bufs'].append(old_dirs.pop(0)) - 
state['stp_bufs'].append(old_stps.pop(0)) - - # store new direction/step - old_dirs.append(s) - old_stps.append(y) - - # update scale of initial Hessian approximation - Hdiag = ys / y.dot(y) # (y*y) - else: - # put y and s back into the buffer pool - state['dir_bufs'].append(y) - state['stp_bufs'].append(s) - - # compute the approximate (L-BFGS) inverse Hessian - # multiplied by the gradient - k = len(old_dirs) - - # need to be accessed element-by-element, so don't re-type tensor: - if 'ro' not in state: - state['ro'] = torch.Tensor(nCorrection) - ro = state['ro'] - - for i in range(k): - ro[i] = 1 / old_stps[i].dot(old_dirs[i]) - - # iteration in L-BFGS loop collapsed to use just one buffer - q = tmp1 # reuse tmp1 for the q buffer - # need to be accessed element-by-element, so don't re-type tensor: - if 'al' not in state: - state['al'] = torch.zeros(nCorrection) - al = state['al'] - - torch.mul(g, -1, out=q) - for i in range(k - 1, -1, -1): - al[i] = old_dirs[i].dot(q) * ro[i] - q.add_(-al[i], old_stps[i]) - - # multiply by initial Hessian - r = d # share the same buffer, since we don't need the old d - torch.mul(q, Hdiag, out=r) - for i in range(k): - be_i = old_stps[i].dot(r) * ro[i] - r.add_(al[i] - be_i, old_dirs[i]) - # final direction is in r/d (same object) - if g_old is None: - g_old = g.clone() - else: - g_old.copy_(g) - f_old = f - - ############################################################ - # compute step length - ############################################################ - # directional derivative - gtd = g.dot(d) # g * d - - # reset initial guess for step size - if state['nIter'] == 1: - tmp1.copy_(g).abs_() - t = min(1, 1 / tmp1.sum()) * learningRate - else: - t = learningRate - - # optional line search: user function - lsFuncEval = 0 - if lineSearch is not None: - # perform line search, using user function - f, g, x, t, lsFuncEval = lineSearch(opfunc, x, t, d, f, g, gtd, lineSearchOpts) - f_hist.append(f) - else: - # no line search, simply move with fixed-step - x.add_(t, d) - if nIter != maxIter: - # re-evaluate function only if not in last iteration - # the reason we do this: in a stochastic setting, - # no use to re-evaluate that function here - f, g = opfunc(x) - lsFuncEval = 1 - f_hist.append(f) - - # update func eval - currentFuncEval += lsFuncEval - state['funcEval'] += lsFuncEval - - ############################################################ - # check conditions - ############################################################ - if nIter == maxIter: - # no use to run tests - verbose('reached max number of iterations') - break - - if currentFuncEval >= maxEval: - # max nb of function evals - verbose('max nb of function evals') - break - - tmp1.copy_(g).abs_() - if tmp1.sum() <= tolFun: - # check optimality - verbose('optimality condition below tolFun') - break - - # check that progress can be made along that direction - if gtd > -tolX: - break - - tmp1.copy_(d).mul_(t).abs_() - if tmp1.sum() <= tolX: - # step size below tolX - verbose('step size below tolX') - break - - if abs(f - f_old) < tolX: - # function value changing less than tolX - verbose('function value changing less than tolX') - break - - # save state - state['old_dirs'] = old_dirs - state['old_stps'] = old_stps - state['Hdiag'] = Hdiag - state['g_old'] = g_old - state['f_old'] = f_old - state['t'] = t - state['d'] = d - - # return optimal x, and history of f(x) - return x, f_hist, currentFuncEval diff --git a/torch/legacy/optim/nag.py b/torch/legacy/optim/nag.py deleted file mode 100644 index 
e6f568cadc85bc..00000000000000 --- a/torch/legacy/optim/nag.py +++ /dev/null @@ -1,82 +0,0 @@ - -def nag(opfunc, x, config, state=None): - """ - An implementation of SGD adapted with features of Nesterov's - Accelerated Gradient method, based on the paper - On the Importance of Initialization and Momentum in Deep Learning - Sutsveker et. al., ICML 2013 - - ARGS: - opfunc : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX - x : the initial point - state : a table describing the state of the optimizer; after each - call the state is modified - state['learningRate'] : learning rate - state['learningRateDecay'] : learning rate decay - state['weightDecay'] : weight decay - state['momentum'] : momentum - state['learningRates'] : vector of individual learning rates - - RETURN: - x : the new x vector - f(x) : the function, evaluated before the update - - (Dilip Krishnan, 2013) - """ - - # (0) get/update state - if config is None and state is None: - raise ValueError("nag requires a dictionary to retain state between iterations") - state = state if state is not None else config - lr = config.get('learningRate', 1e-3) - lrd = config.get('learningRateDecay', 0) - wd = config.get('weightDecay', 0) - mom = config.get('momentum', 0.9) - damp = config.get('dampening', mom) - lrs = config.get('learningRates', None) - state['evalCounter'] = state.get('evalCounter', 0) - - if mom <= 0: - raise ValueError('Momentum must be positive for Nesterov Accelerated Gradient') - - # (1) evaluate f(x) and df/dx - # first step in the direction of the momentum vector - - if 'dfdx' in state: - x.add_(mom, state['dfdx']) - - #: compute gradient at that point - # comment out the above line to get the original SGD - fx, dfdx = opfunc(x) - - # (2) weight decay - if wd != 0: - dfdx.add_(wd, x) - - # (3) learning rate decay (annealing) - clr = lr / (1 + state['evalCounter'] * lrd) - - # (4) apply momentum - if 'dfdx' not in state: - state['dfdx'] = dfdx.new().resize_as_(dfdx).zero_() - else: - state['dfdx'].mul_(mom) - - # (5) parameter update with single or individual learning rates - if lrs is not None: - if 'deltaParameters' in state: - state['deltaParameters'] = x.new().resize_as_(dfdx) - - state['deltaParameters'].copy_(lrs).mul_(dfdx) - x.add_(-clr, state['deltaParameters']) - state['dfdx'].add_(-clr, state['deltaParameters']) - else: - x.add_(-clr, dfdx) - state['dfdx'].add_(-clr, dfdx) - - # (6) update evaluation counter - state['evalCounter'] += 1 - - # return x, f(x) before optimization - return x, fx diff --git a/torch/legacy/optim/rmsprop.py b/torch/legacy/optim/rmsprop.py deleted file mode 100644 index 4c8db68522b515..00000000000000 --- a/torch/legacy/optim/rmsprop.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch - - -def rmsprop(opfunc, x, config, state=None): - """ An implementation of RMSprop - - ARGS: - - - 'opfunc' : a function that takes a single input (X), the point - of a evaluation, and returns f(X) and df/dX - - 'x' : the initial point - - 'config` : a table with configuration parameters for the optimizer - - 'config['learningRate']' : learning rate - - 'config['alpha']' : smoothing constant - - 'config['epsilon']' : value with which to initialise m - - 'config['weightDecay']' : weight decay - - 'state' : a table describing the state of the optimizer; - after each call the state is modified - - 'state['m']' : leaky sum of squares of parameter gradients, - - 'state['tmp']' : and the square root (with epsilon smoothing) - - RETURN: - - `x` : the new x vector - - 
`f(x)` : the function, evaluated before the update - - """ - # (0) get/update state - if config is None and state is None: - raise ValueError("rmsprop requires a dictionary to retain state between iterations") - state = state if state is not None else config - lr = config.get('learningRate', 1e-2) - alpha = config.get('alpha', 0.99) - epsilon = config.get('epsilon', 1e-8) - wd = config.get('weightDecay', 0) - - # (1) evaluate f(x) and df/dx - fx, dfdx = opfunc(x) - - # (2) weight decay - if wd != 0: - dfdx.add_(wd, x) - - # (3) initialize mean square values and square gradient storage - if 'm' not in state: - state['m'] = x.new().resize_as_(dfdx).zero_() - state['tmp'] = x.new().resize_as_(dfdx) - - # (4) calculate new (leaky) mean squared values - state['m'].mul_(alpha) - state['m'].addcmul_(1.0 - alpha, dfdx, dfdx) - - # (5) perform update - torch.sqrt(state['m'], out=state['tmp']).add_(epsilon) - x.addcdiv_(-lr, dfdx, state['tmp']) - - # return x*, f(x) before optimization - return x, fx diff --git a/torch/legacy/optim/rprop.py b/torch/legacy/optim/rprop.py deleted file mode 100644 index 059533643e31c4..00000000000000 --- a/torch/legacy/optim/rprop.py +++ /dev/null @@ -1,99 +0,0 @@ -import torch - - -def rprop(opfunc, x, config, state=None): - """ A plain implementation of RPROP - - ARGS: - - `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX - - `x` : the initial point - - `state` : a table describing the state of the optimizer; after each - call the state is modified - - `state['stepsize']` : initial step size, common to all components - - `state['etaplus']` : multiplicative increase factor, > 1 (default 1.2) - - `state['etaminus']` : multiplicative decrease factor, < 1 (default 0.5) - - `state['stepsizemax']` : maximum stepsize allowed (default 50) - - `state['stepsizemin']` : minimum stepsize allowed (default 1e-6) - - `state['niter']` : number of iterations (default 1) - - RETURN: - - `x` : the new x vector - - `f(x)` : the function, evaluated before the update - - (Martin Riedmiller, Koray Kavukcuoglu 2013) - """ - if config is None and state is None: - raise ValueError("rprop requires a dictionary to retain state between iterations") - - # (0) get/update state - state = state if state is not None else config - stepsize = config.get('stepsize', 0.1) - etaplus = config.get('etaplus', 1.2) - etaminus = config.get('etaminus', 0.5) - stepsizemax = config.get('stepsizemax', 50.0) - stepsizemin = config.get('stepsizemin', 1e-06) - niter = config.get('niter', 1) - - hfx = [] - - for i in range(niter): - # (1) evaluate f(x) and df/dx - fx, dfdx = opfunc(x) - - # init temp storage - if 'delta' not in state: - state['delta'] = dfdx.new(dfdx.size()).zero_() - state['stepsize'] = dfdx.new(dfdx.size()).fill_(stepsize) - state['sign'] = dfdx.new(dfdx.size()) - state['bytesign'] = torch.ByteTensor(dfdx.size()) - state['psign'] = torch.ByteTensor(dfdx.size()) - state['nsign'] = torch.ByteTensor(dfdx.size()) - state['zsign'] = torch.ByteTensor(dfdx.size()) - state['dminmax'] = torch.ByteTensor(dfdx.size()) - if str(type(x)).find('Cuda') > -1: - # Push to GPU - state['psign'] = state['psign'].cuda() - state['nsign'] = state['nsign'].cuda() - state['zsign'] = state['zsign'].cuda() - state['dminmax'] = state['dminmax'].cuda() - - # sign of derivative from last step to this one - torch.mul(dfdx, state['delta'], out=state['sign']).sign_() - - # get indices of >0, <0 and ==0 entries - torch.gt(state['sign'], 0, out=state['psign']) - 
torch.lt(state['sign'], 0, out=state['nsign']) - torch.eq(state['sign'], 0, out=state['zsign']) - - # get step size updates - state['sign'][state['psign']] = etaplus - state['sign'][state['nsign']] = etaminus - state['sign'][state['zsign']] = 1 - - # update stepsizes with step size updates - state['stepsize'].mul_(state['sign']) - - # threshold step sizes - # >50 => 50 - torch.gt(state['stepsize'], stepsizemax, out=state['dminmax']) - state['stepsize'][state['dminmax']] = stepsizemax - # <1e-6 ==> 1e-6 - torch.lt(state['stepsize'], stepsizemin, out=state['dminmax']) - state['stepsize'][state['dminmax']] = stepsizemin - - # for dir<0, dfdx=0 - # for dir>=0 dfdx=dfdx - dfdx[state['nsign']] = 0 - torch.sign(dfdx, out=state['sign']) - - # update weights - x.addcmul_(-1, state['sign'], state['stepsize']) - - # update state['dfdx'] with current dfdx - state['delta'].copy_(dfdx) - - hfx.append(fx) - - # return x*, table of f(x) values from each step - return x, hfx diff --git a/torch/legacy/optim/sgd.py b/torch/legacy/optim/sgd.py deleted file mode 100644 index 300654fe8fd25e..00000000000000 --- a/torch/legacy/optim/sgd.py +++ /dev/null @@ -1,90 +0,0 @@ -import torch - - -def sgd(opfunc, x, config, state=None): - """A plain implementation of SGD - - ARGS: - - - `opfunc` : a function that takes a single input (X), the point - of a evaluation, and returns f(X) and df/dX - - `x` : the initial point - - `config` : a table with configuration parameters for the optimizer - - `config['learningRate']` : learning rate - - `config['learningRateDecay']` : learning rate decay - - `config['weightDecay']` : weight decay - - `config['weightDecays']` : vector of individual weight decays - - `config['momentum']` : momentum - - `config['dampening']` : dampening for momentum - - `config['nesterov']` : enables Nesterov momentum - - `config['learningRates']` : vector of individual learning rates - - `state` : a table describing the state of the optimizer; after each - call the state is modified - - `state['evalCounter']` : evaluation counter (optional: 0, by default) - - RETURN: - - `x` : the new x vector - - `f(x)` : the function, evaluated before the update - - (Clement Farabet, 2012) - """ - # (0) get/update state - state = state if state is not None else config - lr = config.get('learningRate', 1e-3) - lrd = config.get('learningRateDecay', 0) - wd = config.get('weightDecay', 0) - mom = config.get('momentum', 0) - damp = config.get('dampening', mom) - nesterov = config.get('nesterov', False) - lrs = config.get('learningRates', None) - wds = config.get('weightDecays', None) - if 'evalCounter' not in state: - state['evalCounter'] = 0 - if nesterov and (mom <= 0 and damp != 0): - raise ValueError("Nesterov momentum requires a momentum and zero dampening") - if wd != 0 and wds is not None: - raise ValueError("Only one of wd and wds can be specified") - - # (1) evaluate f(x) and df/dx - fx, dfdx = opfunc(x) - - # (2) weight decay with single or individual parameters - if wd != 0: - dfdx.add_(wd, x) - elif wds is not None: - if not state['decayParameters']: - state['decayParameters'] = torch.Tensor().type_as(x).resize_as_(dfdx) - - state['decayParameters'].copy_(wds).mul_(x) - dfdx.add_(state['decayParameters']) - - # (3) apply momentum - if mom != 0: - if 'dfdx' not in state: - state['dfdx'] = torch.Tensor().type_as(dfdx).resize_as_(dfdx).copy_(dfdx) - else: - state['dfdx'].mul_(mom).add_(1 - damp, dfdx) - - if nesterov: - dfdx.add_(mom, state['dfdx']) - else: - dfdx = state['dfdx'] - - # (4) learning rate decay 
(annealing) - clr = lr / (1 + state['evalCounter'] * lrd) - - # (5) parameter update with single or individual learning rates - if lrs is not None: - if 'deltaParameters' not in state: - state['deltaParameters'] = torch.Tensor().type_as(x).resize_as_(dfdx) - - state['deltaParameters'].copy_(lrs).mul_(dfdx) - x.add_(-clr, state['deltaParameters']) - else: - x.add_(-clr, dfdx) - - # (6) update evaluation counter - state['evalCounter'] += 1 - - # return x*, f(x) before optimization - return x, fx diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index 4417364641efd2..e56d996a36ba33 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #ifdef USE_CUDA @@ -186,23 +187,26 @@ void ProcessGroupGloo::WorkGloo::finish(const AlgorithmEntry& entry) { { std::unique_lock lock(m_); completed_ = true; + if (entry.key.type != nullptr) { #ifdef USE_CUDA - cuda_ = entry.key.type->is_cuda(); - // Populate devices and events so that we can later synchronize - // with the operation associated with this work finishing. - if (cuda_) { - at::DeviceGuard deviceGuard; - devices_ = entry.key.devices; - events_.resize(devices_.size()); - for (size_t i = 0; i < devices_.size(); i++) { - deviceGuard.set_index(devices_[i]); - events_[i] = CUDAEvent::create(); - const auto& event = events_[i].getEvent(); - const auto& stream = entry.streams[i].getStream(); - C10D_CUDA_CHECK(cudaEventRecord(event, stream)); + cuda_ = entry.key.type->is_cuda(); + + // Populate devices and events so that we can later synchronize + // with the operation associated with this work finishing. + if (cuda_) { + at::DeviceGuard deviceGuard; + devices_ = entry.key.devices; + events_.resize(devices_.size()); + for (size_t i = 0; i < devices_.size(); i++) { + deviceGuard.set_index(devices_[i]); + events_[i] = CUDAEvent::create(); + const auto& event = events_[i].getEvent(); + const auto& stream = entry.streams[i].getStream(); + C10D_CUDA_CHECK(cudaEventRecord(event, stream)); + } } - } #endif + } } cv_.notify_all(); } @@ -306,10 +310,6 @@ ProcessGroupGloo::ProcessGroupGloo( for (size_t i = 0; i < threads_.size(); i++) { threads_[i] = std::thread(&ProcessGroupGloo::runLoop, this); } - -#ifdef USE_CUDA - thcState_ = ::at::globalContext().lazyInitCUDA(); -#endif } ProcessGroupGloo::~ProcessGroupGloo() { @@ -384,6 +384,10 @@ void ProcessGroupGloo::createAlgorithm(AlgorithmEntry& entry) { case CollectiveType::BROADCAST: GENERATE_ALL_TYPES(key.type->scalarType(), createBroadcast, entry); return; + case CollectiveType::BARRIER: + entry.algorithm = std::unique_ptr<::gloo::Algorithm>( + new ::gloo::BarrierAllToOne(contexts_[0])); + return; case CollectiveType::UNUSED: break; } @@ -497,6 +501,11 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { auto entry = std::unique_ptr(new AlgorithmEntry); entry->key = key; + // Without type there is nothing else to construct + if (key.type == nullptr) { + return entry; + } + // Allocate source tensors for this entry auto& srcSizes = key.srcSizes; entry->src.resize(srcSizes.size()); @@ -592,14 +601,15 @@ std::shared_ptr ProcessGroupGloo::broadcast( // In case of CUDA, ensure that operations that are queued after // this collective wait for the collective to complete. 
if (key.type->is_cuda()) { - synchronizeStreams(thcState_, entry); + auto thcState = ::at::globalContext().lazyInitCUDA(); + synchronizeStreams(thcState, entry); entry->run = [=]() mutable { entry->algorithm->run(); for (size_t i = 0; i < tensors.size(); i++) { // The THCStreamGuard is a RAII wrapper for temporarily // overriding the current THCStream. This also sets the // current device to the stream's device. - THCStreamGuard guard(thcState_, entry->streams[i]); + THCStreamGuard guard(thcState, entry->streams[i]); tensors[i].copy_(entry->src[i]); } }; @@ -642,14 +652,15 @@ std::shared_ptr ProcessGroupGloo::allreduce( // In case of CUDA, ensure that operations that are queued after // this collective wait for the collective to complete. if (key.type->is_cuda()) { - synchronizeStreams(thcState_, entry); + auto thcState = ::at::globalContext().lazyInitCUDA(); + synchronizeStreams(thcState, entry); entry->run = [=]() mutable { entry->algorithm->run(); for (size_t i = 0; i < tensors.size(); i++) { // The THCStreamGuard is a RAII wrapper for temporarily // overriding the current THCStream. This also sets the // current device to the stream's device. - THCStreamGuard guard(thcState_, entry->streams[i]); + THCStreamGuard guard(thcState, entry->streams[i]); tensors[i].copy_(entry->src[i]); } }; @@ -782,7 +793,14 @@ std::shared_ptr ProcessGroupGloo::recvAnysource( } std::shared_ptr ProcessGroupGloo::barrier() { - throw std::runtime_error("ProcessGroupGloo does not support barrier"); + AlgorithmKey key; + key.collectiveType = CollectiveType::BARRIER; + + auto entry = checkout(key); + entry->run = [=]() mutable { + entry->algorithm->run(); + }; + return enqueue(entry); } std::unordered_map ProcessGroupGloo::getGroupRank() { diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index 773ad600d4402f..7b4bfbccbecf38 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -24,11 +24,6 @@ #include #include -#ifdef USE_CUDA -// Forward declaration -struct THCState; -#endif - namespace c10d { // AlgorithmKey is a const identifier for a Gloo algorithm. @@ -389,11 +384,6 @@ class ProcessGroupGloo : public ProcessGroup { std::mutex queueMutex_; std::condition_variable queueProduceCV_; std::condition_variable queueConsumeCV_; - -#ifdef USE_CUDA - // Store copy of pointer to THCState retrieved from ::at::globalContext(). 
- THCState* thcState_; -#endif }; } // namespace c10d diff --git a/torch/lib/c10d/Types.hpp b/torch/lib/c10d/Types.hpp index 8d5686f19a8d7a..334551992ff3a1 100644 --- a/torch/lib/c10d/Types.hpp +++ b/torch/lib/c10d/Types.hpp @@ -7,6 +7,7 @@ namespace c10d { enum class CollectiveType : std::uint8_t { BROADCAST, ALLREDUCE, + BARRIER, UNUSED, }; diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index a9b272cab8f5c5..41484b1007a8f1 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -252,6 +252,24 @@ void testBroadcast(const std::string& path, const at::Backend b) { } } +void testBarrier(const std::string& path) { + const auto size = 2; + auto tests = CollectiveTest::initialize(path, size); + + // Kick off work + std::vector> work(size); + for (auto i = 0; i < size; i++) { + work[i] = tests[i].getProcessGroup().barrier(); + } + + // Wait for work to complete + for (auto i = 0; i < size; i++) { + if (!work[i]->wait()) { + throw work[i]->exception(); + } + } +} + int main(int argc, char** argv) { { TemporaryFile file; @@ -291,6 +309,11 @@ int main(int argc, char** argv) { } #endif + { + TemporaryFile file; + testBarrier(file.path); + } + std::cout << "Test successful" << std::endl; return 0; } diff --git a/torch/nn/init.py b/torch/nn/init.py index 29de7b54a14906..530a0f1c11f7c1 100644 --- a/torch/nn/init.py +++ b/torch/nn/init.py @@ -417,6 +417,7 @@ def deprecated_init(*args, **kwargs): See :func:`~torch.nn.init.{new_name}` for details.""".format( old_name=old_name, new_name=new_name) + deprecated_init.__name__ = old_name return deprecated_init diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 3f334304705ded..b2500dc7306141 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -346,7 +346,7 @@ def embedding_bag(g, indices, offsets, operator_s="embedding_bag", - outputs=3, + outputs=4, scale_grad_by_freq_i=scale_grad_by_freq, mode_i=mode, sparse_i=sparse) diff --git a/torch/tensor.py b/torch/tensor.py index f72db6d138e584..ec2569b28ff753 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -241,6 +241,10 @@ def argsort(self, dim=None, descending=False): r"""See :func: `torch.argsort`""" return torch.argsort(self, dim, descending) + def norm(self, p="fro", dim=None, keepdim=False): + r"""See :func: `torch.norm`""" + return torch.norm(self, p, dim, keepdim) + def btrifact(self, info=None, pivot=True): r"""See :func:`torch.btrifact` """ diff --git a/torch/utils/_cpp_extension_versioner.py b/torch/utils/_cpp_extension_versioner.py new file mode 100644 index 00000000000000..cb778ab8923d70 --- /dev/null +++ b/torch/utils/_cpp_extension_versioner.py @@ -0,0 +1,54 @@ +import collections + + +Entry = collections.namedtuple('Entry', 'version, hash') + + +def update_hash(seed, value): + # Good old boost::hash_combine + # https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html + return seed ^ (hash(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2)) + + +def hash_source_files(hash_value, source_files): + for filename in source_files: + with open(filename) as file: + hash_value = update_hash(hash_value, file.read()) + return hash_value + + +def hash_build_arguments(hash_value, build_arguments): + for group in build_arguments: + if group: + for argument in group: + hash_value = update_hash(hash_value, argument) + return hash_value + + +class ExtensionVersioner(object): + def __init__(self): + self.entries = {} + + def get_version(self, name): + 
entry = self.entries.get(name) + return None if entry is None else entry.version + + def bump_version_if_changed(self, + name, + source_files, + build_arguments, + build_directory, + with_cuda): + hash_value = 0 + hash_value = hash_source_files(hash_value, source_files) + hash_value = hash_build_arguments(hash_value, build_arguments) + hash_value = update_hash(hash_value, build_directory) + hash_value = update_hash(hash_value, with_cuda) + + entry = self.entries.get(name) + if entry is None: + self.entries[name] = entry = Entry(0, hash_value) + elif hash_value != entry.hash: + self.entries[name] = entry = Entry(entry.version + 1, hash_value) + + return entry.version diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 01961cf994ec73..50d3f74b2a2c69 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -10,19 +10,25 @@ import tempfile import warnings +from future.utils import raise_from + import torch from .file_baton import FileBaton +from ._cpp_extension_versioner import ExtensionVersioner from setuptools.command.build_ext import build_ext +IS_WINDOWS = sys.platform == 'win32' + + def _find_cuda_home(): '''Finds the CUDA install path.''' # Guess #1 cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') if cuda_home is None: # Guess #2 - if sys.platform == 'win32': + if IS_WINDOWS: cuda_homes = glob.glob( 'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*') if len(cuda_homes) == 0: @@ -34,7 +40,7 @@ def _find_cuda_home(): if not os.path.exists(cuda_home): # Guess #3 try: - which = 'where' if sys.platform == 'win32' else 'which' + which = 'where' if IS_WINDOWS else 'which' nvcc = subprocess.check_output( [which, 'nvcc']).decode().rstrip('\r\n') cuda_home = os.path.dirname(os.path.dirname(nvcc)) @@ -76,10 +82,26 @@ def _find_cuda_home(): ] -def is_binary_build(): +JIT_EXTENSION_VERSIONER = ExtensionVersioner() + + +def _is_binary_build(): return not BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__) +def get_default_build_root(): + ''' + Returns the path to the root folder under which extensions will built. + + For each extension module built, there will be one folder underneath the + folder returned by this function. For example, if ``p`` is the path + returned by this function and ``ext`` the name of an extension, the build + folder for the extension will be ``p/ext``. + ''' + # tempfile.gettempdir() will be /tmp on UNIX and \TEMP on Windows. + return os.path.realpath(os.path.join(tempfile.gettempdir(), 'torch_extensions')) + + def check_compiler_abi_compatibility(compiler): ''' Verifies that the given compiler is ABI-compatible with PyTorch. @@ -92,10 +114,10 @@ def check_compiler_abi_compatibility(compiler): False if the compiler is (likely) ABI-incompatible with PyTorch, else True. ''' - if not is_binary_build(): + if not _is_binary_build(): return True try: - check_cmd = '{}' if sys.platform == 'win32' else '{} --version' + check_cmd = '{}' if IS_WINDOWS else '{} --version' info = subprocess.check_output( check_cmd.format(compiler).split(), stderr=subprocess.STDOUT) except Exception: @@ -262,7 +284,7 @@ def _check_abi(self): # On some platforms, like Windows, compiler_cxx is not available. 
if hasattr(self.compiler, 'compiler_cxx'): compiler = self.compiler.compiler_cxx[0] - elif sys.platform == 'win32': + elif IS_WINDOWS: compiler = os.environ.get('CXX', 'cl') else: compiler = os.environ.get('CXX', 'c++') @@ -290,7 +312,7 @@ def _add_gnu_abi_flag_if_binary(self, extension): # so that the std::string in the API is resolved to # non-C++11 symbols. define = '-D_GLIBCXX_USE_CXX11_ABI=0' - if is_binary_build(): + if _is_binary_build(): if isinstance(extension.extra_compile_args, dict): for args in extension.extra_compile_args.values(): args.append(define) @@ -327,7 +349,7 @@ def CppExtension(name, sources, *args, **kwargs): include_dirs += include_paths() kwargs['include_dirs'] = include_dirs - if sys.platform == 'win32': + if IS_WINDOWS: library_dirs = kwargs.get('library_dirs', []) library_dirs += library_paths() kwargs['library_dirs'] = library_dirs @@ -376,7 +398,7 @@ def CUDAExtension(name, sources, *args, **kwargs): libraries = kwargs.get('libraries', []) libraries.append('cudart') - if sys.platform == 'win32': + if IS_WINDOWS: libraries.append('caffe2') libraries.append('torch') libraries.append('caffe2_gpu') @@ -431,7 +453,7 @@ def library_paths(cuda=False): ''' paths = [] - if sys.platform == 'win32': + if IS_WINDOWS: here = os.path.abspath(__file__) torch_path = os.path.dirname(os.path.dirname(here)) lib_path = os.path.join(torch_path, 'lib') @@ -439,7 +461,7 @@ def library_paths(cuda=False): paths.append(lib_path) if cuda: - lib_dir = 'lib/x64' if sys.platform == 'win32' else 'lib64' + lib_dir = 'lib/x64' if IS_WINDOWS else 'lib64' paths.append(_join_cuda_home(lib_dir)) if CUDNN_HOME is not None: paths.append(os.path.join(CUDNN_HOME, lib_dir)) @@ -528,7 +550,7 @@ def load(name, extra_include_paths, build_directory or _get_build_directory(name, verbose), verbose, - with_cuda=with_cuda) + with_cuda) def load_inline(name, @@ -655,7 +677,7 @@ def load_inline(name, extra_include_paths, build_directory, verbose, - with_cuda=with_cuda) + with_cuda) def _jit_compile(name, @@ -667,46 +689,85 @@ def _jit_compile(name, build_directory, verbose, with_cuda=None): - baton = FileBaton(os.path.join(build_directory, 'lock')) - if baton.try_acquire(): - try: - verify_ninja_availability() - check_compiler_abi_compatibility(os.environ.get('CXX', 'c++')) - if with_cuda is None: - with_cuda = any(map(_is_cuda_file, sources)) - extra_ldflags = _prepare_ldflags( - extra_ldflags or [], - with_cuda, - verbose) - build_file_path = os.path.join(build_directory, 'build.ninja') - if verbose: - print( - 'Emitting ninja build file {}...'.format(build_file_path)) - # NOTE: Emitting a new ninja build file does not cause re-compilation if - # the sources did not change, so it's ok to re-emit (and it's fast). 
- _write_ninja_file( - path=build_file_path, - name=name, - sources=sources, - extra_cflags=extra_cflags or [], - extra_cuda_cflags=extra_cuda_cflags or [], - extra_ldflags=extra_ldflags or [], - extra_include_paths=extra_include_paths or [], - with_cuda=with_cuda) - - if verbose: - print('Building extension module {}...'.format(name)) - _build_extension_module(name, build_directory) - finally: - baton.release() - else: - baton.wait() + old_version = JIT_EXTENSION_VERSIONER.get_version(name) + version = JIT_EXTENSION_VERSIONER.bump_version_if_changed( + name, + sources, + build_arguments=[extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths], + build_directory=build_directory, + with_cuda=with_cuda + ) + if version > 0: + if version != old_version and verbose: + print('The input conditions for extension module {} have changed. '.format(name) + + 'Bumping to version {0} and re-building as {1}_v{0}...'.format(version, name)) + name = '{}_v{}'.format(name, version) + + if version != old_version: + baton = FileBaton(os.path.join(build_directory, 'lock')) + if baton.try_acquire(): + try: + _write_ninja_file_and_build( + name=name, + sources=sources, + extra_cflags=extra_cflags or [], + extra_cuda_cflags=extra_cuda_cflags or [], + extra_ldflags=extra_ldflags or [], + extra_include_paths=extra_include_paths or [], + build_directory=build_directory, + verbose=verbose, + with_cuda=with_cuda) + finally: + baton.release() + else: + baton.wait() + elif verbose: + print('No modifications detected for re-loaded extension ' + 'module {}, skipping build step...'.format(name)) if verbose: print('Loading extension module {}...'.format(name)) return _import_module_from_library(name, build_directory) +def _write_ninja_file_and_build(name, + sources, + extra_cflags, + extra_cuda_cflags, + extra_ldflags, + extra_include_paths, + build_directory, + verbose, + with_cuda): + verify_ninja_availability() + check_compiler_abi_compatibility(os.environ.get('CXX', 'c++')) + if with_cuda is None: + with_cuda = any(map(_is_cuda_file, sources)) + extra_ldflags = _prepare_ldflags( + extra_ldflags or [], + with_cuda, + verbose) + build_file_path = os.path.join(build_directory, 'build.ninja') + if verbose: + print( + 'Emitting ninja build file {}...'.format(build_file_path)) + # NOTE: Emitting a new ninja build file does not cause re-compilation if + # the sources did not change, so it's ok to re-emit (and it's fast). 
+ _write_ninja_file( + path=build_file_path, + name=name, + sources=sources, + extra_cflags=extra_cflags or [], + extra_cuda_cflags=extra_cuda_cflags or [], + extra_ldflags=extra_ldflags or [], + extra_include_paths=extra_include_paths or [], + with_cuda=with_cuda) + + if verbose: + print('Building extension module {}...'.format(name)) + _build_extension_module(name, build_directory, verbose) + + def verify_ninja_availability(): ''' Returns ``True`` if the `ninja `_ build system is @@ -720,7 +781,7 @@ def verify_ninja_availability(): def _prepare_ldflags(extra_ldflags, with_cuda, verbose): - if sys.platform == 'win32': + if IS_WINDOWS: python_path = os.path.dirname(sys.executable) python_lib_path = os.path.join(python_path, 'libs') @@ -739,7 +800,7 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose): if with_cuda: if verbose: print('Detected CUDA files, patching ldflags') - if sys.platform == 'win32': + if IS_WINDOWS: extra_ldflags.append('/LIBPATH:{}'.format( _join_cuda_home('lib/x64'))) extra_ldflags.append('cudart.lib') @@ -757,9 +818,7 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose): def _get_build_directory(name, verbose): root_extensions_directory = os.environ.get('TORCH_EXTENSIONS_DIR') if root_extensions_directory is None: - # tempfile.gettempdir() will be /tmp on UNIX and \TEMP on Windows. - root_extensions_directory = os.path.join(tempfile.gettempdir(), - 'torch_extensions') + root_extensions_directory = get_default_build_root() if verbose: print('Using {} as PyTorch extensions root...'.format( @@ -775,16 +834,28 @@ def _get_build_directory(name, verbose): return build_directory -def _build_extension_module(name, build_directory): +def _build_extension_module(name, build_directory, verbose): try: - subprocess.check_output( - ['ninja', '-v'], stderr=subprocess.STDOUT, cwd=build_directory) + sys.stdout.flush() + sys.stderr.flush() + if sys.version_info >= (3, 5): + subprocess.run( + ['ninja', '-v'], + stdout=None if verbose else subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=build_directory, + check=True) + else: + subprocess.check_output( + ['ninja', '-v'], + stderr=subprocess.STDOUT, + cwd=build_directory) except subprocess.CalledProcessError: # Python 2 and 3 compatible way of getting the error object. _, error, _ = sys.exc_info() # error.output contains the stdout and stderr of the build attempt. - raise RuntimeError("Error building extension '{}': {}".format( - name, error.output.decode())) + message = "Error building extension '{}': {}".format(name, error.output.decode()) + raise_from(RuntimeError(message), None) def _import_module_from_library(module_name, path): @@ -802,7 +873,7 @@ def _write_ninja_file(path, extra_cuda_cflags, extra_ldflags, extra_include_paths, - with_cuda=False): + with_cuda): extra_cflags = [flag.strip() for flag in extra_cflags] extra_cuda_cflags = [flag.strip() for flag in extra_cuda_cflags] extra_ldflags = [flag.strip() for flag in extra_ldflags] @@ -825,7 +896,7 @@ def _write_ninja_file(path, system_includes.append(sysconfig.get_paths()['include']) # Windoze does not understand `-isystem`. 
- if sys.platform == 'win32': + if IS_WINDOWS: user_includes += system_includes system_includes.clear() @@ -833,18 +904,18 @@ def _write_ninja_file(path, common_cflags += ['-I{}'.format(include) for include in user_includes] common_cflags += ['-isystem {}'.format(include) for include in system_includes] - if is_binary_build(): + if _is_binary_build(): common_cflags += ['-D_GLIBCXX_USE_CXX11_ABI=0'] cflags = common_cflags + ['-fPIC', '-std=c++11'] + extra_cflags - if sys.platform == 'win32': + if IS_WINDOWS: from distutils.spawn import _nt_quote_args cflags = _nt_quote_args(cflags) flags = ['cflags = {}'.format(' '.join(cflags))] if with_cuda: cuda_flags = common_cflags + COMMON_NVCC_FLAGS - if sys.platform == 'win32': + if IS_WINDOWS: cuda_flags = _nt_quote_args(cuda_flags) else: cuda_flags += ['--compiler-options', "'-fPIC'"] @@ -854,20 +925,20 @@ def _write_ninja_file(path, flags.append('cuda_flags = {}'.format(' '.join(cuda_flags))) - if sys.platform == 'win32': + if IS_WINDOWS: ldflags = ['/DLL'] + extra_ldflags else: ldflags = ['-shared'] + extra_ldflags # The darwin linker needs explicit consent to ignore unresolved symbols. if sys.platform == 'darwin': ldflags.append('-undefined dynamic_lookup') - elif sys.platform == 'win32': + elif IS_WINDOWS: ldflags = _nt_quote_args(ldflags) flags.append('ldflags = {}'.format(' '.join(ldflags))) # See https://ninja-build.org/build.ninja.html for reference. compile_rule = ['rule compile'] - if sys.platform == 'win32': + if IS_WINDOWS: compile_rule.append( ' command = cl /showIncludes $cflags -c $in /Fo$out') compile_rule.append(' deps = msvc') @@ -883,7 +954,7 @@ def _write_ninja_file(path, ' command = $nvcc $cuda_flags -c $in -o $out') link_rule = ['rule link'] - if sys.platform == 'win32': + if IS_WINDOWS: cl_paths = subprocess.check_output(['where', 'cl']).decode().split('\r\n') if len(cl_paths) >= 1: @@ -911,13 +982,14 @@ def _write_ninja_file(path, rule = 'compile' target = '{}.o'.format(file_name) object_files.append(target) - if sys.platform == 'win32': + if IS_WINDOWS: source_file = source_file.replace(':', '$:') source_file = source_file.replace(" ", "$ ") build.append('build {}: {} {}'.format(target, rule, source_file)) - ext = '.pyd' if sys.platform == 'win32' else '.so' - library_target = '{}{}'.format(name, ext) + ext = 'pyd' if IS_WINDOWS else 'so' + library_target = '{}.{}'.format(name, ext) + link = ['build {}: link {}'.format(library_target, ' '.join(object_files))] default = ['default {}'.format(library_target)] diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 4618e731406e68..bdf27ad897c46d 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -420,29 +420,29 @@ class DataLoader(object): Arguments: dataset (Dataset): dataset from which to load the data. batch_size (int, optional): how many samples per batch to load - (default: 1). + (default: ``1``). shuffle (bool, optional): set to ``True`` to have the data reshuffled - at every epoch (default: False). + at every epoch (default: ``False``). sampler (Sampler, optional): defines the strategy to draw samples from the dataset. If specified, ``shuffle`` must be False. batch_sampler (Sampler, optional): like sampler, but returns a batch of - indices at a time. Mutually exclusive with batch_size, shuffle, - sampler, and drop_last. + indices at a time. Mutually exclusive with :attr:`batch_size`, + :attr:`shuffle`, :attr:`sampler`, and :attr:`drop_last`. 
num_workers (int, optional): how many subprocesses to use for data loading. 0 means that the data will be loaded in the main process. - (default: 0) + (default: ``0``) collate_fn (callable, optional): merges a list of samples to form a mini-batch. pin_memory (bool, optional): If ``True``, the data loader will copy tensors into CUDA pinned memory before returning them. drop_last (bool, optional): set to ``True`` to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If ``False`` and the size of dataset is not divisible by the batch size, then the last batch - will be smaller. (default: False) + will be smaller. (default: ``False``) timeout (numeric, optional): if positive, the timeout value for collecting a batch - from workers. Should always be non-negative. (default: 0) - worker_init_fn (callable, optional): If not None, this will be called on each + from workers. Should always be non-negative. (default: ``0``) + worker_init_fn (callable, optional): If not ``None``, this will be called on each worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as - input, after seeding and before data loading. (default: None) + input, after seeding and before data loading. (default: ``None``) .. note:: By default, each worker will have its PyTorch seed set to ``base_seed + worker_id``, where ``base_seed`` is a long generated @@ -450,9 +450,9 @@ class DataLoader(object): may be duplicated upon initializing workers (w.g., NumPy), causing each worker to return identical random numbers. (See :ref:`dataloader-workers-random-seed` section in FAQ.) You may - use ``torch.initial_seed()`` to access the PyTorch seed for each - worker in :attr:`worker_init_fn`, and use it to set other seeds - before data loading. + use :func:`torch.initial_seed()` to access the PyTorch seed for + each worker in :attr:`worker_init_fn`, and use it to set other + seeds before data loading. .. warning:: If ``spawn`` start method is used, :attr:`worker_init_fn` cannot be an unpicklable object, e.g., a lambda function. diff --git a/torch/utils/serialization/__init__.py b/torch/utils/serialization/__init__.py deleted file mode 100644 index f0a83c0274eeed..00000000000000 --- a/torch/utils/serialization/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ - -from .read_lua_file import load_lua, T7Reader diff --git a/torch/utils/serialization/read_lua_file.py b/torch/utils/serialization/read_lua_file.py deleted file mode 100644 index e01cbd6d4cba3f..00000000000000 --- a/torch/utils/serialization/read_lua_file.py +++ /dev/null @@ -1,608 +0,0 @@ -""" -Based on python-torchfile package. -https://github.com/bshillingford/python-torchfile - -Copyright (c) 2016, Brendan Shillingford -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors - may be used to endorse or promote products derived from this software without - specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - -TYPE_NIL = 0 -TYPE_NUMBER = 1 -TYPE_STRING = 2 -TYPE_TABLE = 3 -TYPE_TORCH = 4 -TYPE_BOOLEAN = 5 -TYPE_FUNCTION = 6 -TYPE_RECUR_FUNCTION = 8 -LEGACY_TYPE_RECUR_FUNCTION = 7 - - -import sys -import struct -from array import array -from collections import namedtuple -from functools import wraps - -import torch -import torch.legacy.nn as nn -import torch.cuda -from torch._thnn import type2backend -from torch._utils import _import_dotted_name - -HAS_CUDA = torch.cuda.is_available() - -LuaFunction = namedtuple('LuaFunction', ['size', 'dumped', 'upvalues']) - - -class hashable_uniq_dict(dict): - """ - Subclass of dict with equality and hashing semantics changed: - equality and hashing is purely by reference/instance, to match - the behaviour of lua tables. - - Supports lua-style dot indexing. - - This way, dicts can be keys of other dicts. - """ - - def __hash__(self): - return id(self) - - def __getattr__(self, key): - return self.get(key) - - def __eq__(self, other): - return id(self) == id(other) - # TODO: dict's __lt__ etc. still exist - - -class TorchObject(object): - """ - Simple torch object, used by `add_trivial_class_reader`. - Supports both forms of lua-style indexing, i.e. getattr and getitem. - Use the `torch_typename` method to get the object's torch class name. - - Equality is by reference, as usual for lua (and the default for Python - objects). - """ - - def __init__(self, typename, obj): - self._typename = typename - self._obj = obj - - def __getattr__(self, k): - return self._obj.get(k) - - def __getitem__(self, k): - return self._obj.get(k) - - def torch_typename(self): - return self._typename - - def __repr__(self): - return "TorchObject(%s, %s)" % (self._typename, repr(self._obj)) - - def __str__(self): - return repr(self) - - def __dir__(self): - keys = list(self._obj.keys()) - keys.append('torch_typename') - return keys - - -reader_registry = {} - - -def get_python_class(typename): - module, _, cls_name = typename.rpartition('.') - if cls_name.startswith('Cuda'): - module = module + '.cuda' - cls_name = cls_name[4:] - if cls_name == 'Storage' or cls_name == 'Tensor': - cls_name = 'Float' + cls_name - return _import_dotted_name(module + '.' 
+ cls_name) - - -def make_tensor_reader(typename): - python_class = get_python_class(typename) - - def read_tensor(reader, version): - # source: - # https://github.com/torch/torch7/blob/master/generic/Tensor.c#L1243 - ndim = reader.read_int() - - # read size: - size = torch.LongStorage(reader.read_long_array(ndim)) - # read stride: - stride = torch.LongStorage(reader.read_long_array(ndim)) - # storage offset: - storage_offset = reader.read_long() - 1 - # read storage: - storage = reader.read() - - if storage is None or ndim == 0 or len(size) == 0 or len(stride) == 0: - # empty torch tensor - return python_class() - - return python_class().set_(storage, storage_offset, torch.Size(size), tuple(stride)) - return read_tensor - - -def make_storage_reader(typename): - python_class = get_python_class(typename) - # TODO: be smarter about this - element_size = python_class().element_size() - - def read_storage(reader, version): - # source: - # https://github.com/torch/torch7/blob/master/generic/Storage.c#L244 - size = reader.read_long() * element_size - return python_class.from_buffer(reader.f.read(size), 'native') - return read_storage - - -def register_torch_class(obj_kind, reader_factory): - for t in ['Double', 'Float', 'Half', 'Long', 'Int', 'Short', 'Char', 'Byte']: - for prefix in ['', 'Cuda']: - if prefix == 'Cuda' and not HAS_CUDA: - continue - if t == 'Half' and prefix == '': - continue - if prefix == 'Cuda' and t == 'Float': - cls_name = 'torch.Cuda' + obj_kind - else: - cls_name = 'torch.' + prefix + t + obj_kind - reader_registry[cls_name] = reader_factory(cls_name) - - -register_torch_class('Storage', make_storage_reader) -register_torch_class('Tensor', make_tensor_reader) - -################################################################################ -# Reader function for tds.Vector and tds.Hash -################################################################################ - - -def tds_Vec_reader(reader, version): - length = reader.read_long() - return [reader.read() for i in range(length)] - - -def tds_Hash_reader(reader, version): - length = reader.read_long() - obj = {} - for i in range(length): - k = reader.read() - v = reader.read() - obj[k] = v - return obj - - -reader_registry['tds.Vec'] = tds_Vec_reader -reader_registry['tds.Hash'] = tds_Hash_reader - -################################################################################ -# Reader function for nn modules -################################################################################ - - -def _load_backend(obj): - if hasattr(obj, '_type'): - obj._backend = type2backend[obj._type] - return - # Try to find tensor attributes and infer type from them - for key in dir(obj): - attr = getattr(obj, key) - if isinstance(attr, torch.Tensor): - try: - obj._backend = type2backend[attr.type()] - except KeyError: - pass - # Monkey patch the forward to capture the type of input - updateOutput_orig = obj.updateOutput - - def updateOutput_patch(*args): - input = args[0] - while not isinstance(input, torch.Tensor): - input = input[0] - obj._backend = type2backend[input.type()] - obj.updateOutput = updateOutput_orig - return obj.updateOutput(*args) - obj.updateOutput = updateOutput_patch - - -def nn_reader(cls): - def read_nn_class(reader, version): - obj = cls.__new__(cls) - attributes = reader.read() - obj.__dict__.update(attributes) - _load_backend(obj) - return obj - return read_nn_class - - -reader_registry.update({('nn.' 
+ name): nn_reader(module) - for name, module in nn.__dict__.items() - if name[0] != '_' and name[0].upper() == name[0]}) - - -def custom_reader(cls): - def reader_factory(fn): - base = nn_reader(cls) - - def wrapper(reader, version): - obj = base(reader, version) - fn(reader, version, obj) - return obj - reader_registry['nn.' + cls.__name__] = wrapper - return wrapper - return reader_factory - - -def BatchNorm_reader(reader, version, obj): - if version < 2 and hasattr(obj, 'running_std'): - obj.running_var = obj.running_var.pow(-2).add(-obj.eps) - del obj.running_std - -for prefix in ['', 'Spatial', 'Volumetric']: - name = prefix + 'BatchNormalization' - custom_reader(getattr(nn, name))(BatchNorm_reader) - - -@custom_reader(nn.Transpose) -def Transpose_reader(reader, version, obj): - obj.permutations = list( - map(lambda swap: [swap[0] - 1, swap[1] - 1], obj.permutations)) - - -@custom_reader(nn.SpatialDivisiveNormalization) -def SpatialDivisiveNormalization_reader(reader, version, obj): - obj.stdestimator.modules[-2].dim += 1 - obj.meanestimator.modules[-1].dim += 1 - - -@custom_reader(nn.SpatialContrastiveNormalization) -def SpatialContrastiveNormalization_reader(reader, version, obj): - raise RuntimeError("loading of SpatialContrastiveNormalization is disabled for now") - - -@custom_reader(nn.GradientReversal) -def GradientReversal_reader(reader, version, obj): - if version < 2: - setattr(obj, 'lambda', 1) - - -@custom_reader(nn.VolumetricAveragePooling) -def VolumetricAveragePooling_reader(reader, version, obj): - obj.padT, obj.padH, obj.padW = 0, 0, 0 - obj.ceil_mode = False - obj.count_include_pad = True - -################################################################################ -# Functions for patching objects so that they work with legacy modules -################################################################################ - - -def registry_addon(fn): - def wrapper_factory(module_name, *args, **kwargs): - module_name = 'nn.' 
+ module_name - build_fn = reader_registry[module_name] - - def wrapper(reader, version): - obj = build_fn(reader, version) - fn(obj, *args, **kwargs) - return obj - reader_registry[module_name] = wrapper - return wrapper_factory - - -@registry_addon -def attr_map(obj, attribute_map): - for src, dst in attribute_map.items(): - setattr(obj, dst, getattr(obj, src)) - delattr(obj, src) - - -@registry_addon -def ensure_attr(obj, *attrs): - for attr in attrs: - if not hasattr(obj, attr): - setattr(obj, attr, None) - - -@registry_addon -def make_none_attr(obj, *attrs): - for attr in attrs: - setattr(obj, attr, None) - - -@registry_addon -def decrement(obj, *attrs): - for attr in attrs: - value = getattr(obj, attr) - value -= 1 - setattr(obj, attr, value) - - -@registry_addon -def decrement_positive(obj, *attrs): - for attr in attrs: - value = getattr(obj, attr) - if value > 0: - value -= 1 - setattr(obj, attr, value) - - -@registry_addon -def storage_to_size(obj, *attrs): - for attr in attrs: - value = getattr(obj, attr) - setattr(obj, attr, torch.Size(value)) - - -@registry_addon -def ensure_type(obj, type_map): - for attr, converter in type_map.items(): - value = getattr(obj, attr) - setattr(obj, attr, getattr(value, converter)()) - - -ensure_attr('Linear', 'bias', 'gradWeight', 'gradBias', 'addBuffer') -ensure_attr('CAddTable', 'inplace') -ensure_attr('SpatialFractionalMaxPooling', 'outW', 'outH', 'ratioW', 'ratioH') -ensure_attr('BatchNormalization', 'weight', 'bias', 'gradWeight', 'gradBias', - 'save_mean', 'save_std') -ensure_attr('SpatialBatchNormalization', 'weight', 'bias', 'gradWeight', 'gradBias', - 'save_mean', 'save_std') -ensure_attr('VolumetricBatchNormalization', 'weight', 'bias', 'gradWeight', 'gradBias') -ensure_attr('LookupTable', 'maxNorm', 'normType', '_gradOutput', '_sorted', '_indices') -ensure_attr('MixtureTable', 'table') -ensure_attr('WeightedEuclidean', 'fastBackward') -ensure_attr('VolumetricMaxPooling', 'ceil_mode') -ensure_attr('BCECriterion', 'buffer') -ensure_attr('SpatialClassNLLCriterion', 'weights') -ensure_attr('ClassNLLCriterion', 'weights') -ensure_attr('ParallelCriterion', 'repeatTarget') -ensure_attr('MultiMarginCriterion', 'weights') -ensure_attr('SpatialConvolution', 'bias', 'gradWeight', 'gradBias', '_gradOutput') -ensure_attr('SpatialCrossMapLRN', 'scale') -ensure_attr('Dropout', 'inplace') -make_none_attr('SpatialConvolution', 'finput', 'fgradInput', '_input') -attr_map('ReLU', {'val': 'value'}) -attr_map('Threshold', {'val': 'value'}) -attr_map('Unsqueeze', {'pos': 'dim'}) -attr_map('HardShrink', {'lambda': 'lambd'}) -attr_map('SoftShrink', {'lambda': 'lambd'}) -attr_map('GradientReversal', {'lambda': 'lambd'}) -attr_map('SpatialAdaptiveMaxPooling', {'H': 'h', 'W': 'w'}) -decrement('Index', 'dimension') -decrement('SelectTable', 'index') -decrement('SplitTable', 'dimension') -decrement_positive('JoinTable', 'dimension') -decrement('Parallel', 'inputDimension', 'outputDimension') -decrement('Concat', 'dimension') -decrement('DepthConcat', 'dimension') -decrement('Squeeze', 'dim') -decrement('Unsqueeze', 'dim') -decrement('Replicate', 'dim') -decrement('MixtureTable', 'dim') -decrement('Narrow', 'dimension', 'index') -decrement('NarrowTable', 'offset') -decrement('LookupTable', 'paddingValue') -decrement('SpatialConvolutionMap', 'connTable') -decrement('SpatialFullConvolutionMap', 'connTable') -decrement('Select', 'dimension', 'index') -decrement('Padding', 'dim', 'index') -decrement('PartialLinear', 'partition') -decrement_positive('Sum', 'dimension') 
-decrement_positive('Max', 'dimension') -decrement_positive('Min', 'dimension') -decrement_positive('Mean', 'dimension') -storage_to_size('View', 'size') -storage_to_size('DepthConcat', 'outputSize') -storage_to_size('MixtureTable', 'size') -ensure_type('PartialLinear', {'partition': 'long'}) - - -class T7ReaderException(Exception): - pass - - -class T7Reader: - - def __init__(self, - fileobj, - list_heuristic=True, - int_heuristic=True, - unknown_classes=False, - long_size=None): - """ - Params: - * `fileobj` file object to read from, must be actual file object - as it must support array, struct, and numpy - * `list_heuristic`: automatically turn tables with only consecutive - positive integral indices into lists - (default True) - * `int_heuristic`: cast all whole floats into ints (default True) - * `force_deserialize_classes`: deserialize all classes, not just the - whitelisted ones (default True) - """ - self.f = fileobj - self.memo = {} - - self.list_heuristic = list_heuristic - self.int_heuristic = int_heuristic - self.unknown_classes = unknown_classes - self.long_size = long_size - - def _read(self, fmt): - sz = struct.calcsize(fmt) - result = struct.unpack(fmt, self.f.read(sz)) - if len(result) == 1: - return result[0] - return result - - def read_boolean(self): - return self.read_int() == 1 - - def read_int(self): - return self._read('i') - - def read_long(self): - if self.long_size is None: - return self._read('l') - elif self.long_size is 8: - return self._read('q') - else: - return self._read('i') - - def read_long_array(self, n): - if self.long_size is not None: - lst = [] - for i in range(n): - lst.append(self.read_long()) - return lst - else: - LONG_SIZE_ARR = 'q' if sys.version_info[0] == 3 else 'l' - arr = array(LONG_SIZE_ARR) - arr.fromfile(self.f, n) - return arr.tolist() - - def read_float(self): - return self._read('f') - - def read_double(self): - return self._read('d') - - def read_string(self): - size = self.read_int() - byte_str = self.f.read(size) - if not isinstance(byte_str, str): - byte_str = str(byte_str, 'ascii') - return byte_str - - def read_number(self): - x = self.read_double() - # Extra checking for integral numbers: - if self.int_heuristic and x.is_integer(): - return int(x) - return x - - def memoize_index(fn): - @wraps(fn) - def wrapper(self, *args, **kwargs): - index = self.read_int() - if index in self.memo: - return self.memo[index] - result = fn(self, *args, **kwargs) - self.memo[index] = result - return result - return wrapper - - @memoize_index - def read_function(self): - size = self.read_int() - dumped = self.f.read(size) - upvalues = self.read() - return LuaFunction(size, dumped, upvalues) - - @memoize_index - def read_object(self): - version_str = self.read_string() - if version_str.startswith('V '): - version = int(version_str.partition(' ')[2]) - cls_name = self.read_string() - else: - cls_name = version_str - version = 0 # created before existence of versioning - - if cls_name in reader_registry: - return reader_registry[cls_name](self, version) - if self.unknown_classes: - return TorchObject(cls_name, self.read()) - raise T7ReaderException(("don't know how to deserialize Lua class " - "{}. 
If you want to ignore this error and load this object " - "as a dict, specify unknown_classes=True in reader's " - "constructor").format(cls_name)) - - def _can_be_list(self, table): - def is_natural(key): - return (isinstance(key, int) or - (isinstance(key, float) and key.is_integer()) and - k > 0) - natural_keys = all(map(is_natural, table.keys())) - if not natural_keys: - return False - key_sum = sum(table.keys()) - n = len(table) - return n * (n + 1) == 2 * key_sum - - @memoize_index - def read_table(self): - size = self.read_int() - table = hashable_uniq_dict() # custom hashable dict, can be a key - for i in range(size): - k = self.read() - v = self.read() - table[k] = v - if self.list_heuristic and self._can_be_list(table): - return [table[i] for i in range(1, len(table) + 1)] - return table - - def read(self): - typeidx = self.read_int() - - if typeidx == TYPE_NIL: - return None - elif typeidx == TYPE_NUMBER: - return self.read_number() - elif typeidx == TYPE_BOOLEAN: - return self.read_boolean() - elif typeidx == TYPE_STRING: - return self.read_string() - elif (typeidx == TYPE_FUNCTION or typeidx == TYPE_RECUR_FUNCTION or - typeidx == LEGACY_TYPE_RECUR_FUNCTION): - return self.read_function() - elif typeidx == TYPE_TORCH: - return self.read_object() - elif typeidx == TYPE_TABLE: - return self.read_table() - else: - raise T7ReaderException("unknown type id {}. The file may be " - "corrupted.".format(typeidx)) - - -def load_lua(filename, **kwargs): - """ - Loads the given t7 file using default settings; kwargs are forwarded - to `T7Reader`. - """ - with open(filename, 'rb') as f: - reader = T7Reader(f, **kwargs) - return reader.read()