diff --git a/.clang-tidy b/.clang-tidy index 5466a4a31d20a3..d5fc66c26d42d9 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -2,6 +2,7 @@ # NOTE: there must be no spaces before the '-', so put the comma first. Checks: ' * + ,clang-analyzer-* ,modernize-* ,-cert-err58-cpp ,-cert-err60-cpp @@ -9,6 +10,7 @@ Checks: ' ,-cppcoreguidelines-owning-memory ,-cppcoreguidelines-pro-bounds-array-to-pointer-decay ,-cppcoreguidelines-pro-bounds-constant-array-index + ,-cppcoreguidelines-pro-type-member-init ,-cppcoreguidelines-pro-type-static-cast-downcast ,-cppcoreguidelines-pro-type-vararg ,-cppcoreguidelines-special-member-functions @@ -23,9 +25,11 @@ Checks: ' ,-hicpp-braces-around-statements ,-hicpp-explicit-conversions ,-hicpp-no-array-decay + ,-hicpp-signed-bitwise ,-hicpp-special-member-functions ,-hicpp-vararg ,-llvm-header-guard + ,-llvm-include-order ,-llvm-namespace-comment ,-misc-unused-parameters ,-modernize-make-unique @@ -34,7 +38,6 @@ Checks: ' ,-readability-braces-around-statements ,-readability-else-after-return ,-readability-named-parameter - ,clang-analyzer-* ' WarningsAsErrors: '' HeaderFilterRegex: 'torch/csrc/' diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 053a9be5e05487..76ecadbd5297b8 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -64,7 +64,13 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do ;; */aten/*) # ATen uses test framework Catch2 - "$test" -r=xml -o "${junit_reports_dir}/$(basename $test).xml" + # NB: We do NOT use the xml test reporter, because + # Catch doesn't support multiple reporters + # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 + # which means that enabling XML output means you lose useful stdout + # output for Jenkins. It's more important to have useful console + # output than it is to have XML output for Jenkins. + "$test" ;; *) "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 56db6914c1c20a..48e81dfd635bce 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -43,12 +43,9 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # https://github.com/RadeonOpenCompute/hcc#hcc-with-thinlto-linking export KMTHINLTO=1 - sudo chown -R jenkins:jenkins /usr/local - rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true - python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py" - - USE_ROCM=1 python setup.py install - exit + python tools/amd_build/build_pytorch_amd.py + USE_ROCM=1 python setup.py install --user + exit 0 fi # TODO: Don't install this here diff --git a/CMakeLists.txt b/CMakeLists.txt index 651e230ab35ea7..51984e0b79c51d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -284,6 +284,8 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}) # in PROJECT_SOURCE_DIR. 
include_directories(BEFORE ${PROJECT_BINARY_DIR}) +include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/) + # ---[ Old caffe protobuf if(BUILD_CAFFE2) add_subdirectory(caffe/proto) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 562910ad86a298..4eaa4ad81ec372 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -44,6 +44,7 @@ CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig # NB: If you edit these globs, you'll have to update setup.py package_data as well FILE(GLOB base_h "*.h" "detail/*.h") FILE(GLOB base_cpp "*.cpp" "detail/*.cpp") +add_subdirectory(core) FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") @@ -62,7 +63,7 @@ FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") FILE(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") -set(all_cpu_cpp ${base_cpp} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) +set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) if(AT_MKL_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) endif() @@ -393,7 +394,7 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake -FOREACH(HEADER ${base_h} ${cuda_h} ${cudnn_h}) +FOREACH(HEADER ${base_h} ${ATen_CORE_HEADERS} ${cuda_h} ${cudnn_h}) string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER}) GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY) INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR}) diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 309c4be2e651dd..7d3fdd1cc2d4af 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -9,6 +9,9 @@ #include "ATen/detail/CUDAHooksInterface.h" #include "ATen/CUDAStream.h" +// This is temporary +#include "ATen/core/ATenCoreTest.h" + #include #include #include diff --git a/aten/src/ATen/Storage.cpp b/aten/src/ATen/Storage.cpp index f5ba512cc27105..893703a0510425 100644 --- a/aten/src/ATen/Storage.cpp +++ b/aten/src/ATen/Storage.cpp @@ -4,20 +4,30 @@ namespace at { +Storage::Storage(at::ScalarType scalar_type, size_t size, Allocator* allocator) + : storage_impl_(new StorageImpl( + scalar_type, + size, + allocator, + /* resizable */ false)) {} + +Storage::Storage( + at::ScalarType scalar_type, + at::DataPtr data_ptr, + size_t size, + const std::function& deleter) + : storage_impl_(new StorageImpl( + scalar_type, + size, + std::move(data_ptr), + /* allocator */ nullptr, + /* resizable */ false)) {} + Storage::~Storage() { if (!storage_impl_) { return; } - if (--storage_impl_->refcount == 0) { - if (storage_impl_->finalizer) { - (*storage_impl_->finalizer)(); - } - storage_impl_->finalizer = nullptr; - storage_impl_->data_ptr.clear(); - if (storage_impl_ && --storage_impl_->weakcount == 0) { - delete storage_impl_; - } - } + storage_impl_->release(); } } // namespace at diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index a5c85192e36f8c..aa27296c74d40f 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -8,6 +8,12 @@ struct AT_API Storage { public: 
Storage() = delete; Storage(StorageImpl* storage_impl) : storage_impl_(storage_impl) {} + Storage(at::ScalarType, size_t size, Allocator* allocator); + Storage( + at::ScalarType, + at::DataPtr, + size_t size, + const std::function& deleter); ~Storage(); // There are reasonable interpretations of these constructors, but they're to // be implemented on demand. diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index a26f8971310aa5..6e3d693d012c5c 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -12,8 +12,6 @@ StorageImpl::StorageImpl( : scalar_type(scalar_type), data_ptr(std::move(data_ptr)), size(size), - refcount(1), - weakcount(1), // from the strong reference resizable(resizable), allocator(allocator), finalizer(nullptr) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index c48ec51e013d4c..d9f14ee6baa5f2 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,7 @@ namespace at { struct Type; -struct TH_CPP_API StorageImpl { +struct TH_CPP_API StorageImpl : public Retainable { StorageImpl() = delete; virtual ~StorageImpl() {}; @@ -48,8 +49,6 @@ struct TH_CPP_API StorageImpl { at::ScalarType scalar_type; at::DataPtr data_ptr; ptrdiff_t size; - std::atomic refcount; - std::atomic weakcount; bool resizable; at::Allocator* allocator; std::unique_ptr finalizer; @@ -76,6 +75,14 @@ struct TH_CPP_API StorageImpl { return static_cast(this->data_ptr.get()); } + void release_resources() { + if (finalizer) { + (*finalizer)(); + } + finalizer = nullptr; + data_ptr.clear(); + } + void operator=(const StorageImpl&) = delete; virtual size_t elementSize() const { @@ -94,9 +101,6 @@ struct TH_CPP_API StorageImpl { const void* data() const { return data_ptr.get(); }; - void retain() { - ++refcount; - } int getDevice() const { return data_ptr.device().index(); diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h index 55e7d3de6dea4a..8ebcfdaeada40f 100644 --- a/aten/src/ATen/THLongStorageView.h +++ b/aten/src/ATen/THLongStorageView.h @@ -64,7 +64,6 @@ class THLongStorageView { storage.size = ref.size(); } storage.scalar_type = at::CTypeToScalarType>::to(); - storage.refcount = 0; storage.set_resizable(false); } private: diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp new file mode 100644 index 00000000000000..e409ff2d979fce --- /dev/null +++ b/aten/src/ATen/core/ATenCoreTest.cpp @@ -0,0 +1,10 @@ +#include + +namespace at { + +static int CoreTestGlobal = 0; +int CoreTest() { + return CoreTestGlobal++; +} + +} diff --git a/aten/src/ATen/core/ATenCoreTest.h b/aten/src/ATen/core/ATenCoreTest.h new file mode 100644 index 00000000000000..e79705fde394f5 --- /dev/null +++ b/aten/src/ATen/core/ATenCoreTest.h @@ -0,0 +1,18 @@ +#pragma once + +// TODO: Move this to something like ATenCoreGeneral.h +#ifdef _WIN32 +# if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +# define AT_CORE_API __declspec(dllexport) +# else +# define AT_CORE_API __declspec(dllimport) +# endif +#else +# define AT_CORE_API +#endif + +namespace at { + +AT_CORE_API int CoreTest(); + +} diff --git a/aten/src/ATen/core/CMakeLists.txt b/aten/src/ATen/core/CMakeLists.txt new file mode 100644 index 00000000000000..66c04b3a17c592 --- /dev/null +++ b/aten/src/ATen/core/CMakeLists.txt @@ -0,0 +1,13 @@ +# This file solely exists to let Caffe2 Android build get at the list +# of core files without 
having to trundle through all of ATen's CMakeLists.txt + +FILE(GLOB ATen_CORE_HEADERS "*.h") +FILE(GLOB ATen_CORE_SRCS "*.cpp") + +# Pass to parent +set(ATen_CORE_HEADERS ${ATen_CORE_HEADERS} PARENT_SCOPE) +set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) +# This is a little dodgy, because it means ALL ATen headers are made +# visible. Fortunately, you should just get a lot of undefined symbol +# errors if you go outside core +set(ATen_CORE_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../.. PARENT_SCOPE) diff --git a/aten/src/ATen/core/README.md b/aten/src/ATen/core/README.md new file mode 100644 index 00000000000000..71654f44e26f91 --- /dev/null +++ b/aten/src/ATen/core/README.md @@ -0,0 +1,5 @@ +ATen Core +--------- + +ATen Core is a minimal subset of ATen which is suitable for deployment +on mobile. Binary size of files in this folder is an important constraint. diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 085f2723bf0455..7ce3da3c9e051c 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -319,6 +319,20 @@ struct AT_CUDA_API RNNDescriptor } }; +#if CUDNN_VERSION >= 7000 + +struct AT_CUDA_API CTCLossDescriptor + : public Descriptor +{ + void set(cudnnDataType_t datatype) { + AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype)); + } +}; + +#endif + union Constant { float f; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 93c20d4be032f4..b012de25194361 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -290,7 +290,7 @@ def __init__(self, reason): 'Backend::${DenseBackend}, ScalarType::Long)'), 'THStorage*': CodeTemplate( - 'checked_cast_storage<${Storage}>(' + 'checked_cast_storage(' '&${arg_name},"${arg_name}",${arg_pos}, ' 'Backend::${Backend}, ScalarType::${ScalarName})'), 'THGenerator*': diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 0f2aaffd6eac9d..209cca57c293ff 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -103,10 +103,6 @@ def check_all_files_written(self): TEMPLATE_PATH = options.source_path + "/templates" GENERATOR_DERIVED = CodeTemplate.from_file( TEMPLATE_PATH + "/GeneratorDerived.h") -STORAGE_DERIVED_CPP = CodeTemplate.from_file( - TEMPLATE_PATH + "/StorageDerived.cpp") -STORAGE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/StorageDerived.h") - TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") @@ -237,7 +233,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['isFloatingType'] = is_floating_type env['isIntegralType'] = not is_floating_type if density == 'Dense': - env['Storage'] = "{}{}Storage".format(backend, scalar_name) env['Tensor'] = "{}{}{}Tensor".format(density_tag, backend, scalar_name) env['Type'] = "{}{}{}Type".format(density_tag, backend, scalar_name) env['DenseTensor'] = "{}{}Tensor".format(backend, scalar_name) @@ -246,7 +241,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['storage_tensor_headers'] = [] if density != 'Sparse': env['storage_tensor_headers'] = [ - '#include "ATen/{}.h"'.format(env['Storage']), '#include "ATen/{}.h"'.format(env['Tensor']), '#include "ATen/{}ByteTensor.h"'.format(env['Backend']), '#include "ATen/{}IntTensor.h"'.format(env['Backend']), @@ -322,8 +316,6 @@ def 
generate_storage_type_and_tensor(backend, density, scalar_type, declarations if density != 'Sparse': # there are no storage or tensor types for sparse; it's all uniform - fm.write(env['Storage'] + ".cpp", STORAGE_DERIVED_CPP, env) - fm.write(env['Storage'] + ".h", STORAGE_DERIVED_H, env) env['TensorDenseOrSparse'] = TENSOR_DENSE_CPP.substitute(env) fm.write(env['Tensor'] + ".cpp", TENSOR_DERIVED_CPP, env) fm.write(env['Tensor'] + ".h", TENSOR_DERIVED_H, env) @@ -379,7 +371,7 @@ def declare_outputs(): for backend, density, scalar_types in iterate_types(): scalar_name = scalar_types[0] full_backend = "Sparse" + backend if density == "Sparse" else backend - for kind in ["Storage", "Type", "Tensor"]: + for kind in ["Type", "Tensor"]: if kind != 'Type' and density == "Sparse": # No Storage or Tensor for sparse continue diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index cb24e71119f9b1..c82bf8ba0ae043 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -1,6 +1,7 @@ #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" #include "ATen/WrapDimUtilsMulti.h" +#include namespace at { namespace native { @@ -136,6 +137,8 @@ Tensor einsum(std::string eqn, TensorList tensors) { } else { in_eqn = eqn; } + // remove spaces for einsum compatibility (#9929) + in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index int64_t operand = 0; @@ -212,7 +215,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { num_output_dims++; } } - } else { // letter (hopefully) + } else if (! isspace(c)) { // letter (hopefully) AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp new file mode 100644 index 00000000000000..092b7255eb4a0d --- /dev/null +++ b/aten/src/ATen/native/LossCTC.cpp @@ -0,0 +1,365 @@ +// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +// Licensed under the BSD-3-Clause license +// This is the CPU implementation of the Connectionist Temporal Loss. +// We mostly follow Graves. +// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf +// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. +// Graves et al call the probabilities y, we use log_probs (also calling them inputs) + +#include +#include "ATen/Dispatch.h" +#include "ATen/TensorUtils.h" + +#include +#include + +namespace at { +namespace native { + +namespace { + +// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done +template +static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { + if (idx % 2 == 0) { + return BLANK; + } else { + return target[offset + stride * (idx / 2)]; + } +} + +// This kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). +// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). +// The function returns the loss and the alphas, the alphas are kept for the backward step. 
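+// Concretely, with l' denoting the augmented (blank-interleaved) target sequence, the
+// recursion of eq (6)/(7) is evaluated in log space as
+//   log_alpha_t(s) = logsumexp(log_alpha_{t-1}(s), log_alpha_{t-1}(s-1), [log_alpha_{t-1}(s-2)])
+//                    + log_probs_t(l'_s),
+// where the bracketed third term is only included when l'_s is not blank and differs from l'_{s-2}.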
The wrapper (ctc_loss below) hides +// the alphas from the user by only returning the loss. +template +std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + // log_probs: input_len x batch_size x num_labels + // targets [int64]: batch_size x target_length OR sum(target_lengths) + constexpr scalar_t neginf = -std::numeric_limits::infinity(); + using target_t = typename std::conditional::type; + + CheckedFrom c = "ctc_loss_cpu"; + auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); + auto targets_arg = TensorArg(targets, "targets", 2); + checkScalarType(c, targets_arg, target_scalar_type); + checkDim(c, log_probs_arg, 3); + checkDimRange(c, targets_arg, 1, 3); + + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + AT_CHECK(BLANK < num_labels, "blank must be in label range"); + AT_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + AT_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + + size_t tg_target_stride; + int64_t max_target_length; + std::vector tg_batch_offsets(batch_size); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + checkSize(c, targets_arg, 0, pos); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + checkSize(c, targets_arg, 0, batch_size); + AT_CHECK(targets.size(1) >= max_target_length, + "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < batch_size; b++) { + AT_CHECK(input_lengths[b] <= max_input_length, + "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + + Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); + + auto lpp = log_probs.permute({1,0,2}); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); + auto targets_data = targets.data(); + auto neg_log_likelihood_a = neg_log_likelihood.accessor(); + + // alpha calculation for the first row, the three equations for alpha_1 above eq (6) + // first the default + log_alpha.narrow(1, 0, 1).fill_(neginf); + #pragma omp parallel for + for (int64_t b = 0; b < batch_size; b++) { + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + auto log_probs_a = log_probs_a_global[b]; + auto log_alpha_a = log_alpha_a_global[b]; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // the first two items of alpha_t above eq (6) + log_alpha_a[0][0] = log_probs_a[0][BLANK]; + if (target_length > 0) + log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, 
tg_target_stride, 1, BLANK)]; + + // now the loop over the inputs + for (int64_t t=1; t 0) { + la2 = log_alpha_a[t-1][s-1]; + if (la2 > lamax) + lamax = la2; + } else { + la2 = neginf; + } + if ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != + current_target_prime)) { + la3 = log_alpha_a[t-1][s-2]; + if (la3 > lamax) + lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) // cannot do neginf-neginf + lamax = 0; + // this is the assignment of eq (6) + log_alpha_a[t][s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + log_probs_a[t][current_target_prime]; + } + } + // the likelihood is the the sum of the last two alphas, eq (8), the loss is the negative log likelihood + scalar_t l1 = log_alpha_a[input_length-1][target_length*2]; + scalar_t l2 = log_alpha_a[input_length-1][target_length*2-1]; + scalar_t m = std::max(l1, l2); + m = ((m == neginf) ? 0 : m); + scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; + neg_log_likelihood_a[b] = -log_likelihood; + } + + return std::make_tuple(neg_log_likelihood, log_alpha); +} + +// This is the backward. It consists of two phases: +// a) computing the beta analogous to the alphas in the forward (backward half of the forward-backward algorithm) (eq (10) and (11)) +// b) collecting the per-activation characters for all s and wrapping the gradient (eq (16), the collection is the sum) +template +Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + constexpr scalar_t neginf = -std::numeric_limits::infinity(); + using target_t = typename std::conditional::type; + int64_t max_input_length = log_probs.size(0); + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + Tensor grad = at::full_like(log_probs, neginf); // at this point, this is log of empty sum + + // The admin bits. We don't do much checking and assume that the forward did. 
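+  // The target-layout bookkeeping below mirrors the forward pass: for 1-d (concatenated)
+  // targets, tg_batch_offsets[i] is the running sum of target_lengths; for 2-d targets it
+  // is i * targets.stride(0) and max_target_length is taken from targets.size(1).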
+ int64_t tg_target_stride; + int64_t max_target_length; + std::vector tg_batch_offsets(batch_size); + + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + } + + Tensor log_beta = at::empty_like(log_alpha); // could be optimized to use only 2 rows + auto lpp = log_probs.permute({1,0,2}); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); + auto log_beta_a_global = log_beta.accessor(); + auto gp = grad.permute({1,0,2}); + auto grad_a_global = gp.accessor(); + auto targets_data = targets.data(); + + #pragma omp parallel for + for (int64_t b = 0; b < batch_size; b++) { + auto log_probs_a = log_probs_a_global[b]; + auto log_alpha_a = log_alpha_a_global[b]; + auto log_beta_a = log_beta_a_global[b]; + auto grad_a = grad_a_global[b]; + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // the initialization of beta before eq (10) + // here we do the fill for each batch item separately, as the input lengths will differ, so the t in which + // we start varies + if (input_length > 0) { + log_beta.narrow(0, b, 1).narrow(1, input_length-1, 1).fill_(neginf); + log_beta_a[input_length-1][2*target_length] = log_probs_a[input_length-1][BLANK]; + grad_a[input_length-1][BLANK] = log_alpha_a[input_length-1][2*target_length] + log_beta_a[input_length-1][2*target_length]; + + if (target_length > 0) { + auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 2*target_length-1, BLANK); + log_beta_a[input_length-1][2*target_length-1] = log_probs_a[input_length-1][current_target_prime]; + + // the first two are a blank and a non-blank, so we know they are different and we don't need to do log+ + grad_a[input_length-1][current_target_prime] = log_alpha_a[input_length-1][2*target_length-1] + log_beta_a[input_length-1][2*target_length-1]; + } + } + + // now loop applying eq (10) / (11) + for (int64_t t=input_length-2; t>=0; t--) { + // this loop over s could be parallel/vectorized and doesn't really need to be descending... + // alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending) + // for the cuda implementation, that gave a speed boost. 
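+      // The recursion implemented below is the log-space analogue of eq (10)/(11):
+      //   log_beta_t(s) = logsumexp(log_beta_{t+1}(s), log_beta_{t+1}(s+1), [log_beta_{t+1}(s+2)])
+      //                   + log_probs_t(l'_s),
+      // with the bracketed term only present when l'_{s+2} exists and differs from l'_s.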
+ for (int64_t s=2*target_length; s>=0; s--) { + scalar_t lb1 = log_beta_a[t+1][s]; + scalar_t lbmax = lb1; + scalar_t lb2, lb3; + auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + if (s < 2*target_length) { + lb2 = log_beta_a[t+1][s+1]; + if (lb2 > lbmax) + lbmax = lb2; + } else { + lb2 = neginf; + } + if ((s < 2*target_length-1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != + current_target_prime)) { + lb3 = log_beta_a[t+1][s+2]; + if (lb3 > lbmax) + lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) + lbmax = 0; + + log_beta_a[t][s] = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + log_probs_a[t][current_target_prime]; + // one might check whether one can vectorize this better when done after the t-loop... + // now that we have beta, we fill in the sum of alpha*beta in eq (16) + // in contrast to the cuda implementation, we only parallelize over the batch, so we don't have a concurrency + // issue (several s can map to the same target character) + // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] + scalar_t log_alpha_beta = log_alpha_a[t][s] + log_beta_a[t][s]; + scalar_t &lcab = grad_a[t][current_target_prime]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + scalar_t max = std::max(lcab, log_alpha_beta); + lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; + } + } + } + + // now grad has the sum of eq (16) + // now we wrap up the calculation by adding in the remaining items of eq (16) + // this could be a great target for further vectorization. + // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) + scalar_t nll = neg_log_likelihood.accessor()[b]; + scalar_t gr = grad_out.accessor()[b]; + for (int64_t t = 0; t < input_length; t++) { // or go for the full thing? 
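+      // At this point grad_a[t][c] holds log(sum over s with target'[s] == c of alpha_t(s)*beta_t(s)),
+      // or neginf if no s maps to c. The inner loop below converts each entry, following eq (16),
+      // into exp(lp) - exp(grad + nll - lp) * gr.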
+ for (int64_t c = 0; c < num_labels; c++) { + scalar_t& res = grad_a[t][c]; + scalar_t lp = log_probs_a[t][c]; + res = std::exp(lp)-std::exp(res + nll - lp) * gr; + } + } + // zero the remainder + if (input_length < max_input_length) { + grad.narrow(0, input_length, max_input_length - input_length).narrow(1, b, 1).zero_(); + } + } + return grad; +} + +} // namespace + +std::tuple ctc_loss_cpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } else { + return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } + }); +} + +Tensor ctc_loss_backward_cpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } else { + return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } + }); +} + +// this wrapper function dispatches to the native and cudnn implementations and hides the alpha/grad from the user (by just returning the loss) +// the gradient is implemented for _cudnn_ctc_loss (just in derivatives.yaml) and _ctc_loss and this function has automatic gradients +// it also handles the reduction if desired +Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, int64_t reduction) { + auto& ctx = at::globalContext(); + + bool use_cudnn = + detail::getCUDAHooks().compiledWithCuDNN() && + (detail::getCUDAHooks().versionCuDNN() >= 7000) && + ctx.userEnabledCuDNN() && + (BLANK == 0) && (targets.dim()==1) && + (log_probs.type().scalarType() == at::kFloat) && + (targets.type().scalarType() == at::kInt) && + (log_probs.type().backend() == Backend::CUDA); + + if (use_cudnn) { + // we don't know that input_lengths and target_lengths have the same size (they should, but we didn't check yet) + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < input_lengths.size(); b++) { + use_cudnn &= (input_lengths[b] == max_input_length); + } + for (int64_t b = 0; b < target_lengths.size(); b++) { + use_cudnn &= (target_lengths[b] <= 256); + } + } + + Tensor res; + if (use_cudnn) { + res = std::get<0>(at::_cudnn_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK, ctx.deterministicCuDNN())); + } else { + res = std::get<0>(at::_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK)); + } + if (reduction == Reduction::ElementwiseMean) { + auto target_lengths_t = at::tensor(target_lengths, res.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(res.type()); + return (res / target_lengths_t).mean(); + } else if (reduction == Reduction::Sum) { + return res.sum(); + } + return res; +} + +} } // at::native diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 4b474e0c079e77..12f22fcaf2f216 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ 
b/aten/src/ATen/native/cuda/Loops.cuh @@ -76,6 +76,9 @@ void gpu_nullary_kernel(TensorIterator& iter, const func_t& f) { using arg0_t = typename traits::result_type; int64_t numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_trivial_1d()) { auto strides = iter.get_inner_strides(); int stride0 = strides[0]; @@ -105,6 +108,9 @@ void gpu_unary_kernel(TensorIterator& iter, const func_t& f) { using arg1_t = typename traits::arg1_t; int64_t numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); @@ -152,6 +158,9 @@ void gpu_binary_kernel(TensorIterator& iter, const func_t& f) { using arg2_t = typename traits::arg2_t; int numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu new file mode 100644 index 00000000000000..70ece3f4440cf7 --- /dev/null +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -0,0 +1,625 @@ +// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +// Licensed under the BSD-3-Clause license +// This is the GPU implementation of the Connectionist Temporal Loss. +// We mostly follow Graves. +// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf +// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. +// Graves et al call the probabilities y, we use log_probs (also calling them inputs) +// A few optimizations (simmilar to those here, but also some I didn't take) are described in +// 2. Minmin Sun: http://on-demand.gputechconf.com/gtc/2016/presentation/s6383-minmin-sun-speech-recognition.pdf + +#include +#include + +#include +#include "ATen/Dispatch.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" + +#include +#include + +namespace at { +namespace native { + +namespace { + +// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done +// __restrict__ impact to be measured, https://devblogs.nvidia.com/cuda-pro-tip-optimize-pointer-aliasing/ +template +__device__ static inline int64_t get_target_prime(const target_t* __restrict__ target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { + if (idx % 2 == 0) { + return BLANK; + } else { + return target[offset + stride * (idx / 2)]; + } +} + +// this kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). +// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). +// In total it would be more efficient to compute the beta in the same kernel (e.g. cudnn does this). While the beta are not +// needed for the loss itself (just the grad), we can return log_alpha+log_beta (so same space as currently) and the overhead +// is small and the use-case for loss without grad is relatively limited. +// We parallelize by batch and target sequence. Empirically, it is faster to loop over the input (log probs) sequence and do +// target in parallel, even if it means more frequent __syncthreads. +// In contrast to the cuDNN implementation, we allow large target lengths. For this we need that all previous `s` have been +// computed when we start a new block_s. This is why we have our own for loop here. 
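+// Thread layout: threadIdx.y/blockIdx.y select the batch item b; threadIdx.x indexes the
+// augmented target position s within a chunk of blockDim.x, and the kernel itself loops over
+// those chunks (block_s) rather than distributing them over blockIdx.x, so that all s of the
+// previous chunk have been written. The loop over t is sequential and separated by
+// __syncthreads(), because log_alpha[t] is computed from log_alpha[t-1] at s, s-1, s-2.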
+template +__global__ void ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + scalar_t* __restrict__ neg_log_likelihood_data, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t BLANK) { + + constexpr scalar_t neginf = -INFINITY; + + // bookkeeping + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (b >= batch_size) + return; + + // first row (t=0), the three equations for alpha_1 above eq (6) + for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { + int64_t s = threadIdx.x + block_s; + scalar_t la; + switch (s) { + case 0: + la = log_probs_data[lp_batch_offset + lp_char_stride * BLANK]; + break; + case 1: + if (target_length > 0) { + la = log_probs_data[lp_batch_offset + lp_char_stride * get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; + } + else { + la = neginf; + } + break; + default: + la = neginf; + } + if (s < 2*max_target_length+1) + log_alpha_data[la_batch_offset + /* la_input_stride * 0 */ + la_target_stride * s] = la; + } + + for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { + int64_t s = threadIdx.x + block_s; + + // These two only depend on s, so we can cache them. + int64_t current_char; // l_s in eq (6) + bool have_three; // flag which of the two cases in eq (6) we have + if (s < 2*target_length+1) { + current_char = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + have_three = ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != + current_char)); + } else { + current_char = BLANK; + have_three = false; + } + for (int64_t t=1; t < max_input_length; t++) { + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch + if ((t < input_length) && (target_length > 0) && (s < 2*target_length+1)) { + // only for valid t, s. This is equation (6) and (7), la1, la2, la3 are the three summands, + // lamax is the maximum for the logsumexp trick. + scalar_t la1 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * s]; + scalar_t lamax = la1; + scalar_t la2, la3; + if (s > 0) { + la2 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-1)]; + if (la2 > lamax) + lamax = la2; + } else { + la2 = neginf; + } + if (have_three) { + la3 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-2)]; + if (la3 > lamax) + lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) // when all are neginf. 
(then the whole thing is neginf, but we can pretend) + lamax = 0; + + log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_char]; + } else { + // otherwise we just set to neginf + if (s < 2*max_target_length+1) + log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = neginf; + } + } + } + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch + + // compute the loss (eq (8)) + if (threadIdx.x == 0) { + scalar_t l1 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2)]; + scalar_t l2 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2-1)]; + scalar_t m = ((l1 > l2) ? l1 : l2); + m = ((m == neginf) ? 0 : m); + scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; + neg_log_likelihood_data[b] = -log_likelihood; + } +} + +// The forward computation. Lot's of admin and a call to the alpha kernel. +// Note: we do not check that the labels are in the valid range. As we use +// them for indexing in the kernels, you'll see memory errors when you +// pass corrupt labels. +// We support both a 2-dimensional tensor as targets (one set of targets in each row) and +// a 1-dimensional tensor where all targets are concatenated (and we use target_lengths +// to figure out where they begin). +// We return log_alpha (currently, might change to (log_alpha+log_beta) to be passed to the +// backward. The dispatch function will only return the loss. +template +std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + // log_probs: input_len x batch_size x num_labels + // targets [int64]: batch_size x target_length OR sum(target_lengths) + CheckedFrom c = "ctc_loss_gpu"; + using target_t = typename std::conditional::type; + auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to log_probs cuda if it isn't there already + auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); + auto targets_arg = TensorArg(targets, "targets", 2); + checkAllSameGPU(c, {log_probs_arg, targets_arg}); + + checkScalarType(c, targets_arg, target_scalar_type); + checkDim(c, log_probs_arg, 3); + checkDimRange(c, targets_arg, 1, 3); + + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + AT_CHECK(BLANK < num_labels, "blank must be in label range"); + AT_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + AT_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + + int64_t lp_input_stride = log_probs.stride(0); + int64_t lp_char_stride = log_probs.stride(2); + int64_t tg_target_stride; + + int64_t max_target_length; + auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); + auto tg_batch_offsets_data = tg_batch_offsets.data(); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + checkSize(c, targets_arg, 0, pos); + } + else { // 
batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + checkSize(c, targets_arg, 0, batch_size); + AT_CHECK(targets.size(1) >= max_target_length, + "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < batch_size; b++) { + AT_CHECK(input_lengths[b] <= max_input_length, + "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + + auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); + + Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); + + // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... + constexpr int max_threads = 1024; + int threads_target = max_threads; + while (threads_target / 2 >= 2*max_target_length+1) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + + dim3 block(threads_target, threads_batch); + dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + ctc_loss_log_alpha_gpu_kernel<<>>( + log_alpha.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, BLANK); + return std::make_tuple(neg_log_likelihood, log_alpha); +} + +// The second (backward) half of the forward backward algorithm, (10) and (11). This is parallel to the +// alpha kernel above. (As mentioned above, it might make sense do the calculation in the alpha kernel.) 
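+// The thread layout mirrors the alpha kernel: x covers the augmented target positions s in
+// chunks of blockDim.x, y covers the batch, and t runs backwards with a __syncthreads()
+// per step because log_beta[t] is computed from log_beta[t+1] at s, s+1, s+2.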
+template +__global__ void ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t BLANK) { + constexpr scalar_t neginf = -INFINITY; + + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (b >= batch_size) + return; + + // "first" row, the beta initiaization before eq (10) (t=target_length - differes per batch) + for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { + int64_t s = threadIdx.x + block_s; + scalar_t lb; + if (s == 2*target_length) { + lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * BLANK]; + } else if ((target_length > 0) && (s == 2*target_length-1)) { + int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * current_target_prime]; + } else { + lb = neginf; + } + if (s < 2*max_target_length+1) { + log_beta_data[lb_batch_offset + (input_length-1) * lb_input_stride + lb_target_stride * s] = lb; + } + } + + // go backward in s + for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { + int64_t s = threadIdx.x + block_s; + int64_t current_target_prime; + bool have_three; + if (s < 2*target_length+1) { + current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + have_three = ((s < 2*target_length-1) && + (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != + current_target_prime)); + } else { + current_target_prime = BLANK; + have_three = false; + } + // now go backward in t. Note that we need to skip the last timestep that we did above. 
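+      // For (t, s) combinations outside this batch item's valid range, the else branch below
+      // just writes neginf, so entries belonging to shorter sequences stay well defined.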
+ for (int64_t t=max_input_length-2; t>=0; t--) { + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch item + if ((t < input_length-1) && (target_length > 0) && (s < 2*target_length+1)) { + scalar_t lb1 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * s]; + scalar_t lbmax = lb1; + scalar_t lb2, lb3; + + if (s < 2*target_length) { + lb2 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+1)]; + if (lb2 > lbmax) + lbmax = lb2; + } else { + lb2 = neginf; + } + if (have_three) { + lb3 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+2)]; + if (lb3 > lbmax) + lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) + lbmax = 0; + + scalar_t lb = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime]; + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb; + } else if ((s < 2*max_target_length+1) || (t >= input_length)) { + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf; + } + } + } +} + +// This implements the subtrahend of equation (16) for all *nonblank* characters. +// It assumes you have probs in gradient_data when called +// and it modifies gradient_data to be, the gradient. +// In order to facilitate this inplace update, We don't actually do this in logspace. +// (The other variant implemented uses log_space and the differences seem to be +// not so problematic at least with unit normal distributed test activations.) +// Internally this uses atomicAdd because different threads may write to the same +// gradient position. +// This is parallelised over b and s again. +// Note that for us, the Z of eqn (16) is actually constant for all t and it is the +// likelihood - this is why we use the negative log likelihood below. +// We also multiply by the input gradient to keep with standard autograd style. +// I took this trick from [2], for moderate alphabet sizes a log-space +// calculation (with an atomic log add) is similarly in performance, but for large +// alphabets the inplace nature is a considerable advantage. +template +__global__ void ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_data, + const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, + const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + const scalar_t* __restrict__ neg_log_likelihood_data, + int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t num_labels, int64_t BLANK) { + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t s = threadIdx.x + blockIdx.x * blockDim.y; // note, this directly indexes into targets, no targets prime! 
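+  // This kernel is parallel over b and the *original* target index s; the corresponding
+  // non-blank position in the augmented sequence l' is 2*s+1, which is why the alpha/beta
+  // reads below use la/lb_target_stride * (s*2+1).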
+ + if (b >= batch_size) + return; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t gr_batch_offset = b*gr_batch_stride; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (s >= target_length) + return; + + int64_t target = targets_data[tg_batch_offset + s * tg_target_stride]; + scalar_t nll = neg_log_likelihood_data[b]; + scalar_t gr = grad_out_data[b * grad_out_batch_stride]; + + for (int64_t t = 0; t < input_length; t++) { + scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * target]; + atomicAdd(&gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * target], + -std::exp(log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * (s*2+1)] + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * (s*2+1)] + + nll - lp) * gr); + } +} + +// This is the naive implementation of equation (16). It is parallelised in batch and input timestep. +// It appears to be faster than the above method for small batch sizes. +template +__global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data, + const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, + const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + const scalar_t* __restrict__ neg_log_likelihood_data, + int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t num_labels, int64_t BLANK) { + + constexpr scalar_t neginf = -INFINITY; + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t t = threadIdx.x + blockIdx.x * blockDim.x; + + if ((t >= max_input_length) || (b >= batch_size)) + return; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t gr_batch_offset = b*gr_batch_stride; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] + for (int s = 0; s < 2*max_target_length+1; s++) { + if ((target_length > 0) && (s < 2*target_length+1)) { + int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + scalar_t log_alpha_beta = (log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s]); + scalar_t& lcab = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * current_target_prime]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + scalar_t max = ((lcab > log_alpha_beta) ? 
lcab : log_alpha_beta); + lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; + } + } + } + + scalar_t nll = neg_log_likelihood_data[b]; + scalar_t gr = grad_out_data[b * grad_out_batch_stride]; + + for (int64_t c = 0; c < num_labels; c++) { + scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; + if (t < input_length) { + scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; + res = std::exp(lp)-std::exp(res + nll - lp) * gr; + } + else { + res = 0.; + } + } +} + +// The backward. It essentially computes eq 16 by using the above kernels. +// We don't do a lot of checking as we envision this to be called only when backpropagating through a (well-checked) forward. +template +Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + constexpr scalar_t neginf = -INFINITY; + using target_t = typename std::conditional::type; + auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to cuda if it isn't there already + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + int64_t lp_input_stride = log_probs.stride(0); + int64_t lp_char_stride = log_probs.stride(2); + int64_t tg_target_stride; + + int64_t max_target_length; + auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); + auto tg_batch_offsets_data = tg_batch_offsets.data(); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + } + auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); + + Tensor log_beta = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) + + // As above, there may be better configurations to use. 
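+  // threads_target ends up as the smallest power of two that still covers the
+  // 2*max_target_length+1 augmented target positions (capped at max_threads); the remaining
+  // threads of the block are assigned to batch items.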
+ constexpr int max_threads = 1024; + int threads_target = max_threads; + while (threads_target / 2 >= 2*max_target_length+1) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + { + dim3 block(threads_target, threads_batch); + dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + + ctc_loss_backward_log_beta_gpu_kernel<<>> + (log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, BLANK); + } + + // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on + // the (capped) difference of timings. + // Note that for OK problems target length <= input length, so we + // only consider input length. + bool is_large = (2*log_probs.size(0)+(24*batch_size)/10+(2*num_labels)/10) > 450; + if (is_large) { // large alphabet, large batch + // this computes the probs, minuend in (16) + exp_out(grad, log_probs); + // now we compute the subtrahend for the blanks. It is a straightforward reduction because we know that + // blanks are in every other position. + // maybe we should kernelize this, too. + auto grad_blank = grad.narrow(2, BLANK, 1); + grad_blank -= (at::logsumexp(log_alpha.as_strided({batch_size, log_alpha.size(1), max_target_length+1}, + {log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2)*2}) + + log_beta.as_strided({batch_size, log_beta.size(1), max_target_length+1}, + {log_beta.stride(0), log_beta.stride(1), log_beta.stride(2)*2}), + 2, true) + .permute({1, 0, 2}) + .add_(neg_log_likelihood.view({1, batch_size, 1})) + .sub_(log_probs.narrow(2, BLANK, 1)) + .exp_() + ); + // Tor the non-blank characters, we use a kernel to compute the subtrahend. + // Again we might configure block and grid in a better way. + int threads_target = max_threads; + while (threads_target / 2 >= max_target_length) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + dim3 block(threads_target, threads_batch); + dim3 grid((max_target_length + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + ctc_loss_backward_collect_nonblank_gpu_kernel<<>> + (grad.data(), + grad_out.data(), grad_out.stride(0), + log_alpha.data(), log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + grad.stride(0), grad.stride(1), grad.stride(2), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, num_labels, BLANK); + } else { // small problem, use naive algorithm + // Still no block/grid configuration guru... 
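+    // The naive collect kernel is parallel over (b, t) and loops over s and the label
+    // alphabet internally, so the launch below is configured over the input length and the
+    // batch rather than over the target length.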
+ int threads_input = max_threads; + while (threads_input / 2 >= log_probs.size(0)) { + threads_input /= 2; + } + threads_batch = std::min(max_threads / threads_input, (int) batch_size); + dim3 block(threads_input, threads_batch); + dim3 grid((log_probs.size(0) + threads_input-1)/threads_input, (batch_size+threads_batch-1)/threads_batch); + + ctc_loss_backward_collect_gpu_kernel<<>> + (grad.data(), + grad_out.data(), grad_out.stride(0), + log_alpha.data(), log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + grad.stride(0), grad.stride(1), grad.stride(2), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, num_labels, BLANK); + } + return grad; +} + +} // namespace + +std::tuple ctc_loss_gpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } else { + return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } + }); +} + +Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } else { + return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } + }); +} + +} } // at::native diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp new file mode 100644 index 00000000000000..966aa20e0a128d --- /dev/null +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#if AT_CUDNN_ENABLED() + #include +#endif + + +#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000) + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { + throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); +} + +}} + +#else // AT_CUDNN_ENABLED + +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +} // namespace + +std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tensor& targets_t, IntList input_lengths_, IntList target_lengths_, int64_t BLANK, bool deterministic) { + CheckedFrom c = "cudnn_ctc_loss"; + TensorArg log_probs { log_probs_t, "log_probs", 1 }; + TensorArg targets { targets_t, "targets", 2 }; + checkDim(c, log_probs, 3); + checkScalarType(c, log_probs, kFloat); + checkDim(c, targets, 1); + checkScalarType(c, targets, kInt); + checkContiguous(c, targets); // ? 
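Taken together, the argument checks in this function and the conditions that the dispatch-time comment below says are asserted elsewhere (blank label 0, all label lengths at most 256, every input length equal to log_probs.size(0)) describe when the cuDNN kernel is usable at all. A hedged sketch of that eligibility test gathered into one predicate; the real dispatching logic is outside this excerpt and the names here are illustrative:

```cpp
#include <cstdint>
#include <vector>

// Illustrative only: the eligibility conditions for the cuDNN CTC kernel, as
// implied by the checks in _cudnn_ctc_loss and the comment about what the
// dispatching code asserts. This is not the actual PyTorch dispatch logic.
bool cudnn_ctc_loss_usable(int64_t blank,
                           int64_t time_steps,  // log_probs.size(0)
                           const std::vector<int64_t>& input_lengths,
                           const std::vector<int64_t>& target_lengths) {
  if (blank != 0) {
    return false;  // cuDNN requires the blank label to be 0
  }
  for (int64_t il : input_lengths) {
    if (il != time_steps) {
      return false;  // every sequence must use the full number of time steps
    }
  }
  for (int64_t tl : target_lengths) {
    if (tl > 256) {
      return false;  // cuDNN's label-length limit
    }
  }
  // In addition: float log_probs on CUDA, int32 1-d (concatenated) targets on CPU.
  return true;
}
```

Inputs that fail any of these conditions would presumably be routed to the native CUDA implementation shown earlier in this diff.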
+ checkBackend(c, {*log_probs}, Backend::CUDA); + checkBackend(c, {*targets}, Backend::CPU); + int64_t batch_size = log_probs->size(1); + AT_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); + AT_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); + + std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); + std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); + + setCuDNNStreamToCurrent(); + AT_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); + // checked in dispatch: + // assert other conditions for cudnnCTCLoss: all label lengths <= 256 + // all input lengths = logprob.size(0) + + auto handle = getCudnnHandle(); + + cudnnCTCLossAlgo_t algo = (deterministic ? CUDNN_CTC_LOSS_ALGO_DETERMINISTIC : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); + + Tensor probs = log_probs->softmax(2); + TensorDescriptor probs_desc{probs}; + Tensor grad = at::empty_like(probs); + TensorDescriptor grad_desc{grad}; + + CTCLossDescriptor ctc_loss_desc; + ctc_loss_desc.set(CUDNN_DATA_FLOAT); + + size_t workspace_size; + AT_CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(handle, probs_desc.desc(), grad_desc.desc(), + targets->data(), target_lengths.data(), input_lengths.data(), + algo, ctc_loss_desc.desc(), &workspace_size)); + + + Tensor workspace = log_probs->type().toScalarType(kByte).tensor(workspace_size); // new way of doing this with empty? + Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); + + AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(), + targets->data(), target_lengths.data(), input_lengths.data(), + costs.data_ptr(), grad_desc.desc(), grad.data_ptr(), algo, + ctc_loss_desc.desc(), workspace.data_ptr(), workspace_size)); + + return std::make_tuple(costs, grad); +} + + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8692d6165ff72a..6917cdb6ee6a65 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -29,6 +29,11 @@ - func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor variants: function, method +- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: _cudnn_ctc_loss + - func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor variants: function dispatch: @@ -504,6 +509,21 @@ - func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor variants: function +- func: ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: ctc_loss_cpu + CUDA: ctc_loss_gpu + +- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int64_t blank) -> Tensor + variants: function + dispatch: + CPU: ctc_loss_backward_cpu + CUDA: ctc_loss_backward_gpu + - func: det(Tensor self) -> 
Tensor - func: diagflat(Tensor self, int64_t offset=0) -> Tensor diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp deleted file mode 100644 index 0491203c3286e6..00000000000000 --- a/aten/src/ATen/templates/StorageDerived.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include "ATen/${Storage}.h" - -// ${generated_comment} - -#include "ATen/Half.h" -#include "ATen/Allocator.h" -#include - -#include "ATen/Config.h" -$extra_cuda_headers - -namespace at { - -${Storage}::${Storage}() - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - 0, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator, -#else - getTHDefaultAllocator(), -#endif - /* resizable */ true)) {} - -${Storage}::${Storage}(size_t size) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator, -#else - getTHDefaultAllocator(), -#endif - /* resizable */ true)) {} - -${Storage}::${Storage}(size_t size, Allocator* allocator) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, - allocator, - /* resizable */ false)) {} - -// TODO: Take in Device as an input to the std::function constructor - -#if ${isCUDA} -static int getPointerDevice(void* ptr) { - struct cudaPointerAttributes attr; - THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); - return attr.device; -} -#endif - -${Storage}::${Storage}( - void * data, - size_t size, - const std::function & deleter) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, - InefficientStdFunctionContext::makeDataPtr(data, deleter, -#if ${isCUDA} - Device(kCUDA, getPointerDevice(data)) -#else - kCPU -#endif - ), - /* allocator */ nullptr, - /* resizable */ false)) {} -} diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h deleted file mode 100644 index dddcd5dbf03f21..00000000000000 --- a/aten/src/ATen/templates/StorageDerived.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -// ${generated_comment} - -$th_headers - -#include "ATen/Storage.h" -#include "ATen/Context.h" - -#include - -namespace at { - -struct Allocator; - -struct ${Storage} final : public Storage { - ${Storage}(); - ${Storage}(StorageImpl* storage_impl) : Storage(storage_impl){}; - ${Storage}(size_t size); - ${Storage}(size_t size, Allocator* allocator); - ${Storage}( - void* data, - size_t size, - const std::function& deleter); - StorageImpl* storage_impl_; - - protected: - friend struct ${Type}; -}; - -} // namespace at diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index cc2f47a89180ab..aeba9fb22a3653 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -3,5 +3,5 @@ std::unique_ptr ${Tensor}::storage() { auto storage = THTensor_getStoragePtr(tensor); THStorage_retain(storage); - return std::unique_ptr(new ${Storage}(storage)); + return std::unique_ptr(new Storage(storage)); } diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index d72ba4abde2c12..249fce467debfc 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -7,7 +7,7 @@ #include "ATen/Config.h" #include "ATen/${Tensor}.h" -#include "ATen/${Storage}.h" +#include "ATen/Storage.h" #include "ATen/Scalar.h" #include "ATen/Half.h" diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 67009473dddefc..ddd1483f0436f3 100644 --- 
a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -31,6 +31,14 @@ namespace at { +#if ${isCUDA} +static int getPointerDevice(void* ptr) { + struct cudaPointerAttributes attr; + THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); + return attr.device; +} +#endif + ${Type}::${Type}(Context* context) : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { @@ -44,18 +52,44 @@ bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage() const { - return std::unique_ptr(new ${Storage}()); + return std::unique_ptr(new Storage( + ScalarType::${ScalarName}, + 0, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator +#else + getTHDefaultAllocator() +#endif + )); } std::unique_ptr ${Type}::storage(size_t size) const { - return std::unique_ptr(new ${Storage}(size)); + return std::unique_ptr(new Storage( + ScalarType::${ScalarName}, + size, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator +#else + getTHDefaultAllocator() +#endif + )); } std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return std::unique_ptr( - new ${Storage}(data,size,deleter)); + new Storage( + ScalarType::${ScalarName}, + InefficientStdFunctionContext::makeDataPtr(data, deleter, +#if ${isCUDA} + Device(kCUDA, getPointerDevice(data)) +#else + kCPU +#endif + ), + size, + deleter)); } std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { return std::unique_ptr( - new ${Storage}(size, allocator)); + new Storage(ScalarType::${ScalarName}, size, allocator)); } Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { if (retain) @@ -65,7 +99,7 @@ Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain) ${THStorage}_retain(${state,} (${THStorage}*) th_pointer); - return std::unique_ptr(new ${Storage}((${THStorage}*) th_pointer)); + return std::unique_ptr(new Storage((${THStorage}*) th_pointer)); } std::unique_ptr ${Type}::generator() const { return std::unique_ptr(new ${Generator}(context)); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 6b46c8c0b70018..8e58df97073086 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -270,6 +270,10 @@ static void test(Type & type) { auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } + SECTION("core") { + int i = CoreTest(); + REQUIRE(i + 1 == CoreTest()); + } } TEST_CASE( "basic tests CPU", "[cpu]" ) { diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index 0f05bb466651d3..0c36d5bf97fcf0 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -19,38 +19,25 @@ void THStorage_free(THStorage* storage) { if (!storage) { return; } - - if (--storage->refcount == 0) { - if (storage->finalizer) { - (*storage->finalizer)(); - } - storage->finalizer = nullptr; - storage->data_ptr.clear(); - THStorage_weakFree(storage); - } + storage->release(); } // Manually retains a weak reference void THStorage_weakRetain(THStorage *weak_storage) { - weak_storage->weakcount++; + weak_storage->weak_retain(); } // Releases a weak reference void 
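The edits in Storage.cpp above and in this file replace hand-rolled reference-count bookkeeping with retain(), release(), weak_retain(), weak_release(), and weak_lock() calls on the storage object; the deleted bodies show the intended semantics. A schematic of that strong/weak protocol, written as an assumption about the helper these methods forward to rather than code taken from this diff:

```cpp
#include <atomic>

// Schematic of the strong/weak reference protocol implied by the code removed
// above; an assumption about the helper that retain()/release()/weak_lock()
// forward to, not code from this diff.
struct RefcountedSketch {
  std::atomic<int> refcount{1};   // strong references
  std::atomic<int> weakcount{1};  // weak references, plus 1 while strong refs exist

  void retain() { ++refcount; }

  void release() {
    if (--refcount == 0) {
      // run the finalizer / clear the data pointer here, then drop the
      // implicit weak reference held by the strong side
      weak_release();
    }
  }

  void weak_retain() { ++weakcount; }

  void weak_release() {
    if (--weakcount == 0) {
      delete this;
    }
  }

  // Upgrade a weak reference: succeeds only while the strong count is nonzero.
  bool weak_lock() {
    int count = refcount.load();
    do {
      if (count == 0) {
        return false;
      }
    } while (!refcount.compare_exchange_weak(count, count + 1));
    return true;
  }
};
```

The invariant visible in the removed code is that the strong side owns one implicit weak reference, so the object is deleted only after both counts reach zero, and weak_lock() can only succeed while at least one strong reference is still alive.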
THStorage_weakFree(THStorage *weak_storage) { - if (--weak_storage->weakcount == 0) { - delete weak_storage; - } + weak_storage->weak_release(); } // Given a weak reference, returns a strong reference to a storage (which must // be freed when done) or null if the storage is already dead. THStorage* THStorage_weakLock(THStorage *weak_storage) { - for (;;) { - int refcount = weak_storage->refcount.load(); - if (refcount == 0) return nullptr; - if (weak_storage->refcount.compare_exchange_strong(refcount, refcount + 1)) break; - } - return weak_storage; + if (weak_storage->weak_lock()) + return weak_storage; + return nullptr; } THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) { @@ -95,7 +82,7 @@ ptrdiff_t THStorage_size(const THStorage *self) void THStorage_retain(THStorage *storage) { if (storage) { - ++storage->refcount; + storage->retain(); } } diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 671e2f39fb1c7e..0e8b3e4ab17bee 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -35,8 +35,6 @@ TH_API ptrdiff_t THStorage_size(const THStorage *self); -TH_API void THStorage_setFlag(THStorage *storage, const char flag); -TH_API void THStorage_clearFlag(THStorage *storage, const char flag); TH_API void THStorage_retain(THStorage *storage); TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 16329f7ed7f621..364290934263ef 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -109,6 +109,17 @@ inline int64_t* THTensor_getStridePtr(THTensor* tensor) { // NB: Non-retaining inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { + // Within PyTorch, the invariant is that storage_ is always + // initialized; we never have tensors that don't have any storage. + // However, for Caffe2, this is not true, because they have permitted + // tensors to be allocated without specifying what scalar type + // they should be, only to be filled when GetMutableData is called + // for the first time (providing the necessary type). It is an ERROR to + // invoke any PyTorch operations on such a half-constructed storage, + // and this check tests for that case. + AT_CHECK(tensor->storage_, "Cannot use PyTorch operations on a half-constructed " + "tensor. If this tensor came from Caffe2, please call GetMutableData on " + "it first; otherwise, this is a bug, please report it."); return tensor->storage_; } @@ -141,6 +152,9 @@ inline void THTensor_setStorageOffset(THTensor* tensor, ptrdiff_t storage_offset // NB: Steals ownership of storage inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { + // Caffe2 might have tensors whose storages are null, but we + // don't allow it in PyTorch. + AT_ASSERT(storage); tensor->storage_ = storage; } @@ -177,6 +191,19 @@ inline int THTensor_nDimensionLegacyAll(const THTensor* tensor) { } } +inline int64_t THTensor_strideLegacyNoScalars(const THTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); + return THTensor_isZeroDim(self) ? 
1 : self->stride(dim); +} + +inline int64_t THTensor_sizeLegacyNoScalars(const THTensor *self, int dim) +{ + THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); + return THTensor_isZeroDim(self) ? 1 : self->size(dim); +} + TH_API void THTensor_free(THTensor *self); TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index 8272b3d4020ec7..65bd6cdec850bb 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -18,7 +18,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( input = THCTensor_(newContiguous)(state, input); if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (input->dim() == 1) + if (THTensor_nDimensionLegacyNoScalars(input) == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -30,7 +30,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size(0), + 1, THTensor_sizeLegacyNoScalars(input, 0), reduction == Reduction::ElementwiseMean, margin ); @@ -42,7 +42,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size(0), + 1, THTensor_sizeLegacyNoScalars(input, 0), reduction == Reduction::ElementwiseMean, margin ); @@ -52,7 +52,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( else if (input->dim() == 2) { int nframe = input->size(0); - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -149,7 +149,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (input->dim() == 1) + if (THTensor_nDimensionLegacyNoScalars(input) == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -162,7 +162,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size(0), + 1, THTensor_sizeLegacyNoScalars(gradInput, 0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -176,7 +176,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? 
THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size(0), + 1, THTensor_sizeLegacyNoScalars(gradInput, 0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -187,7 +187,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( else if (input->dim() == 2) { int nframe = gradInput->size(0); - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index f73bd5835c04bb..0363dcf0e3996a 100644 --- a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -4,17 +4,17 @@ static bool THNN_(checkInput)(THCTensor* t) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(1) == 3; + return !t->is_empty() && t->dim() == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(0) == size0 && t->size(1) == size1; + return !t->is_empty() && t->dim() == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 1 && t->size(0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; } static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index 0699c3ac471c55..a18252b06914d6 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -17,14 +17,14 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t t, d, dt, ddt; real sum; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), + dim = THTensor_sizeLegacyNoScalars(input, 0); + AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); } else @@ -155,16 +155,16 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( int64_t t, d, dt; real g; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); - AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size(0) == dim), + dim = THTensor_sizeLegacyNoScalars(input, 0); + AT_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size(0) == dim), + AT_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), "inconsistent 
isTarget size"); } else diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 424669e5de8515..511089bfaeb5b6 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -20,13 +20,13 @@ void THNN_(MultiMarginCriterion_updateOutput)( int64_t t, d; real sum; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); + dim = THTensor_sizeLegacyNoScalars(input, 0); } else { @@ -136,19 +136,19 @@ void THNN_(MultiMarginCriterion_updateGradInput)( int64_t t, d; real g; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && (input->dim() <= 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); + dim = THTensor_sizeLegacyNoScalars(input, 0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), + AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index a28d4e78477ceb..3bf8e652fa9ed9 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -26,7 +26,7 @@ static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) { - return !t->is_empty() && t->dim() == 1 && t->size(0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c index 4b74445e047705..6b1480667d3167 100644 --- a/aten/src/THNN/generic/VolumetricConvolution.c +++ b/aten/src/THNN/generic/VolumetricConvolution.c @@ -189,7 +189,7 @@ void THNN_(VolumetricConvolution_accGradParameters)( int nOutputPlane = (int)gradWeight->size(0); if (gradBias) { - THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size(0) == nOutputPlane, 5, + THArgCheck(!gradBias->is_empty() && THTensor_nDimensionLegacyNoScalars(gradBias) == 1 && THTensor_sizeLegacyNoScalars(gradBias, 0) == nOutputPlane, 5, "gradBias tensor has wrong size" ); } diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 0d84ccbfb606a1..113b4e8ba2e2c8 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -51,6 +51,14 @@ if(BUILD_ATEN) set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) ENDIF(USE_ROCM) +else() + # Only add "ATen Core", a minimal, easy-to-compile fragment of ATen. + # This codepath should only be exercised by the Android build. + add_subdirectory(../aten/src/ATen/core ATen_core) + list(APPEND Caffe2_CPU_SRCS ${ATen_CORE_SRCS}) + list(APPEND Caffe2_CPU_INCLUDE ${ATen_CORE_INCLUDE}) + # TODO: We should probably install the headers, but I don't know + # how to do that. 
endif() # ---[ Torch build diff --git a/caffe2/core/context.h b/caffe2/core/context.h index f2831909e1587a..8bf5b9dc61dc7a 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -13,6 +13,8 @@ #include "caffe2/core/typeid.h" #include "caffe2/proto/caffe2.pb.h" +#include "ATen/core/ATenCoreTest.h" + CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage); namespace caffe2 { diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc index a6e44846e9e0be..8924a9dc931be9 100644 --- a/caffe2/core/context_test.cc +++ b/caffe2/core/context_test.cc @@ -6,6 +6,11 @@ namespace caffe2 { +TEST(CPUContextTest, ATenCoreTest) { + int i = at::CoreTest(); + EXPECT_EQ(i + 1, at::CoreTest()); +} + TEST(CPUContextTest, TestAllocAlignment) { for (int i = 1; i < 10; ++i) { auto data = CPUContext::New(i); diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index 3c5148e5b6c70f..aab127d8c56e16 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -46,28 +46,31 @@ class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) - : StorageType(std::forward(args)...), Tail(tail), Head(head) { + : StorageType(std::forward(args)...), + tail_(tail), + head_(head) { DEBUG_PRINT("Creating instance of Edge: %p\n", this); } const NodeRef& tail() const { - return Tail; + return tail_; } const NodeRef& head() const { - return Head; + return head_; } void setTail(NodeRef n) { - Tail = n; + tail_ = n; } void setHead(NodeRef n) { - Head = n; + head_ = n; } private: - NodeRef Tail; - NodeRef Head; + NodeRef tail_; + NodeRef head_; + friend class Graph; }; @@ -88,54 +91,55 @@ class Node : public StorageType, public Notifier> { /// \brief Adds an edge by reference to known in-edges. /// \p e A reference to an edge that will be added as an in-edge. void addInEdge(EdgeRef e) { - inEdges.emplace_back(e); + inEdges_.emplace_back(e); } /// \brief Adds an edge by reference to known out-edges. /// \p e A reference to an edge that will be added as an out-edge. void addOutEdge(EdgeRef e) { - outEdges.emplace_back(e); + outEdges_.emplace_back(e); } /// \brief Removes an edge by reference to known in-edges. /// \p e A reference to an edge that will be removed from in-edges. void removeInEdge(EdgeRef e) { - auto iter = std::find(inEdges.begin(), inEdges.end(), e); - assert( - iter != inEdges.end() && - "Attempted to remove edge that isn't connected to this node"); - inEdges.erase(iter); + removeEdgeInternal(inEdges_, e); } /// \brief Removes an edge by reference to known out-edges. /// \p e A reference to an edge that will be removed from out-edges. 
void removeOutEdge(EdgeRef e) { - auto iter = std::find(outEdges.begin(), outEdges.end(), e); - assert( - iter != outEdges.end() && - "Attempted to remove edge that isn't connected to this node"); - outEdges.erase(iter); + removeEdgeInternal(outEdges_, e); } const std::vector& getOutEdges() const { - return outEdges; + return outEdges_; } const std::vector& getInEdges() const { - return inEdges; + return inEdges_; } - void setInEdges(std::vector es) { - inEdges = es; + void setInEdges(std::vector edges) { + inEdges_ = edges; } - void setOutEdges(std::vector es) { - outEdges = es; + void setOutEdges(std::vector edges) { + outEdges_ = edges; } - protected: - std::vector inEdges; - std::vector outEdges; + private: + std::vector inEdges_; + std::vector outEdges_; + friend class Graph; + + void removeEdgeInternal(std::vector& edges, EdgeRef e) { + auto iter = std::find(edges.begin(), edges.end(), e); + assert( + iter != edges.end() && + "Attempted to remove edge that isn't connected to this node"); + edges.erase(iter); + } }; /// \brief Effectively a constant reference to a graph. @@ -158,46 +162,56 @@ class Subgraph { using EdgeRef = typename Graph::EdgeRef; void addNode(NodeRef n) { - Nodes.insert(n); + nodes_.insert(n); } + bool hasNode(NodeRef n) const { - return Nodes.count(n) != 0; + return nodes_.count(n) != 0; } + void removeNode(NodeRef n) { - Nodes.erase(n); + nodes_.erase(n); } void addEdge(EdgeRef e) { - Edges.insert(e); + edges_.insert(e); } - bool hasEdge(EdgeRef n) const { - return Edges.count(n) != 0; + + bool hasEdge(EdgeRef e) const { + return edges_.count(e) != 0; } + void removeEdge(EdgeRef e) { - Edges.erase(e); + edges_.erase(e); } const std::unordered_set& getNodes() const { - return Nodes; + return nodes_; + } + + const size_t getNodesCount() const { + return (size_t)nodes_.size(); } + const std::unordered_set& getEdges() const { - return Edges; + return edges_; } + private: + std::unordered_set nodes_; + std::unordered_set edges_; + void printEdges() { - for (const auto& edge : Edges) { + for (const auto& edge : edges_) { printf("Edge: %p (%p -> %p)\n", &edge, edge->tail(), edge->head()); } } void printNodes() const { - for (const auto& node : Nodes) { + for (const auto& node : nodes_) { printf("Node: %p\n", node); } } - - std::unordered_set Nodes; - std::unordered_set Edges; }; /// \brief A simple graph implementation @@ -231,21 +245,21 @@ class Graph { } void importNode(NodeRef node, Graph& otherGraph) { - for (auto it = Nodes.begin(); it != Nodes.end(); ++it) { + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { if (&(*it) == node) { - std::list>& otherNodes = otherGraph.Nodes; - otherNodes.splice(otherNodes.end(), Nodes, it, ++it); - otherGraph.NodeRefs.insert(node); + std::list>& otherNodes = otherGraph.nodes_; + otherNodes.splice(otherNodes.end(), nodes_, it, ++it); + otherGraph.nodeRefs_.insert(node); break; } } } void importEdge(EdgeRef edge, Graph& otherGraph) { - std::list>& otherEdges = otherGraph.Edges; - for (auto it = Edges.begin(); it != Edges.end(); ++it) { + std::list>& otherEdges = otherGraph.edges_; + for (auto it = edges_.begin(); it != edges_.end(); ++it) { if (&(*it) == edge) { - otherEdges.splice(otherEdges.end(), Edges, it, ++it); + otherEdges.splice(otherEdges.end(), edges_, it, ++it); break; } } @@ -313,9 +327,9 @@ class Graph { /// \return A reference to the edge created. EdgeRef createEdge(NodeRef tail, NodeRef head, U... 
data) { DEBUG_PRINT("Creating edge (%p -> %p)\n", tail, head); - this->Edges.emplace_back( + this->edges_.emplace_back( Edge(tail, head, std::forward(data)...)); - EdgeRef e = &this->Edges.back(); + EdgeRef e = &this->edges_.back(); head->addInEdge(e); tail->addOutEdge(e); return e; @@ -339,85 +353,85 @@ class Graph { /// related to the node. void deleteNode(NodeRef n, bool deleteEdges = true) { if (deleteEdges) { - auto inEdges = n->inEdges; + auto inEdges = n->inEdges_; for (auto& edge : inEdges) { deleteEdge(edge); } - auto outEdges = n->outEdges; + auto outEdges = n->outEdges_; for (auto& edge : outEdges) { deleteEdge(edge); } } - for (auto i = Nodes.begin(); i != Nodes.end(); ++i) { + for (auto i = nodes_.begin(); i != nodes_.end(); ++i) { if (&*i == n) { - NodeRefs.erase(n); - Nodes.erase(i); + nodeRefs_.erase(n); + nodes_.erase(i); break; } } } - bool hasNode(NodeRef ref) const { - return NodeRefs.find(ref) != NodeRefs.end(); + bool hasNode(NodeRef node) const { + return nodeRefs_.find(node) != nodeRefs_.end(); } /// \brief Deletes a edge from the graph. /// \p e A reference to the edge. - void deleteEdge(EdgeRef e, bool remove_ref = true) { - if (remove_ref) { - e->Tail->removeOutEdge(e); - e->Head->removeInEdge(e); + void deleteEdge(EdgeRef e, bool removeRef = true) { + if (removeRef) { + e->tail_->removeOutEdge(e); + e->head_->removeInEdge(e); } - for (auto i = Edges.begin(); i != Edges.end(); ++i) { + for (auto i = edges_.begin(); i != edges_.end(); ++i) { if (&*i == e) { - Edges.erase(i); + edges_.erase(i); break; } } } const std::vector getMutableNodes() { - std::vector v; - for (auto& n : Nodes) { + std::vector result; + for (auto& n : nodes_) { DEBUG_PRINT("Adding node to mutable output (%p)\n", &n); - v.emplace_back(&n); + result.emplace_back(&n); } - return v; + return result; } const std::vector getMutableEdges() { - std::vector v; - for (auto& e : Edges) { + std::vector result; + for (auto& e : edges_) { DEBUG_PRINT("Adding edge to mutable output (%p)\n", &e); - v.emplace_back(&e); + result.emplace_back(&e); } - return v; + return result; + } + + private: + std::list> nodes_; + std::list> edges_; + std::unordered_set nodeRefs_; + + NodeRef createNodeInternal(Node&& node) { + nodes_.emplace_back(std::move(node)); + NodeRef nodeRef = &nodes_.back(); + DEBUG_PRINT("Creating node (%p)\n", nodeRef); + nodeRefs_.insert(nodeRef); + return nodeRef; } void printEdges() { - for (const auto& edge : Edges) { + for (const auto& edge : edges_) { printf("Edge: %p (%p -> %p)\n", &edge, edge.tail(), edge.head()); } } void printNodes() const { - for (const auto& node : Nodes) { + for (const auto& node : nodes_) { printf("Node: %p\n", &node); } } - - private: - std::list> Nodes; - std::list> Edges; - std::unordered_set NodeRefs; - - NodeRef createNodeInternal(Node&& node) { - Nodes.emplace_back(std::move(node)); - NodeRef nodeRef = &Nodes.back(); - DEBUG_PRINT("Creating node (%p)\n", nodeRef); - NodeRefs.insert(nodeRef); - return nodeRef; - } }; } // namespace nom diff --git a/caffe2/core/nomnigraph/tests/binary_match_test.cc b/caffe2/core/nomnigraph/tests/binary_match_test.cc index 4834cea30f3e23..ca3fd11b3a9126 100644 --- a/caffe2/core/nomnigraph/tests/binary_match_test.cc +++ b/caffe2/core/nomnigraph/tests/binary_match_test.cc @@ -19,7 +19,7 @@ TEST(BinaryMatch, AllMatch) { auto matches = nom::algorithm::binaryMatch( &graph, [](decltype(graph)::NodeRef n) { return true; }); EXPECT_EQ(matches.size(), 1); - EXPECT_EQ(matches.front().Nodes.size(), graph.getMutableNodes().size()); + 
EXPECT_EQ(matches.front().getNodesCount(), graph.getMutableNodes().size()); } TEST(BinaryMatch, EmptyGraph) { @@ -58,9 +58,9 @@ TEST(BinaryMatch, Basic) { EXPECT_EQ(matches.size(), 1); auto match = matches.front(); - EXPECT_EQ(match.Nodes.size(), 4); + EXPECT_EQ(match.getNodesCount(), 4); std::set exp{"2", "3", "4", "6"}; - for (auto n : match.Nodes) { + for (auto n : match.getNodes()) { EXPECT_EQ(exp.count(n->data()), 1); exp.erase(n->data()); } @@ -104,16 +104,16 @@ TEST(BinaryMatch, RemovedMiddleNode) { auto match1 = matches.front(); auto match2 = matches.back(); - EXPECT_EQ(match1.Nodes.size(), 2); - EXPECT_EQ(match2.Nodes.size(), 1); + EXPECT_EQ(match1.getNodesCount(), 2); + EXPECT_EQ(match2.getNodesCount(), 1); std::set exp1{"2", "4"}; std::set exp2{"6"}; - for (auto n : match1.Nodes) { + for (auto n : match1.getNodes()) { EXPECT_EQ(exp1.count(n->data()), 1); exp1.erase(n->data()); } - for (auto n : match2.Nodes) { + for (auto n : match2.getNodes()) { EXPECT_EQ(exp2.count(n->data()), 1); exp2.erase(n->data()); } diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 45f55ab2407a2e..755e1b5a57b8a9 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType() || + if (!noiseBlob->IsType(CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index 9f032e6fe299d0..bcf588d8a384f0 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("stddev")->GetMutable(); + auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. 
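The long run of mechanical edits in this test file follows a single pattern: a Blob no longer hands out a device-specific tensor class via GetMutable, it hands out a Tensor for a requested device via GetMutableTensor(CPU); the IsType(CPU) change in mpscnn.mm above is the read-side counterpart. A small sketch of the new call pattern, with shapes and distribution parameters chosen purely for illustration:

```cpp
#include "caffe2/core/context.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/math.h"

// Sketch of the new Blob/Tensor call pattern used throughout these tests;
// the blob name, shape, and random-fill parameters are illustrative only.
void fill_example_blob(caffe2::Workspace& ws) {
  using namespace caffe2;
  // One Tensor type; the device is an argument rather than part of the type.
  auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
  t->Resize(1, 3, 8, 8);
  CPUContext ctx;
  math::RandGaussian<float, CPUContext>(
      t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
```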
@@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("pw")->GetMutable(); + auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(shared ? channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = ws.CreateBlob(name)->GetMutable(); + auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. 
@@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -683,7 +683,7 @@ void testMPSCNN() { Workspace ws; { auto* t = - ws.CreateBlob("X_cpu")->GetMutable(); + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,8 +1072,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test"; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1081,7 +1080,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1093,7 +1092,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1189,7 +1188,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); 
t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1197,7 +1196,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1205,7 +1204,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1276,7 +1275,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1284,7 +1283,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1292,7 +1291,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1386,7 +1385,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1394,7 +1393,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1402,7 +1401,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1494,7 +1493,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1502,7 +1501,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1510,7 +1509,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1608,7 +1607,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test - group"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1616,7 +1615,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(M, C / group, K, K); CPUContext ctx; math::RandGaussian( @@ -1624,7 +1623,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1727,7 +1726,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNMul Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1735,7 +1734,7 @@ void testMPSCNN() { } { - auto* t = 
ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1792,7 +1791,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSub Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1800,7 +1799,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1857,7 +1856,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1865,7 +1864,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1922,7 +1921,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1930,7 +1929,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2012,7 +2011,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNeuron Test: " << n; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 4, 12, 12); CPUContext ctx; math::RandGaussian( @@ -2066,7 +2065,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDropout Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2137,7 +2136,7 @@ void testMPSCNN() { << " - scale: " << scale; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, channels, 40, 40); CPUContext ctx; math::RandGaussian( @@ -2145,7 +2144,7 @@ void testMPSCNN() { } { // Use the batch-first encoding (n, [bbox]) - auto* t = ws.CreateBlob("R")->GetMutable(); + auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); t->Resize(6, 5); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[5 * i + 0] = 0; // batch @@ -2251,14 +2250,14 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNRoIWarp Test 2"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 8, 40, 40); CPUContext ctx; math::RandGaussian( t->size(), 4, 2, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("R")->GetMutable(); + auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); t->Resize(6, 4); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale; @@ -2363,7 +2362,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNResizeNearestOp Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, 37, 89); CPUContext ctx; math::RandGaussian( @@ -2498,7 +2497,7 @@ void testMPSCNN() { vector im_info{60, 80, 0.166667}; vector 
anchors{-38, -16, 53, 31, -120, -120, 135, 135}; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(num_images, A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = scores[i]; @@ -2506,7 +2505,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutable(); + auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU); t->Resize(num_images, 4 * A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = bbx[i]; @@ -2514,7 +2513,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("im_info")->GetMutable(); + auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU); t->Resize(num_images, 3); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = im_info[i]; @@ -2522,7 +2521,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("anchors")->GetMutable(); + auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU); t->Resize(A, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = anchors[i]; @@ -2588,7 +2587,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSoftmax Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); // Only works for spatial dimension of (1, 1) - weird. t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2662,8 +2661,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu") - ->GetMutable(); + auto* t = + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2676,7 +2675,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("W")->GetMutable(); + ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize( inputChannels, outputChannels, @@ -2693,7 +2692,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("b")->GetMutable(); + ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2810,7 +2809,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, array ? 
(i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2892,7 +2891,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2965,7 +2964,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3338,7 +3337,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { cws.RunNetOnce(initNet); { auto* t = - cws.CreateBlob(predictNet.external_input(0))->GetMutable(); + cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3350,7 +3349,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { mws.RunNetOnce(initNet); { auto* t = - mws.CreateBlob(predictNet.external_input(0))->GetMutable(); + mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3398,16 +3397,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = \ - ws.CreateBlob(predictNet.external_input(0))->GetMutable(); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = \ + ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h index 70b9ac05747511..2896bc26ac08d4 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h @@ -41,7 +41,7 @@ void RowsWhereRoILevelEquals(Eigen::Ref rois, // distribute those proposals to their appropriate FPN levels for Faster RCNN. // An anchor at one FPN level may predict an RoI that will map to another // level, hence the need to redistribute the proposals. -// Reference: detectron/lib/ops/collect_and_distribute_fpn_rpn_proposals.py +// Reference: facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py template class CollectAndDistributeFpnRpnProposalsOp final : public Operator { public: diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index 81f7d9ac43123f..faf4936495244f 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -59,7 +59,7 @@ ERMatXf ComputeAllAnchors( // regression result 'deltas' as well as predefined bounding box shapes // 'anchors'. Greedy non-maximum suppression is applied to generate the // final bounding boxes. 
-// Reference: detectron/lib/ops/generate_proposals.py +// Reference: facebookresearch/Detectron/detectron/ops/generate_proposals.py template class GenerateProposalsOp final : public Operator { public: diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 0c4c345d382cb1..333514102b7d4b 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -5,7 +5,7 @@ #include "caffe2/utils/math.h" // Bounding box utils for generate_proposals_op -// Reference: detectron/lib/utils/boxes.py +// Reference: facebookresearch/Detectron/detectron/utils/boxes.py namespace caffe2 { namespace utils { diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 5d6f87d4d30563..7b38cd6a1420d6 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -19,7 +19,7 @@ namespace utils { // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. -// Reference: detectron/lib/utils/cython_nms.pyx +// Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // scores: scores for each bounding box, size: (M, 1) @@ -78,7 +78,7 @@ std::vector nms_cpu_upright( /** * Soft-NMS implementation as outlined in https://arxiv.org/abs/1704.04503. - * Reference: detectron/lib/utils/cython_nms.pyx + * Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx * out_scores: Output updated scores after applying Soft-NMS * proposals: pixel coordinates of proposed bounding boxes, * size: (M, 4), format: [x1; y1; x2; y2] @@ -426,7 +426,7 @@ std::vector nms_cpu( // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. 
-// Reference: detectron/lib/utils/cython_nms.pyx +// Reference: facebookresearch/Detectron/detectron/lib/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // size: (M, 5), format: [ctr_x; ctr_y; w; h; angle (degrees)] for RRPN diff --git a/caffe2/opt/device.cc b/caffe2/opt/device.cc index 9abca6d67e08b3..0cfdd6c1dc91a3 100644 --- a/caffe2/opt/device.cc +++ b/caffe2/opt/device.cc @@ -9,15 +9,14 @@ std::vector getInputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector inputTensorEdges; - for (const auto& node : sg.Nodes) { + for (const auto& node : sg.getNodes()) { NOM_REQUIRE_OR_CONT(nn::is(node)); NOM_REQUIRE_OR_CONT(nn::hasInputs(node)); // Check if tensor's parents are in the sg for (const auto& input : nn::getInputs(node)) { NOM_REQUIRE_OR_CONT( - !nn::hasProducer(input) || - sg.Nodes.count(nn::getProducer(input)) == 0); + !nn::hasProducer(input) || !sg.hasNode(nn::getProducer(input))); inputTensorEdges.emplace_back(g.getEdge(input, node)); } } @@ -28,13 +27,13 @@ std::vector getOutputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector outputTensorEdges; - for (const auto& node : sg.Nodes) { + for (const auto& node : sg.getNodes()) { NOM_REQUIRE_OR_CONT(nn::is(node)); for (const auto& output : nn::getOutputs(node)) { auto consumers = nn::getConsumers(output); for (const auto& consumer : consumers) { - NOM_REQUIRE_OR_CONT(sg.Nodes.count(consumer) == 0); + NOM_REQUIRE_OR_CONT(!sg.hasNode(consumer)); outputTensorEdges.emplace_back(g.getEdge(node, output)); } NOM_REQUIRE_OR_CONT(consumers.size() == 0); diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 75baec0e9be66b..09528b99b5da51 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -323,8 +323,10 @@ void OnnxifiTransformer::Transform( // function to tell whether the ONNXIFI backend supports a given C2 op or not // TODO: choose backend id + onnxifi_library* backend = lib_; + onnxBackendID backend_id = backend_ids_[0]; auto supports = - [&exporter, &shape_hints, backend = lib_, backend_id = backend_ids_[0]]( + [&exporter, &shape_hints, backend, backend_id]( const caffe2::OperatorDef& op) { const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); // NB: this might not be a hard constraint as we can just export C2 diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index cb9932bc4542a2..e5c9c095a16c55 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -630,7 +630,7 @@ def _dense_gftrl(alpha, beta, lambda1, lambda2, w, nz, g): beta=st.floats(min_value=0.1, max_value=0.9), lambda1=st.floats(min_value=0.001, max_value=0.1), lambda2=st.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None]), + engine=st.sampled_from([None, "SIMD"]), **hu.gcs_cpu_only) def test_gftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, engine, gc, dc): diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index db870972f83946..ee60d776d55a82 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1421,7 +1421,8 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - # SIMD version of GFTRL is not supported + if engine == "SIMD": + assert core.IsOperator('GFtrl_ENGINE_SIMD') gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) diff --git a/caffe2/requirements.txt 
b/caffe2/requirements.txt index 9a1d67efc7c2f3..07fd95b72582a2 100644 --- a/caffe2/requirements.txt +++ b/caffe2/requirements.txt @@ -1,2 +1,4 @@ numpy enum34 +pyyaml +typing diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index e0ae5cc0336e2a..c573542af5763c 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -2605,6 +2605,13 @@ bool TransposeWithHPTT( axes_cm[i] = cm_fn(axes[cm_fn(i)]); dims_cm[i] = dims[cm_fn(i)]; } + + // HPTT doesn't handle 0 sized inputs. + for (auto dim : dims_cm) { + if (dim <= 0) { + return false; + } + } auto plan = hptt::create_plan( axes_cm.data(), ndim, diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index bc30f35f2a2eee..3829219a933b5d 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -1,3 +1,9 @@ +# This ill-named file does a number of things: +# - Installs Caffe2 header files (this has nothing to do with code generation) +# - Configures caffe2/core/macros.h +# - Creates an ATen target for its generated C++ files and adds it +# as a dependency + if (DEFINED ENV{PYTORCH_PYTHON}) message(STATUS "Using python found in $ENV{PYTORCH_PYTHON}") set(PYCMD "$ENV{PYTORCH_PYTHON}") @@ -14,6 +20,11 @@ configure_file( install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2 DESTINATION include FILES_MATCHING PATTERN "*.h") +if (NOT BUILD_ATEN) + install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core + DESTINATION include/ATen/core + FILES_MATCHING PATTERN "*.h") +endif() install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h DESTINATION include/caffe2/core) diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake index 2a4e61f97b0b18..b05acdf1c90558 100644 --- a/cmake/MiscCheck.cmake +++ b/cmake/MiscCheck.cmake @@ -83,22 +83,26 @@ endif() cmake_pop_check_state() # ---[ Check for NUMA support -cmake_push_check_state(RESET) -set(CMAKE_REQUIRED_FLAGS "-std=c++11") -CHECK_CXX_SOURCE_COMPILES( +if (USE_NUMA) + cmake_push_check_state(RESET) + set(CMAKE_REQUIRED_FLAGS "-std=c++11") + CHECK_CXX_SOURCE_COMPILES( "#include #include int main(int argc, char** argv) { }" CAFFE2_IS_NUMA_AVAILABLE) - -if (CAFFE2_IS_NUMA_AVAILABLE) - message(STATUS "NUMA is available") + if (CAFFE2_IS_NUMA_AVAILABLE) + message(STATUS "NUMA is available") + else() + message(STATUS "NUMA is not available") + set(CAFFE2_DISABLE_NUMA 1) + endif() + cmake_pop_check_state() else() - message(STATUS "NUMA is not available") + message(STATUS "NUMA is disabled") set(CAFFE2_DISABLE_NUMA 1) endif() -cmake_pop_check_state() # ---[ Check if we want to turn off deprecated warning due to glog. # Note(jiayq): on ubuntu 14.04, the default glog install uses ext/hash_set that diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 987044bbd212f4..9cf2326a9445fb 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -604,6 +604,12 @@ Loss functions .. autoclass:: CrossEntropyLoss :members: +:hidden:`CTCLoss` +~~~~~~~~~~~~~~~~~ + +.. autoclass:: CTCLoss + :members: + :hidden:`NLLLoss` ~~~~~~~~~~~~~~~~~ @@ -1180,6 +1186,11 @@ Loss functions .. autofunction:: cross_entropy +:hidden:`ctc_loss` +~~~~~~~~~~~~~~~~~~ + +.. autofunction:: ctc_loss + :hidden:`hinge_embedding_loss` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index c3c85797b4cd82..06b0305d28aae8 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -46,7 +46,7 @@ A tensor can be constructed from a Python :class:`list` or sequence using the If you have a numpy array and want to avoid a copy, use :func:`torch.as_tensor`. 
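As a quick illustration of the `torch.as_tensor` note above and of the dtype construction described next in this doc, a small hedged sketch (variable names are arbitrary):

```python
import numpy as np
import torch

# torch.as_tensor reuses the NumPy buffer when dtype and device allow it,
# so no copy is made and writes are visible through both views.
a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
t = torch.as_tensor(a)
t[0] = 10.0
assert a[0] == 10.0  # same underlying storage

# Constructing a tensor of a specific data type via a creation op.
z = torch.zeros(2, 3, dtype=torch.float64)
```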
-An tensor of specific data type can be constructed by passing a +A tensor of specific data type can be constructed by passing a :class:`torch.dtype` and/or a :class:`torch.device` to a constructor or tensor creation op: diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 1db0f546724103..62185d1e9dc821 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -296,6 +296,10 @@ fi # Add packages required for all Caffe2 builds add_package 'glog' add_package 'gflags' +add_package 'mkl' '>=2018' +add_package 'mkl-include' +add_package 'typing' +append_to_section 'build' '- pyyaml' caffe2_cmake_args+=("-DUSE_LEVELDB=OFF") caffe2_cmake_args+=("-DUSE_LMDB=OFF") @@ -303,10 +307,6 @@ caffe2_cmake_args+=("-DUSE_LMDB=OFF") # Add packages required for pytorch if [[ -n $integrated ]]; then add_package 'cffi' - add_package 'mkl' '>=2018' - add_package 'mkl-include' - add_package 'typing' - append_to_section 'build' '- pyyaml' append_to_section 'build' '- setuptools' #caffe2_cmake_args+=("-DBLAS=MKL") if [[ -n $cuda_ver ]]; then diff --git a/setup.py b/setup.py index 042d8668bb7b96..67330417492c3f 100644 --- a/setup.py +++ b/setup.py @@ -1023,6 +1023,7 @@ def make_relative_rpath(path): 'lib/torch_shm_manager', 'lib/*.h', 'lib/include/ATen/*.h', + 'lib/include/ATen/core/*.h', 'lib/include/ATen/detail/*.h', 'lib/include/ATen/cuda/*.h', 'lib/include/ATen/cuda/*.cuh', diff --git a/test/common_nn.py b/test/common_nn.py index 6172f4b15adc3f..33b2e94204f3ca 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -448,6 +448,43 @@ def marginrankingloss_reference(input1, input2, target, margin=0, reduction='ele return output +# this directly follows Graves et al's paper, in contrast to the production implementation, it does not use log-space +def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='elementwise_mean'): + input_lengths = torch.tensor(input_lengths, dtype=torch.long) + target_lengths = torch.tensor(target_lengths, dtype=torch.long) + dt = log_probs.dtype + log_probs = log_probs.double() # we need the accuracy as we are not in logspace + targets = targets.long() + cum_target_lengths = target_lengths.cumsum(0) + losses = [] + for i in range(log_probs.size(1)): + input_length = input_lengths[i].item() + target_length = target_lengths[i].item() + cum_target_length = cum_target_lengths[i].item() + targets_prime = targets.new_full((2 * target_length + 1,), blank) + if targets.dim() == 2: + targets_prime[1::2] = targets[i, :target_length] + else: + targets_prime[1::2] = targets[cum_target_length - target_length:cum_target_length] + probs = log_probs[:input_length, i].exp() + alpha = log_probs.new_zeros((target_length * 2 + 1,)) + alpha[0] = probs[0, blank] + alpha[1] = probs[0, targets_prime[1]] + mask_third = (targets_prime[:-2] != targets_prime[2:]) + for t in range(1, input_length): + alpha_next = alpha.clone() + alpha_next[1:] += alpha[:-1] + alpha_next[2:] += torch.where(mask_third, alpha[:-2], alpha.new_zeros(1)) + alpha = probs[t, targets_prime] * alpha_next + losses.append(-alpha[-2:].sum().log()[None]) + output = torch.cat(losses, 0) + if reduction == 'elementwise_mean': + return (output / target_lengths.to(dtype=output.dtype, device=output.device)).mean() + elif reduction == 'sum': + return output.sum() + output = output.to(dt) + return output + loss_reference_fns = { 'KLDivLoss': kldivloss_reference, 'NLLLoss': nllloss_reference, @@ -460,6 +497,7 @@ def marginrankingloss_reference(input1, input2, target, margin=0, 
reduction='ele 'CosineEmbeddingLoss': cosineembeddingloss_reference, 'TripletMarginLoss': tripletmarginloss_reference, 'MarginRankingLoss': marginrankingloss_reference, + 'CTCLoss': ctcloss_reference, } @@ -841,7 +879,7 @@ def check_criterion_jacobian(self, criterion, input, target): class TestBase(object): - _required_arg_names = {'constructor_args', 'input'} + _required_arg_names = {'constructor_args', 'input', 'extra_args'} def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwargs): self.desc = desc @@ -850,8 +888,8 @@ def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwa self.reference_fn = reference_fn for name in self._required_arg_names: if name not in kwargs and name + '_fn' not in kwargs and name + '_size' not in kwargs: - if name == 'constructor_args': - kwargs['constructor_args'] = tuple() + if name in {'constructor_args', 'extra_args'}: + kwargs[name] = tuple() else: raise ValueError("{}: Specify {} by a value, a function to generate it, or it's size!" .format(self.get_name(), name)) @@ -879,6 +917,10 @@ def _unpack(self, value): def constructor_args(self): return self._get_arg('constructor_args', True) + @property + def extra_args(self): + return self._get_arg('extra_args', True) + def _get_arg(self, name, unpack): assert name in self._required_arg_names @@ -1103,9 +1145,9 @@ def __call__(self, test_case): target = self._get_target() if self.reference_fn is not None: - out = test_case._forward_criterion(module, input, target) - expected_out = self.reference_fn(deepcopy(input), - deepcopy(target), module) + out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args) + ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,) + expected_out = self.reference_fn(*ref_args) if isinstance(expected_out, torch.Tensor): expected_out = expected_out.item() test_case.assertEqual(out, expected_out) diff --git a/test/expect/TestJit.test_concat_fusion.expect b/test/expect/TestJit.test_concat_fusion.expect index 027c2de33e5926..454a84cba1db76 100644 --- a/test/expect/TestJit.test_concat_fusion.expect +++ b/test/expect/TestJit.test_concat_fusion.expect @@ -3,12 +3,11 @@ graph(%0 : Float(3, 20) %2 : Float(6, 20) = prim::FusionGroup_0[device=0](%0, %1) return (%2); } -with prim::FusionGroup_0 = graph(%4 : Float(3, 20) - %5 : Float(3, 20)) { - %7 : int = prim::Constant[value=1]() - %8 : Float(3, 20) = aten::add(%4, %5, %7) - %6 : Float(3, 20) = aten::mul(%4, %5) - %2 : int = prim::Constant[value=0]() - %3 : Float(6, 20) = aten::cat(%8, %6, %2) - return (%3); +with prim::FusionGroup_0 = graph(%3 : Float(3, 20) + %4 : Float(3, 20)) { + %6 : int = prim::Constant[value=1]() + %7 : Float(3, 20) = aten::add(%3, %4, %6) + %5 : Float(3, 20) = aten::mul(%3, %4) + %2 : Float(6, 20) = prim::FusedConcat[dim=0](%7, %5) + return (%2); } diff --git a/test/expect/TestJit.test_constant_prop_nested.expect b/test/expect/TestJit.test_constant_prop_nested.expect new file mode 100644 index 00000000000000..5af0203c90349f --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_nested.expect @@ -0,0 +1,16 @@ +graph(%a : Dynamic) { + %1 : int = prim::Constant[value=2]() + %2 : Dynamic = aten::lt(%a, %1) + %3 : int = prim::TensorToNum(%2) + %c : int = prim::If(%3) + block0() { + %5 : int = prim::Constant[value=5]() + -> (%5) + } + block1() { + %6 : int = prim::Constant[value=1]() + -> (%6) + } + %7 : Long() = prim::NumToTensor(%c) + return (%7); +} diff --git a/test/expect/TestJit.test_constant_prop_print.expect 
b/test/expect/TestJit.test_constant_prop_print.expect new file mode 100644 index 00000000000000..7cadfdbbc6b3ea --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_print.expect @@ -0,0 +1,12 @@ +graph(%input_tensor : Dynamic) { + %1 : int = prim::Constant[value=6]() + %2 : Dynamic = ^FIXME_zerol()() + %a : Dynamic = aten::add(%1, %2) + = prim::Print(%a) + %4 : int = prim::Constant[value=2]() + %5 : int = prim::Constant[value=1]() + %b : Dynamic = aten::add(%a, %4, %5) + %7 : int = prim::Constant[value=1]() + %8 : Dynamic = aten::add(%b, %input_tensor, %7) + return (%8); +} diff --git a/test/expect/TestJit.test_constant_prop_rand.expect b/test/expect/TestJit.test_constant_prop_rand.expect new file mode 100644 index 00000000000000..a6c305258bff95 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_rand.expect @@ -0,0 +1,11 @@ +graph() { + %0 : int = prim::Constant[value=6]() + %1 : int = prim::Constant[value=0]() + %2 : int[] = prim::Constant[value=[0, -1]]() + %3 : int[] = prim::Constant[value=[3]]() + %a : Dynamic = aten::randn(%3, %0, %1, %2) + %5 : int = prim::Constant[value=2]() + %6 : int = prim::Constant[value=1]() + %b : Dynamic = aten::add(%a, %5, %6) + return (%b); +} diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect new file mode 100644 index 00000000000000..029f9ac05a0783 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_simple.expect @@ -0,0 +1,5 @@ +graph(%input_tensor : Dynamic) { + %1 : int = prim::Constant[value=8]() + %2 : Dynamic = aten::add(%1, %input_tensor) + return (%2); +} diff --git a/test/expect/TestJit.test_lstm_fusion_concat.expect b/test/expect/TestJit.test_lstm_fusion_concat.expect index 7884a95c48c9a1..f0771c133c11d9 100644 --- a/test/expect/TestJit.test_lstm_fusion_concat.expect +++ b/test/expect/TestJit.test_lstm_fusion_concat.expect @@ -16,34 +16,33 @@ graph(%0 : Float(3, 10) %21 : Float(6, 20) = prim::FusionGroup_0[device=0](%2, %16, %20, %15, %19, %14, %18, %13, %17) return (%21); } -with prim::FusionGroup_0 = graph(%16 : Float(3, 20) +with prim::FusionGroup_0 = graph(%15 : Float(3, 20) + %25 : Float(3!, 20) %26 : Float(3!, 20) - %27 : Float(3!, 20) + %29 : Float(3!, 20) %30 : Float(3!, 20) - %31 : Float(3!, 20) + %33 : Float(3!, 20) %34 : Float(3!, 20) - %35 : Float(3!, 20) - %38 : Float(3!, 20) - %39 : Float(3!, 20)) { - %40 : int = prim::Constant[value=1]() - %41 : Float(3, 20) = aten::add(%38, %39, %40) - %36 : int = prim::Constant[value=1]() - %37 : Float(3, 20) = aten::add(%34, %35, %36) - %32 : int = prim::Constant[value=1]() - %33 : Float(3, 20) = aten::add(%30, %31, %32) - %28 : int = prim::Constant[value=1]() - %29 : Float(3, 20) = aten::add(%26, %27, %28) - %25 : Float(3, 20) = aten::sigmoid(%41) - %23 : Float(3, 20) = aten::sigmoid(%37) - %21 : Float(3, 20) = aten::tanh(%33) - %19 : Float(3, 20) = aten::sigmoid(%29) - %17 : Float(3, 20) = aten::mul(%23, %16) - %14 : Float(3, 20) = aten::mul(%25, %21) - %10 : int = prim::Constant[value=1]() - %11 : Float(3, 20) = aten::add(%17, %14, %10) - %7 : Float(3, 20) = aten::tanh(%11) - %6 : Float(3, 20) = aten::mul(%19, %7) - %2 : int = prim::Constant[value=0]() - %3 : Float(6, 20) = aten::cat(%6, %11, %2) - return (%3); + %37 : Float(3!, 20) + %38 : Float(3!, 20)) { + %39 : int = prim::Constant[value=1]() + %40 : Float(3, 20) = aten::add(%37, %38, %39) + %35 : int = prim::Constant[value=1]() + %36 : Float(3, 20) = aten::add(%33, %34, %35) + %31 : int = prim::Constant[value=1]() + %32 : Float(3, 20) = 
aten::add(%29, %30, %31) + %27 : int = prim::Constant[value=1]() + %28 : Float(3, 20) = aten::add(%25, %26, %27) + %24 : Float(3, 20) = aten::sigmoid(%40) + %22 : Float(3, 20) = aten::sigmoid(%36) + %20 : Float(3, 20) = aten::tanh(%32) + %18 : Float(3, 20) = aten::sigmoid(%28) + %16 : Float(3, 20) = aten::mul(%22, %15) + %13 : Float(3, 20) = aten::mul(%24, %20) + %9 : int = prim::Constant[value=1]() + %10 : Float(3, 20) = aten::add(%16, %13, %9) + %6 : Float(3, 20) = aten::tanh(%10) + %5 : Float(3, 20) = aten::mul(%18, %6) + %2 : Float(6, 20) = prim::FusedConcat[dim=0](%5, %10) + return (%2); } diff --git a/test/expect/TestScript.test_cat_lifts.expect b/test/expect/TestScript.test_cat_lifts.expect index ea2fa3737c0556..c8c82e5199c030 100644 --- a/test/expect/TestScript.test_cat_lifts.expect +++ b/test/expect/TestScript.test_cat_lifts.expect @@ -1,15 +1,18 @@ graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%x, %x, %1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct(%x, %x) + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct() + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%x, %1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct(%x) + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } diff --git a/test/expect/TestScript.test_index_put_trace_with_view.expect b/test/expect/TestScript.test_index_put_trace_with_view.expect index 591e499da96671..37f08643f139a4 100644 --- a/test/expect/TestScript.test_index_put_trace_with_view.expect +++ b/test/expect/TestScript.test_index_put_trace_with_view.expect @@ -6,6 +6,7 @@ graph(%0 : Double(100) %5 : Double(4) = aten::view(%2, %4) %6 : int = prim::Constant[value=0]() %7 : Long(4) = aten::_cast_Long(%1, %6) - %19 : Double(100) = aten::index_put(%0, %7, %5) - return (%19); + %8 : Dynamic[] = prim::ListConstruct(%7) + %20 : Double(100) = aten::index_put(%0, %8, %5) + return (%20); } diff --git a/test/expect/TestScript.test_index_put_trace_without_view.expect b/test/expect/TestScript.test_index_put_trace_without_view.expect index 42f8e49142942e..772308223b454b 100644 --- a/test/expect/TestScript.test_index_put_trace_without_view.expect +++ b/test/expect/TestScript.test_index_put_trace_without_view.expect @@ -3,6 +3,7 @@ graph(%0 : Double(100) %2 : Double(4)) { %3 : int = prim::Constant[value=0]() %4 : Long(4) = aten::_cast_Long(%1, %3) - %16 : Double(100) = aten::index_put(%0, %4, %2) - return (%16); + %5 : Dynamic[] = prim::ListConstruct(%4) + %17 : Double(100) = aten::index_put(%0, %5, %2) + return (%17); } diff --git a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index b1ff53c2e4e7d8..3c1321664dd3fd 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -10,33 +10,33 @@ graph { t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" } type: TENSOR } } node { - input: "0" - input: "1" output: "2" - op_type: "Reshape" - } - node { - output: "3" op_type: 
"Constant" attribute { name: "value" t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" } type: TENSOR } } node { + input: "0" input: "2" + output: "3" + op_type: "Reshape" + } + node { input: "3" + input: "1" output: "4" op_type: "Tile" } diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 85ef2eac5bf2ce..0284828e400487 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -676,6 +676,40 @@ def forward(self, x): x = Variable(torch.randn(*shape)) self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + def test_repeat(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.repeat(1, 2, 3, 4) + + x = Variable(torch.randn(4, 3, 2, 1), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat_dim_overflow(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.repeat(1, 2, 3, 4) + + x = Variable(torch.randn(1, 2), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat_dynamic(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x, y): + return x.repeat(y.size()[0] / 2, y.size()[1] * 2) + + x = Variable(torch.randn(1, 2), requires_grad=True) + y = Variable(torch.randn(2, 4), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False) + def test_mean(self): shape = (3, 4, 5) for params in [{}] + [{'dim': i} for i in range(len(shape))]: diff --git a/test/test_distributions.py b/test/test_distributions.py index 7effb9012e9fc6..263c9041da991d 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -3475,7 +3475,7 @@ def setUp(self): ), ( Binomial(10, simplex_tensor), - scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor) + scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor.numpy()) ), ( Cauchy(random_var, positive_var), diff --git a/test/test_jit.py b/test/test_jit.py index ab4c907e72d19f..75fe7fca47b9f7 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1122,6 +1122,87 @@ def test_fn(ten, mask): ten = torch.rand(3, 3) self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) + def test_constant_prop_simple(self): + @torch.jit.script + def constant_prop(input_tensor): + a = 2 * 3 + b = a + 2 + return b + input_tensor + + x = torch.tensor(2) + out_ref = constant_prop(x) + self.run_pass('constant_propagation', constant_prop.graph) + out_test = constant_prop(torch.tensor(2)) + self.assertEqual(out_ref, out_test) + self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_nested(self): + @torch.jit.script + def constant_prop(a): + b = 2 + 1 + if a < 2: + c = b + 2 + else: + c = b - 2 + return c + + out_ref = constant_prop(torch.tensor(2)) + self.run_pass('constant_propagation', constant_prop.graph) + out_test = constant_prop(torch.tensor(2)) + self.assertEqual(out_ref, out_test) + 
self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_print(self): + @torch.jit.script + def constant_prop(input_tensor): + a = 2 * 3 + FIXME_zerol() + print(a) + b = a + 2 + return b + input_tensor + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_rand(self): + @torch.jit.script + def constant_prop(): + a = torch.randn([3]) + b = a + 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + # TODO: implement + @unittest.expectedFailure + def test_constant_prop_if_constant(self): + @torch.jit.script + def constant_prop(): + b = 3 + if True: + b = 1 + if False: + b = 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + # TODO: implement + @unittest.expectedFailure + def test_constant_prop_loop_constant(self): + @torch.jit.script + def constant_prop(): + b = 0 + while True: + b = 1 + while False: + b = 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + class TestBatched(TestCase): # generate random examples and create an batchtensor with them diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index 1463d15cf22d0c..de65e6fc8ce7a0 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -693,14 +693,18 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return module.backward(input, grad_output) - def _forward_criterion(self, criterion, input, target): + def _forward_criterion(self, criterion, input, target, extra_args=None): + if extra_args is None: + extra_args = tuple() with torch.no_grad(): - return criterion.forward(input, target) + return criterion.forward(input, target, *extra_args) - def _backward_criterion(self, criterion, input, target, gradOutput=None): + def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): + if extra_args is None: + extra_args = tuple() # Ignore gradOutput. It's used for non-legacy tests. 
with torch.no_grad(): - return criterion.backward(input, target) + return criterion.backward(input, target, *extra_args) def _zero_grad_parameters(self, module): return module.zeroGradParameters() diff --git a/test/test_nn.py b/test/test_nn.py index ccd698747ae8d5..4498434f1cf2c4 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -36,7 +36,7 @@ TEST_CUDNN_VERSION from common_nn import NNTestCase, ModuleTest, CriterionTest, TestBase, \ module_tests, criterion_tests, loss_reference_fns, get_reduction, \ - get_weight, smoothl1loss_reference, kldivloss_reference + get_weight, smoothl1loss_reference, kldivloss_reference, ctcloss_reference if TEST_SCIPY: @@ -383,6 +383,8 @@ class NewCriterionTest(InputVariableMixin, CriterionTest): def __init__(self, *args, **kwargs): super(NewCriterionTest, self).__init__(*args, **kwargs) self.check_gradgrad = kwargs.get('check_gradgrad', True) + self.check_half = kwargs.get('check_half', True) + self.convert_target = kwargs.get('convert_target', True) def _do_extra_tests(self, test_case, module, input, target): if not self.check_gradgrad: @@ -407,7 +409,7 @@ def apply_fn(input1, input2, *params): gradcheck(apply_fn, inputs) gradgradcheck(apply_fn, inputs) - def test_cuda(self, test_case, dtype=None): + def test_cuda(self, test_case, dtype=None, extra_args=None): def convert_dtype(obj, dtype, requires_grad=False): if isinstance(obj, torch.Tensor): return torch.tensor(obj.data, dtype=dtype, requires_grad=requires_grad) @@ -430,7 +432,7 @@ def convert_dtype(obj, dtype, requires_grad=False): if dtype is not None: cpu_input = convert_dtype(cpu_input, dtype, True) # NLLLoss requires target to be LongTensor - if not isinstance(cpu_target, torch.LongTensor): + if not isinstance(cpu_target, torch.LongTensor) and self.convert_target: cpu_target = convert_dtype(cpu_target, dtype) cpu_module.type(dtype) gpu_module.type(dtype) @@ -447,13 +449,13 @@ def convert_dtype(obj, dtype, requires_grad=False): # Loss modules with weights require consistent input/module weight types cpu_module = self.constructor(*self.constructor_args) - cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target) - gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target) + cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) + gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) # dtype can be None, so set precision in this way instead of a precision map test_case.assertEqual(cpu_output, gpu_output, 1e-1 if dtype == torch.half else 4e-4) - cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target) - gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target) + cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) + gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) test_case.assertEqual(cpu_gradInput, gpu_gradInput, 1e-1 if dtype == torch.half else 4e-4) except NotImplementedError: pass @@ -465,6 +467,10 @@ def _get_target(self): def constructor_args(self): return self._get_arg('constructor_args', False) + @property + def extra_args(self): + return self._get_arg('extra_args', False) + class TestNN(NNTestCase): _do_cuda_memory_leak_check = True @@ -479,20 +485,24 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return None return input.grad.data - def _forward_criterion(self, criterion, input, 
target): + def _forward_criterion(self, criterion, input, target, extra_args=None): + if extra_args is None: + extra_args = tuple() if isinstance(input, tuple): - args = input + (target,) + args = input + (target,) + extra_args output = criterion(*args) else: - output = criterion(input, target) + output = criterion(input, target, *extra_args) return output.item() - def _backward_criterion(self, criterion, input, target, gradOutput=None): + def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): + if extra_args is None: + extra_args = tuple() input_tuple = input if isinstance(input, tuple) else (input,) for i in input_tuple: if i.grad is not None: i.grad.data.zero_() - args = input_tuple + (target,) + args = input_tuple + (target,) + extra_args if gradOutput is None: gradOutput = torch.ones(()) criterion(*args).backward(gradOutput.type_as(input_tuple[0])) @@ -3578,6 +3588,19 @@ def test_NLLLoss_mismatched_batch(self): with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'): F.nll_loss(x, t) + @unittest.skipIf(not (TEST_CUDNN and TEST_CUDNN_VERSION >= 7000), "needs cudnn >= 7.0") + def test_CTCLoss_cudnn(self): + target_lengths = [30, 25, 20] + input_lengths = [50, 50, 50] + targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int) + log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2) + res = torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) + expected = ctcloss_reference(log_probs, targets.cuda(), input_lengths, target_lengths).float() + with torch.backends.cudnn.flags(enabled=False): + res2 = torch.nn.functional.ctc_loss(log_probs, targets.cuda().long(), input_lengths, target_lengths) + self.assertEqual(res, expected) + self.assertEqual(res2, res) + def test_RNN_cell_no_broadcasting(self): def test(cell_module, input, hx, input_size, hidden_size): cell = cell_module(input_size, hidden_size) @@ -6006,15 +6029,20 @@ def add(test_name, fn): add(test_name, lambda self, test=test: test(self)) cuda_test_name = test_name + '_cuda' # With dtype enable, it's good enough to test against three floating types + kwargs = {} + if 'extra_args' in get_function_arglist(test.test_cuda): + kwargs['extra_args'] = test.extra_args + if 'dtype' in get_function_arglist(test.test_cuda): add(cuda_test_name + '_float', lambda self, - test=test: test.test_cuda(self, dtype=torch.float)) + test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.float, **kwargs)) add(cuda_test_name + '_double', lambda self, - test=test: test.test_cuda(self, dtype=torch.double)) - add(cuda_test_name + '_half', lambda self, - test=test: test.test_cuda(self, dtype=torch.half)) + test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.double, **kwargs)) + if getattr(test, 'check_half', True): + add(cuda_test_name + '_half', lambda self, + test=test: test.test_cuda(self, dtype=torch.half, **kwargs)) else: - add(cuda_test_name, lambda self, test=test: test.test_cuda(self)) + add(cuda_test_name, lambda self, test=test, kwargs=kwargs: test.test_cuda(self, **kwargs)) def wrap_functional(fn, **kwargs): @@ -6174,6 +6202,45 @@ def forward(self, *args): check_sum_reduction=True, check_gradgrad=False, ), + dict( + module_name='CTCLoss', + constructor_args=(14,), # blank=14 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), + reference_fn=lambda i, t, il, tl, m: + 
ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + ), + dict( + module_name='CTCLoss', + desc='1d_target', + constructor_args=(14,), # blank=14 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + ), + dict( + module_name='CTCLoss', + desc='2d_int_target', + constructor_args=(0,), # blank=0 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(1, 15, (3, 30), dtype=torch.int), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, il, tl, blank=0, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + convert_target=False, + ), ] diff --git a/test/test_torch.py b/test/test_torch.py index 2a8c897713111f..745e4cf4b52616 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1690,6 +1690,7 @@ def test_einsum(self): ("...ii->...i", I), # batch diagonal # -- Other ("bn,anm,bm->ba", l, w, r), # as torch.bilinear + ("... ii->...i ", I), # batch diagonal with spaces ] for test in test_list: actual = torch.einsum(test[0], test[1:]) @@ -7947,6 +7948,20 @@ def test_ctor_with_numpy_array(self): for i in range(len(array)): self.assertEqual(tensor[i], array[i]) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_ctor_with_numpy_scalar_ctor(self): + dtypes = [ + np.double, + np.float, + np.float16, + np.int64, + np.int32, + np.int16, + np.uint8 + ] + for dtype in dtypes: + self.assertEqual(dtype(42), torch.tensor(dtype(42)).item()) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_numpy_index(self): i = np.int32([0, 1, 2]) @@ -8034,6 +8049,17 @@ def test_numpy_array_interface(self): for i in range(len(x)): self.assertEqual(geq2_x[i], geq2_array[i]) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_multiplication_numpy_scalar(self): + np_sc = np.float64(2.0) + t = torch.ones(2, requires_grad=True) + r1 = np_sc * t + self.assertIsInstance(r1, torch.Tensor) + self.assertTrue(r1.requires_grad) + r2 = t * np_sc + self.assertIsInstance(r2, torch.Tensor) + self.assertTrue(r2.requires_grad) + def test_error_msg_type_translation(self): with self.assertRaisesRegex( RuntimeError, diff --git a/third_party/onnx b/third_party/onnx index c761845c7f6880..df01dbc0051906 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit c761845c7f6880ab7eb7e2866d673834c7149e89 +Subproject commit df01dbc00519067ba6d4e818421eccd1d2552e35 diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 14fd6d7cf5e09c..e3fc510a9a0d21 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -201,6 +201,9 @@ - name: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) self, weight, bias: conv_tbc_backward(grad, self, weight, bias, pad) +- name: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank) + log_probs: _ctc_loss_backward(grad, log_probs, targets, input_lengths, target_lengths, result0, result1, blank) + - name: det(Tensor self) self: det_backward(grad, self, 
result) @@ -1145,6 +1148,8 @@ output: -2 * output * grad * grad_output # cudnn +- name: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) + log_probs: result1 - name: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) self, weight, bias: cudnn_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index abbadc70691b46..77b101dedf0f3e 100644 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -7,6 +7,7 @@ import subprocess import sys + DEFAULT_FILE_PATTERN = r".*\.[ch](pp)?" # @@ -start,count +start,count @@ @@ -26,6 +27,11 @@ def run_shell_command(arguments, process_name=None): return output.decode() +def normalize_directory_path(path): + """Normalizes a directory path.""" + return path.rstrip('/') + + def transform_globs_into_regexes(globs): """Turns glob patterns into regular expressions.""" return [glob.replace("*", ".*").replace("?", ".") for glob in globs] @@ -49,16 +55,37 @@ def git_diff(args, verbose): return run_shell_command(command, process_name="git diff") -def filter_files(files, file_patterns): +def filter_files(files, file_patterns, verbose): """Returns all files that match any of the patterns.""" filtered = [] for file in files: + has_match = False for pattern in file_patterns: - if pattern.match(file): + if pattern.search(file): filtered.append(file) + has_match = True + if not has_match and verbose: + message = "{} does not match any ".format(file) + message += "file pattern in {{{}}}".format(', '.join(map(str, file_patterns))) + print(message) return filtered +def remove_recursive_files(files, paths, verbose): + """ + Removes all files that are not immediately under one of the given paths. + """ + for file in files: + if os.path.dirname(file) in paths: + yield file + else: + if verbose: + + message = "{} ({}) does not match any ".format(file, os.path.dirname(file)) + message += "non-recursive path in {{{}}}".format(", ".join(paths)) + print(message) + + def get_changed_files(revision, paths, verbose): """Runs git diff to get the paths of all changed files.""" # --diff-filter AMU gets us files that are (A)dded, (M)odified or (U)nmerged (in the working copy). 
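The tools/clang_tidy.py hunks nearby switch file filtering from `pattern.match` to `pattern.search` and add a non-recursive path mode. A hedged Python sketch of the behavioural difference (the paths and the `keep_non_recursive` helper are illustrative, not the script's actual API):

```python
import os
import re

# re.match anchors at the start of the string; re.search does not.
# A user-supplied pattern such as r"passes/.*\.cpp" therefore only
# selects files once search() is applied to full repository paths.
path = "torch/csrc/jit/passes/constant_propagation.cpp"
pattern = re.compile(r"passes/.*\.cpp")
assert pattern.match(path) is None
assert pattern.search(path) is not None

# Sketch of the -n/--no-recursive filter: keep only files that sit
# directly under one of the requested (normalized, no trailing slash) paths.
def keep_non_recursive(files, paths):
    return [f for f in files if os.path.dirname(f) in paths]

print(keep_non_recursive(
    ["torch/csrc/jit/ir.cpp", "torch/csrc/jit/passes/canonicalize.cpp"],
    ["torch/csrc/jit"]))
# -> ['torch/csrc/jit/ir.cpp']
```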
@@ -152,7 +179,17 @@ def parse_options(): ) parser.add_argument("-r", "--revision", help="Git revision to get changes from") parser.add_argument( - "-p", "--paths", nargs="+", default=["."], help="Lint only the given paths" + "-p", + "--paths", + nargs="+", + default=["."], + help="Lint only the given paths (recursively)", + ) + parser.add_argument( + "-n", + "--no-recursive", + action="store_true", + help="If paths are supplied with -p/--paths, do not recurse into paths", ) parser.add_argument( "-s", @@ -173,12 +210,15 @@ def parse_options(): def main(): options = parse_options() + paths = map(normalize_directory_path, options.paths) if options.revision: - files = get_changed_files(options.revision, options.paths, options.verbose) + files = get_changed_files(options.revision, paths, options.verbose) else: - files = get_all_files(options.paths) + files = get_all_files(paths) + if options.no_recursive: + files = remove_recursive_files(files, paths, options.verbose) file_patterns = get_file_patterns(options.glob, options.regex) - files = filter_files(files, file_patterns) + files = filter_files(files, file_patterns, options.verbose) # clang-tidy error's when it does not get input files. if not files: diff --git a/tools/cpp_build/build_caffe2.sh b/tools/cpp_build/build_caffe2.sh index b35435acb388c6..6a50c14e05523e 100755 --- a/tools/cpp_build/build_caffe2.sh +++ b/tools/cpp_build/build_caffe2.sh @@ -24,6 +24,7 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_BUILD_TYPE:STRING=$BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX:STRING=$INSTALL_PREFIX \ -DCMAKE_INSTALL_MESSAGE=NEVER \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -G "$GENERATE" \ $PYTORCHPATH/ $MAKE -j "$JOBS" install diff --git a/tools/cpp_build/build_libtorch.sh b/tools/cpp_build/build_libtorch.sh index 92a9b9981ed697..6dd9a589cf1074 100755 --- a/tools/cpp_build/build_libtorch.sh +++ b/tools/cpp_build/build_libtorch.sh @@ -24,6 +24,7 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_INSTALL_MESSAGE=NEVER \ -Dnanopb_BUILD_GENERATOR:BOOL=OFF \ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -DVERBOSE:BOOL=${VERBOSE:-0} \ -G "$GENERATE" \ $PYTORCHPATH/torch diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index ad9ad2e05c4f4c..be99490ab41a1c 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -84,6 +84,7 @@ def from_attribute(arg): 'Scalar': '{}.toScalar()', 'ScalarType': 'static_cast({}.toInt())', 'Tensor': '{}.toTensor()', + 'TensorList': '{}.toTensorList()->elements()', 'bool': 'bool({}.toInt())', 'double': '{}.toDouble()', 'int64_t': '{}.toInt()', @@ -106,7 +107,7 @@ def from_ivalue(arg, value): ); """) CALL_METHOD = CodeTemplate("""\ -DeviceGuard device_guard(deviceForInputs(stack, ${num_dynamic_inputs})); +DeviceGuard device_guard(deviceForInputs(stack, ${num_inputs})); auto result = (${first}).${name}( ${args} ); @@ -129,7 +130,7 @@ def from_ivalue(arg, value): return Operation([=](Stack & stack) { autograd::profiler::RecordFunction record("${name}"); ${call} - drop(stack, ${num_dynamic_inputs}); + drop(stack, ${num_inputs}); pack(stack, std::move(result)); return 0; }); @@ -171,9 +172,6 @@ def is_jit_op(decl): # we currently only support vararg tensor lists when they are the _first_ argument # and the only tensor argument arguments = decl['arguments'] - # Only support a single TensorList arg - if sum(arg['simple_type'] == 'TensorList' for arg in arguments) > 1: - return False return ((not decl['api_name'].endswith('_') or 
is_magic_method(decl['api_name'])) and not decl['name'].endswith('_out') and @@ -197,7 +195,7 @@ def gen_jit_dispatch(declarations, out, template_path): ops = [] - def get_invocation(decl, args, num_dynamic_inputs): + def get_invocation(decl, args, num_inputs): # because the arg list can get lengthy we put them on a separate line def pack_arguments(args): @@ -211,67 +209,26 @@ def pack_arguments(args): elif 'namespace' in decl['method_of']: return CALL_NAMESPACE.substitute(name=decl['name'], args=pack_arguments(args), - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) else: return CALL_METHOD.substitute( name=decl['name'], first=args[0], args=pack_arguments(args[1:]), - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) - def emit_decl_variant(decl, is_positional_arg, has_tensorlist): + def emit_decl_variant(decl, is_positional_arg): # is_positional_arg is a boolean list the same length as decl['arguments'] # that indicates if the argument should come from the postional list # of inputs. If false, the argument comes from the constant attributes kw_assignments = [] arguments = [] - - if has_tensorlist: - kw_assignments.append('size_t varargs_length = node->inputs().size();') - # arguments look like: [tensor list], arg1, arg2, arg3 - # we use peek(, static_inputs) to read the non-vararg inputs - # from the end of the stack - static_inputs = sum(is_positional_arg) - 1 - num_dynamic_inputs = 'varargs_length' - tensorlist_idx = [i for i, arg in enumerate(decl['arguments']) if arg['simple_type'] == 'TensorList'][0] - else: - static_inputs = sum(is_positional_arg) - num_dynamic_inputs = static_inputs + num_inputs = sum(is_positional_arg) real_inputs = 0 for i, arg in enumerate(decl['arguments']): - # This conditional allows us to process argument lists with a flattened argument list - # with a single TensorList. Given the sequence of arguments: - # a b c [d e f g] h i # [] is the list - # - # 1. For the section where we are processing positional inputs before the - # TensorList: - # a b c [d e f g] h i # [] is the list - # ~~~~~~~~~~~~ <- N - # we set this view_length to the total number of varargs inputs (i.e. the length) - # of the whole argument list. This means that indexing into the list using peek() - # we will retrieve arguments ar their true indices (i.e. peek at 0 points to a, - # 1 points to b, etc...). Similarly, we can use peekSlice() to index into the - # list itself this way. - # 2. After the list: - # a b c [d e f g] h i # [] is the list - # ~~~~~~ <- N - # Here we set the view length to static_inputs. In our example, - # we effectively ignore the fact that we have a list here. What is - # significant is that our index i is equivalent when the view length - # is right-justified, whether we have the list or not. Concretely, - # indexing h or i from `a b c [d e f g] h i` is equvalent to indexing - # h or i from `a b c h i`. - view_length = 'varargs_length' if has_tensorlist and i < tensorlist_idx else static_inputs - - if arg['simple_type'] == 'TensorList': - # NOTE: don't advance real_inputs here. After this we are going - # to switch over to indexing from the end as if we only had - # the static arguments. 
- arguments.append('toTensors(peekSlice(stack, {}, varargs_length - {}, varargs_length))' - .format(real_inputs, static_inputs)) - elif arg['simple_type'] in default_only_types: + if arg['simple_type'] in default_only_types: arguments.append(arg['default']) elif is_tensor_arg(arg) or is_positional_arg[i]: - value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, view_length) + value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) arguments.append(from_ivalue(arg, value)) real_inputs += 1 else: @@ -279,20 +236,18 @@ def emit_decl_variant(decl, is_positional_arg, has_tensorlist): kw_assignments.append(assign) arguments.append(arg['name']) - call = get_invocation(decl, arguments, num_dynamic_inputs) + call = get_invocation(decl, arguments, num_inputs) returns = decl['returns'] - all_scalars = all(r['dynamic_type'] != 'TensorList' for r in returns) constructor = CONSTRUCTOR.substitute(name=decl['name'], call=call, kw_assignments=kw_assignments, - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) return constructor def emit_decl(decl): arguments = decl['arguments'] - has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments) num_tensor_args = sum(map(is_tensor_arg, arguments)) # Right now, we generate dispatch methods that either take all non-tensor arguments @@ -304,12 +259,12 @@ def emit_decl(decl): all_real_arguments_are_inputs = tuple(arg['simple_type'] not in default_only_types for arg in arguments) only_tensors_are_inputs = tuple(is_tensor_arg(arg) for arg in arguments) - variants = [emit_decl_variant(decl, all_real_arguments_are_inputs, has_tensorlist)] + variants = [emit_decl_variant(decl, all_real_arguments_are_inputs)] # in some cases there are no inputs that are possibly attributes, so the # variants are actually the same. If so avoid generating both to save compilation # time. if all_real_arguments_are_inputs != only_tensors_are_inputs: - variants += [',', emit_decl_variant(decl, only_tensors_are_inputs, has_tensorlist)] + variants += [',', emit_decl_variant(decl, only_tensors_are_inputs)] ops.append(OPERATOR.substitute(signature=signature(decl), ops=variants)) diff --git a/tools/jit/templates/register_aten_ops.cpp b/tools/jit/templates/register_aten_ops.cpp index 06ad9c2840b1cc..2ff38e80b56e3f 100644 --- a/tools/jit/templates/register_aten_ops.cpp +++ b/tools/jit/templates/register_aten_ops.cpp @@ -29,7 +29,6 @@ using autograd::Variable; using autograd::variable_list; using at::Scalar; using at::Tensor; -using at::TensorList; using at::TensorOptions; using at::DeviceGuard; @@ -42,12 +41,6 @@ int deviceForInputs(Stack & stack, size_t N) { return t.type().is_cuda() ? 
(int) t.get_device() : -1; } -std::vector toTensors(at::ArrayRef ivalues) { - return fmap(ivalues, [](const IValue& v) { - return v.toTensor(); - }); -} - template std::array as_bool_array(const std::vector& vec) { std::array res; diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 88546fda7ed604..3e43a6e00b7359 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -138,6 +138,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp diff --git a/torch/csrc/api/include/torch/nn/cursor.h b/torch/csrc/api/include/torch/nn/cursor.h index c0f56eea72fbd0..2ae5c5d93752c1 100644 --- a/torch/csrc/api/include/torch/nn/cursor.h +++ b/torch/csrc/api/include/torch/nn/cursor.h @@ -48,7 +48,7 @@ class CursorBase { /// A `(key, value)` pair exposed by cursor iterators. struct Item { - Item(const std::string& key_, T& module_); + Item(const std::string& key_, T& value_); T& operator*(); const T& operator*() const; diff --git a/torch/csrc/autograd/anomaly_mode.h b/torch/csrc/autograd/anomaly_mode.h index 7327d03f11b887..1f12f0a65c7460 100644 --- a/torch/csrc/autograd/anomaly_mode.h +++ b/torch/csrc/autograd/anomaly_mode.h @@ -18,7 +18,7 @@ struct AnomalyMode { struct AnomalyMetadata { - virtual ~AnomalyMetadata(){}; + virtual ~AnomalyMetadata() = default; virtual void store_stack() = 0; virtual void print_stack() = 0; }; diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 8309ba1ce1038c..74e15f5caefe9d 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -159,7 +159,7 @@ struct GraphTask { std::unordered_map exec_info; std::vector captured_vars; - void init_to_execute(Function& graph_root, const edge_list& captures); + void init_to_execute(Function& graph_root, const edge_list& outputs); // The value of worker_device in the thread that created this task. 
// See Note [Reentrant backwards] @@ -499,14 +499,14 @@ struct ClearCallbacks { std::mutex& callbacks_lock; }; -auto Engine::execute(const edge_list& input_roots, +auto Engine::execute(const edge_list& roots, const variable_list& inputs, bool keep_graph, bool create_graph, const edge_list& outputs) -> variable_list { std::call_once(start_threads_flag, &Engine::start_threads, this); - validate_outputs(input_roots, const_cast(inputs), [](const std::string& msg) { + validate_outputs(roots, const_cast(inputs), [](const std::string& msg) { return msg; }); @@ -517,7 +517,7 @@ auto Engine::execute(const edge_list& input_roots, std::unique_lock lock(graph_task.mutex); // Now compute the dependencies for all executable functions and queue the root - auto graph_root = std::make_shared(input_roots, inputs); + auto graph_root = std::make_shared(roots, inputs); compute_dependencies(graph_root.get(), graph_task); if (!outputs.empty()) { graph_task.init_to_execute(*graph_root, outputs); diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index db8b3357ac2536..94490303ccc240 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -57,7 +57,7 @@ struct TORCH_API Engine { ReadyQueue& ready_queue(int device); void start_threads(); virtual void thread_init(int device); - virtual void thread_main(GraphTask *task); + virtual void thread_main(GraphTask *graph_task); virtual void thread_on_exception(FunctionTask& task, std::exception& e); std::once_flag start_threads_flag; diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b02bdf3928f2ff..46a80b90b29ffa 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -328,7 +328,7 @@ struct TORCH_API Function : std::enable_shared_from_this { /// See Function::is_traceable() for definition. 
struct TraceableFunction : public Function { using Function::Function; - bool is_traceable() final override { + bool is_traceable() final { return true; } }; diff --git a/torch/csrc/autograd/function_hook.h b/torch/csrc/autograd/function_hook.h index 03c52fea54535c..f3cf5b2e793c6a 100644 --- a/torch/csrc/autograd/function_hook.h +++ b/torch/csrc/autograd/function_hook.h @@ -10,12 +10,12 @@ struct Variable; using variable_list = std::vector; struct FunctionPreHook { - virtual ~FunctionPreHook() {} + virtual ~FunctionPreHook() = default; virtual variable_list operator()(const variable_list& grads) = 0; }; struct FunctionPostHook { - virtual ~FunctionPostHook() {} + virtual ~FunctionPostHook() = default; virtual variable_list operator()(const variable_list& grad_input, const variable_list& grad_output) = 0; }; diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index 44d4b7f106c860..db86ae428d4060 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -6,9 +6,9 @@ namespace torch { namespace autograd { struct AccumulateGrad : public Function { - explicit AccumulateGrad(Variable variable); + explicit AccumulateGrad(Variable variable_); - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; Variable variable; }; diff --git a/torch/csrc/autograd/functions/basic_ops.cpp b/torch/csrc/autograd/functions/basic_ops.cpp index b04b0f25ca42d5..c4a54d99d08702 100644 --- a/torch/csrc/autograd/functions/basic_ops.cpp +++ b/torch/csrc/autograd/functions/basic_ops.cpp @@ -11,7 +11,7 @@ namespace torch { namespace autograd { -auto Error::apply(variable_list&& grad_outputs) -> variable_list { +auto Error::apply(variable_list&& inputs) -> variable_list { throw std::runtime_error(msg); } diff --git a/torch/csrc/autograd/functions/tensor.h b/torch/csrc/autograd/functions/tensor.h index aa4b422136930f..1a21a360ba9fc2 100644 --- a/torch/csrc/autograd/functions/tensor.h +++ b/torch/csrc/autograd/functions/tensor.h @@ -13,7 +13,7 @@ namespace torch { namespace autograd { struct CopyBackwards : public Function { - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; at::Type *src_type; int32_t src_device = -1; @@ -23,9 +23,12 @@ struct CopyBackwards : public Function { // grad[idx] is defined by the relative sizes, strides, and offset of base and // view. struct CopySlices : public Function { - CopySlices(const Variable& base, at::TensorGeometry view, std::shared_ptr fn); + CopySlices( + const Variable& base_var, + at::TensorGeometry view_, + std::shared_ptr fn_); - variable_list apply(variable_list&& grads) override; + variable_list apply(variable_list&& inputs) override; void release_variables() override; at::TensorGeometry base; diff --git a/torch/csrc/autograd/input_buffer.h b/torch/csrc/autograd/input_buffer.h index 2e0febfc84b0bc..f1c02e0d78e565 100644 --- a/torch/csrc/autograd/input_buffer.h +++ b/torch/csrc/autograd/input_buffer.h @@ -22,14 +22,14 @@ struct InputBuffer { InputBuffer& operator=(InputBuffer&& other) = default; // Accumulates the variable at a specified index. - void add(size_t idx, Variable var); + void add(size_t pos, Variable var); int device() const; Variable operator[](size_t pos) { return buffer[pos]; } // Returns the inputs as a list of variables. Destroys given InputBuffer. 
- static std::vector variables(InputBuffer&& buffer); + static std::vector variables(InputBuffer&& g); private: std::vector buffer; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index dd77dc193ba9bd..ba0fee1510baa2 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -185,7 +185,7 @@ struct TORCH_API RecordFunction { using thread_event_lists = std::vector>; // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that // there no autograd functions are being executed when these function are used. -TORCH_API void enableProfiler(ProfilerState state); +TORCH_API void enableProfiler(ProfilerState new_state); TORCH_API thread_event_lists disableProfiler(); } // namespace profiler diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index 61a1d3b3eac172..037f06a7f95c11 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -45,10 +45,10 @@ class TORCH_API SavedVariable { std::weak_ptr grad_accumulator_; VariableVersion version_counter_; - uint32_t saved_version_; - uint32_t output_nr_; + uint32_t saved_version_ = 0; + uint32_t output_nr_ = 0; bool was_default_constructed_ = true; - bool requires_grad_; - bool has_grad_fn_; + bool requires_grad_ = false; + bool has_grad_fn_ = false; }; }} // namespace torch::autograd diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index c97a0322359a4d..633b8028f765df 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -263,7 +263,7 @@ struct Variable::Impl : public at::TensorImpl { TORCH_API explicit Impl( at::Tensor data, bool requires_grad = false, - Edge edge = Edge()); + Edge gradient_edge = Edge()); ~Impl() override; diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index d6bd90cb708784..f404b4ce9a05c6 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -59,20 +59,21 @@ struct ArgumentSpec { for(int32_t i = 0; i < num_inputs; i++) { auto & pod = pods[i]; pod.is_tensor = static_cast(inputs[i].isTensor()); - if (!pod.is_tensor) continue; - at::Tensor t = inputs[i].toTensor(); - pod.defined = t.defined(); - if (pod.defined) { - pod.type = static_cast(t.type().scalarType()); - pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); - pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); - total_dims += t.ndimension(); - auto sizes = t.sizes(); - std::copy(sizes.begin(),sizes.end(), next_dim); - next_dim += sizes.size(); - auto strides = t.strides(); - std::copy(strides.begin(), strides.end(), next_dim); - next_dim += strides.size(); + if (pod.is_tensor) { + at::Tensor t = inputs[i].toTensor(); + pod.defined = t.defined(); + if (pod.defined) { + pod.type = static_cast(t.type().scalarType()); + pod.device = (!t.type().is_cuda()) ? 
-1 : t.get_device(); + pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); + total_dims += t.ndimension(); + auto sizes = t.sizes(); + std::copy(sizes.begin(),sizes.end(), next_dim); + next_dim += sizes.size(); + auto strides = t.strides(); + std::copy(strides.begin(), strides.end(), next_dim); + next_dim += strides.size(); + } } // each POD has a running tally of all dimensions including its own pod.total_dims = total_dims; diff --git a/torch/csrc/jit/attributes.h b/torch/csrc/jit/attributes.h index f69790cab52e00..53b87af9ef991d 100644 --- a/torch/csrc/jit/attributes.h +++ b/torch/csrc/jit/attributes.h @@ -28,7 +28,7 @@ struct AttributeValue { Symbol name; virtual AttributeKind kind() const = 0; virtual Ptr clone() const = 0; - virtual ~AttributeValue() {} + virtual ~AttributeValue() = default; }; template @@ -101,7 +101,7 @@ struct AttributeError : public std::exception { // we return Derived* pointers because Nodes are normally held as pointers. template struct Attributes { - Attributes() {} + Attributes() = default; void copyAttributes(const Attributes & rhs) { values_.clear(); for(auto & i : rhs.values_) { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index c830dc45a537f5..7f250bf7c452aa 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace torch { namespace jit { @@ -564,14 +565,13 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { reverse_block->owningNode()->destroy(); } -Gradient differentiate(std::shared_ptr& _graph, const std::vector& requires_grad) { +Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad) { Gradient grad_desc; // Take ownership of the graph - JIT_ASSERTM( - _graph.use_count() == 1, - "differentiate will mutate and destroy the graph, so it requires " - "graph.use_count() == 1, but found ", _graph.use_count()); - std::swap(_graph, grad_desc.f); + JIT_ASSERTM(graph.use_count() == 1, + "differentiate will mutate and destroy the graph, so it requires " + "graph.use_count() == 1, but found %d", graph.use_count()); + std::swap(graph, grad_desc.f); // XXX: Take care when handling outputs - they can be duplicated! 
WithInsertPoint guard(grad_desc.f->block()); diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index 6dd2be9db0e779..ea2b7a1170efeb 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -4,7 +4,9 @@ #include "torch/csrc/jit/ir.h" #include + #include +#include namespace torch { namespace jit { diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 3c4ad0c130ea31..e8445734662b55 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -24,6 +24,11 @@ Value* insertConstant( } else if(val.isIntList()) { n->is_(attr::value, val.toIntList()->elements()); n->output()->setType(ListType::ofInts()); + } else if(val.isTensorList()) { + n->ts_(attr::value, fmap(val.toTensorList()->elements(), [](const at::Tensor & t) { + return autograd::Variable(t).data(); + })); + n->output()->setType(ListType::ofTensors()); } else { throw std::runtime_error("Unsupported value kind: " + val.tagKind()); } @@ -66,6 +71,14 @@ RegisterOperators reg({ push(stack, is); return 0; }; + } else if(type->isSubtypeOf(ListType::ofTensors())) { + auto ts = fmap(node->ts(attr::value), [](const at::Tensor & t) -> at::Tensor { + return autograd::make_variable(t); + }); + return [ts](Stack& stack) { + push(stack, ts); + return 0; + }; } else { std::stringstream ss; ss << "constant literal not supported for: " << type->str(); diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 8d20045efefe6a..22f8b40ba30542 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -345,18 +345,14 @@ std::vector emitCompilationUnit(std::ostream & out, size_t i = 0; for(auto o : subgraph.outputs()) { auto & desc = agraph.output_desc[i++]; - if(o->node()->kind() != aten::cat) { + if(o->node()->kind() != prim::FusedConcat) { emitFormal(o, desc); concat_desc.emplace_back(); flat_output_nodes.push_back(o); } else { auto cat = o->node(); - auto tensor_inputs = cat->inputs(); - // We need to drop the dim arg - tensor_inputs = tensor_inputs.slice(0, tensor_inputs.size() - 1); - size_t nInputs = tensor_inputs.size(); - concat_desc.emplace_back(desc, nInputs, cat->get(attr::dim).value()); - for(auto c : tensor_inputs) { + concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); + for(auto c : cat->inputs()) { emitFormal(c, *concat_desc.back().subtensorDesc); flat_output_nodes.push_back(c); } @@ -386,8 +382,9 @@ std::vector emitCompilationUnit(std::ostream & out, } for(auto n : subgraph.nodes()) { - if(n->kind() == aten::cat) - continue; // Concat nodes by narrowing the output Tensors before the kernel runs + // FusedConcat nodes work by narrowing the output Tensors before the kernel runs + if (n->kind() == prim::FusedConcat) + continue; env.s("node",valueName(n->output())); env.s("rhs", encodeRHS(n)); body << format("auto ${node} = ${rhs};\n",env); diff --git a/torch/csrc/jit/fusion_compiler.h b/torch/csrc/jit/fusion_compiler.h index 6c4759aefb692a..c2f35ee0aa2074 100644 --- a/torch/csrc/jit/fusion_compiler.h +++ b/torch/csrc/jit/fusion_compiler.h @@ -86,7 +86,7 @@ struct CompiledFusionFunction { TH_DISALLOW_COPY_AND_ASSIGN(CompiledFusionFunction); CompiledFusionFunction(const std::string & name, AnnotatedGraph & agraph); - virtual ~CompiledFusionFunction() {} + virtual ~CompiledFusionFunction() = default; // expects outputs to be pre-allocated void launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs); diff --git a/torch/csrc/jit/graph_executor.cpp 
b/torch/csrc/jit/graph_executor.cpp index df81c378ad137d..bd115516208f04 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -21,6 +21,7 @@ #include "torch/csrc/jit/passes/specialize_undef.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/lower_grad_of.h" +#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/ivalue.h" @@ -516,28 +517,28 @@ void runRequiredPasses(const std::shared_ptr& g) { RemoveExpands(g); } -void specializeToSpec(const std::shared_ptr& graph_, const ArgumentSpec& spec) { +void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec) { // clean up GradOf and AutogradAdd nodes // this must be first because later passes do not know what GradOfs are std::vector defined; for(size_t i = 0; i < spec.size(); ++i) { defined.push_back(spec.at(i).defined()); } - specializeUndef(*graph_, defined); + specializeUndef(*graph, defined); // required passes shared with autograd fallback - runRequiredPasses(graph_); + runRequiredPasses(graph); // Decompose addmm nodes to add + mm, so expands can be inserted and // gradients accumulated on the backward pass // // In the future, if we need more passes like this, we should convert this // into a generic canonicalization pass. - DecomposeAddmm(graph_); + DecomposeAddmm(graph); // clean up dead constants from specialization - EliminateDeadCode(graph_); + EliminateDeadCode(graph); // calculate all input shapes - PropagateInputShapes(*graph_, spec); + PropagateInputShapes(*graph, spec); } void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables) { @@ -554,7 +555,7 @@ void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariab // They also may assume that concrete sizes/strides are availiable UnrollLoops(graph); - + ConstantPropagation(graph); //TODO: create peephole optimizations that are safe to run // when we are using variables, and when we do not know sizes. PeepholeOptimize(graph); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 4e862c9e0a1e44..2693af50af1025 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -34,7 +34,7 @@ struct GraphExecutorState { struct GraphExecutorImpl; struct TORCH_API GraphExecutor { - GraphExecutor() {} + GraphExecutor() = default; GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. 
GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable); diff --git a/torch/csrc/jit/graph_node_list.h b/torch/csrc/jit/graph_node_list.h index 996a8b2c75fa0f..054b9517776863 100644 --- a/torch/csrc/jit/graph_node_list.h +++ b/torch/csrc/jit/graph_node_list.h @@ -1,3 +1,5 @@ +#pragma once + #include "torch/csrc/jit/assertions.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 5b128fd822dafd..40bc9966b45c18 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -66,7 +66,7 @@ struct Model_ { // Readers struct ReaderBase { - ReaderBase() {} + ReaderBase() = default; ReaderBase(pb_callback_t& cb) { initialize_callback(cb); } diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index d3a9bd9139a96e..7499e3152e41b3 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -18,6 +18,7 @@ #include "torch/csrc/jit/passes/onnx/fixup_onnx_loop.h" #include "torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/decompose_addmm.h" +#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/to_batch.h" #include "torch/csrc/jit/passes/specialize_undef.h" @@ -75,6 +76,9 @@ void initJITBindings(PyObject *module) { .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) .def("_jit_pass_loop_unrolling", UnrollLoops) + .def("_jit_pass_constant_propagation", [](std::shared_ptr& g) { + return ConstantPropagation(g); + }) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 52b8cb0eaccd98..c567793552d73a 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -50,6 +50,7 @@ _(prim, TensorToNum) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ _(prim, AnyDefined) \ +_(prim, FusedConcat) \ _(aten, __not__) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 65bdcf695f6de2..da6f629d629e41 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -339,7 +339,7 @@ struct ContainerTensor : public at::TensorImpl { ContainerTensor() : TensorImpl(&(at::globalContext().getType(at::Backend::Undefined,at::ScalarType::Undefined)), nullptr) {} - virtual ~ContainerTensor() {} + virtual ~ContainerTensor() = default; virtual at::IntList sizes() const override { throw std::runtime_error("sizes() on ContainerTensor"); } @@ -685,8 +685,8 @@ struct CodeImpl { // InterpreterState state that is held across stages and used to compute a Code struct InterpreterStateImpl { - InterpreterStateImpl(const Code & function_) - : function(function_.pImpl), + InterpreterStateImpl(const Code & code) + : function(code.pImpl), int_data(function->int_data.data()), bool_data(function->bool_data), registers(function->register_size) { @@ -775,15 +775,15 @@ std::ostream & operator<<(std::ostream & out, const Code & code) { Code::Code(std::shared_ptr& graph) : pImpl(new CodeImpl(graph)) {} -Code::~Code() {} +Code::~Code() = default; const std::vector& Code::executors() { return pImpl->executors(); } -InterpreterState::InterpreterState(const Code & function) - : pImpl(new InterpreterStateImpl(function)) {} 
-InterpreterState::~InterpreterState() {} +InterpreterState::InterpreterState(const Code & code) + : pImpl(new InterpreterStateImpl(code)) {} +InterpreterState::~InterpreterState() = default; void InterpreterState::runOneStage(Stack & stack) { return pImpl->runOneStage(stack); diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 7f09b22b324d11..317d30b05967a5 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -355,7 +355,7 @@ void Graph::lint() const { // - every use will occur later in the topsort struct LintScope { - LintScope() {} + LintScope() = default; LintScope(std::unique_ptr parent) : parent(std::move(parent)) {} bool contains(const Value * v) { @@ -487,13 +487,13 @@ void LintGraph(std::shared_ptr& graph) { graph->lint(); } -void Block::cloneFrom(Block * src, std::function outer_map) { +void Block::cloneFrom(Block * src, std::function value_map) { std::unordered_map local_map; auto env = [&](Value * v) { auto it = local_map.find(v); if(it != local_map.end()) return it->second; - return outer_map(v); + return value_map(v); }; auto graph = owningGraph(); @@ -619,23 +619,8 @@ Value* Node::namedInput(Symbol name) const { // so this is completely unsafe and needs to be gone as soon as possible. return v; } - const auto & the_schema = schema(); - int64_t tensor_list_pos = 0; - for (auto & arg : the_schema.arguments) { - if (*arg.type == *ListType::ofTensors()) - break; - tensor_list_pos++; - } int64_t arg_pos = findArgument(schema(), name).first; - // XXX: we don't have a single value we could give for a Tensor[], - // because we flatten lists into arguments - JIT_ASSERT(arg_pos != tensor_list_pos); - // NB: if there's no tensor list, then tensor_list_pos == arguments.size(), so this is always true - if (arg_pos < tensor_list_pos) { - return input(arg_pos); - } else { - return input(inputs().size() - (the_schema.arguments.size() - arg_pos)); - } + return input(arg_pos); } bool Node::matches(const char *signature_literal, at::ArrayRef const_inputs) { diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 9af468e6ee06e7..9a5c3342bf5df6 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -54,7 +54,7 @@ struct Value; TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g); TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); -TORCH_API std::ostream& operator<<(std::ostream & out, const Node & t); +TORCH_API std::ostream& operator<<(std::ostream & out, const Node & n); // A list of nodes, with inputs and outputs struct Block; @@ -683,7 +683,7 @@ struct Node : public Attributes { return *schema_; } - virtual ~Node() {} + virtual ~Node() = default; private: std::pair findInput(Symbol name); void findSchema() const; @@ -889,8 +889,7 @@ friend struct Block; , block_(new Block(this, nullptr)) , insert_before_(return_node()) {} - Graph() - : Graph( std::make_shared()) {} + Graph() : Graph(std::make_shared()) {} at::ArrayRef inputs() { return block_->inputs(); diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 42a5be89e55e4b..60801c49c2ce33 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -83,6 +83,7 @@ struct ConstantList; struct IValue; using Tuple = ConstantList; using IntList = ConstantList; +using TensorList = ConstantList; using DoubleList = ConstantList; // IValue is the generic tagged union used by the interpreter to hold @@ -93,7 +94,7 @@ using DoubleList = ConstantList; // retain/release calls. 
#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(TensorList) struct IValue { IValue() @@ -223,6 +224,20 @@ struct IValue { return toRetainable(); } + //TensorList + IValue(Shared v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + Shared toTensorList() && { + JIT_ASSERT(isTensorList()); + return moveToRetainable(); + } + Shared toTensorList() const & { + JIT_ASSERT(isTensorList()); + return toRetainable(); + } + + // None bool isNone() { return Tag::None == tag; } @@ -369,6 +384,13 @@ inline IValue::IValue(Shared v) inline IValue::IValue(std::vector v) : IValue(DoubleList::create(std::move(v))) {} +inline IValue::IValue(Shared v) +: tag(Tag::TensorList), retainable(true) { + as_retainable = v.detach(); +} +inline IValue::IValue(std::vector v) +: IValue(TensorList::create(std::move(v))) {} + inline std::vector IValue::copyToIntList() const { return std::vector(toIntList()->elements()); } diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index f19d18caa9289e..66af83bb9525fb 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -359,22 +359,6 @@ bool Operator::matches(const Node* node) const { return false; } attributes_seen++; - } else if(*arg.type == *ListType::ofTensors()) { - // Tensor[] is handled as varargs, consume inputs until the remaining required arguments - // XXX - there can only be a single Tensor[] in a declaration - size_t remaining_required = 0; - for(size_t j = arg_i + 1; j < schema.arguments.size(); ++j){ - // remaining arguments are only those that won't be consumed from attributes - if(attributes_size == 0 || !attributeKindOf(schema.arguments[j].type)) - remaining_required++; - } - while(inputs_size - input_i > remaining_required) { - auto input = node->inputs()[input_i++]; - if(!typeMatches(input->type(), DynamicType::get())) { - // std::cout << "vararg argument is not Dynamic\n"; - return false; - } - } } else { if(input_i == inputs_size) { // std::cout << "not enough inputs\n"; diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index 7e6a314d2cb8c3..081344d0d1d83e 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -10,7 +10,7 @@ namespace torch { namespace jit { -FunctionSchema parseSchema(const std::string& decl); +FunctionSchema parseSchema(const std::string& schema); using OperationCreator = std::function; @@ -33,7 +33,7 @@ struct TORCH_API Operator { FunctionSchema schema; - bool matches(const Node* n) const; + bool matches(const Node* node) const; // Operators have different versions depending on if some inputs are encoded // as attributes or inputs. This function returns the right Operation function, // given a node encoded for one variant. diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 0e40bc8831a6df..414dc1652a4da1 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -3,8 +3,9 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/interned_strings.h" #include "torch/csrc/jit/constants.h" -#include "torch/csrc/utils/functional.h" +#include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/assertions.h" +#include "torch/csrc/utils/functional.h" #include #include @@ -191,12 +192,11 @@ void BatchMMBlock(Block* block) { int cat_dim = s == Side::LHS ? 
1 : 0; cat_sizes[cat_dim] *= matmuls.size(); // make them really cat_sizes - auto inputs = fmap(matmuls, [=](Node *mm) { return mm->inputs()[inputs_off]; }); WithInsertPoint iguard { root.node }; - inputs.push_back(insertConstant(*graph, cat_dim)); - Node *cat = graph->insertNode(graph->create(aten::cat, inputs)); - cat->output()->setType(type->withSizes(cat_sizes)); - return cat->output(); + auto inputs = fmap(matmuls, [=](Node *mm) -> SymbolicVariable { return mm->inputs()[inputs_off]; }); + auto cat_output = SymbolicVariable::cat(inputs, cat_dim).value(); + cat_output->setType(type->withSizes(cat_sizes)); + return cat_output; }; auto lhs_batch = batch_inputs(Side::LHS, root.lhs_sizes); diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp new file mode 100644 index 00000000000000..39492f9e76c50c --- /dev/null +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -0,0 +1,95 @@ +#include "torch/csrc/jit/passes/constant_propagation.h" +#include "torch/csrc/autograd/variable.h" +#include "torch/csrc/jit/constants.h" +#include "torch/csrc/jit/interpreter.h" +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ivalue.h" +#include "torch/csrc/jit/operator.h" +#include "torch/csrc/jit/passes/dead_code_elimination.h" +#include "torch/csrc/utils/functional.h" + +namespace torch { namespace jit { + +namespace { + +std::unordered_set skip_list = { + //FIXME If & Loop require special casing because they cannot be run as a + //single node. + prim::If, + prim::Loop, + //FIXME Same problem as in DCE - cpp & python PythonOp and CppOp should be + //FIXME treated as having side effects but ONNX depends on them being removed + prim::Print, + //all the rand functions from native_functions.yaml + aten::permute, + aten::rand, + aten::rand_out, + aten::rand_like, + aten::randint, + aten::randint_out, + aten::randint_like, + aten::randn, + aten::randn_out, + aten::randn_like, + aten::randperm, + aten::randperm_out, + }; + +std::vector runNode(Node* n) { + auto op = getOperation(n); + Stack stack; + for (auto input : n->inputs()) { + stack.push_back(*(toIValue(input))); + } + op(stack); + auto var_outputs = fmap(stack, [&](IValue v) { + if (v.isTensor()) { + return IValue(autograd::as_variable_ref(v.toTensor()).data()); + } else { + return v; + } + }); + return var_outputs; +} + +void propagateNode(Node* n) { + auto outputs = runNode(n); + auto graph = n->owningGraph(); + WithInsertPoint guard(n); + for (size_t i = 0; i < outputs.size(); ++i) { + auto new_output = insertConstant(*graph, outputs[i]); + n->outputs()[i]->replaceAllUsesWith(new_output); + // let dce elimination remove n + } +} + +} // anonymous namespace + +void ConstantPropagation(Node* n, bool recurse) { + bool constant_inputs = (n->inputs().size() > 0) && + std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + bool supported_node = skip_list.count(n->kind()) == 0; + if (constant_inputs && supported_node) { + propagateNode(n); + } + if (recurse) { + for (Block * block : n->blocks()) + ConstantPropagation(block, recurse); + } +} + +void ConstantPropagation(Block* block, bool recurse) { + ConstantPropagation(block->param_node(), recurse); + for (auto n: block->nodes()) { + ConstantPropagation(n, recurse); + } +} + +void ConstantPropagation(std::shared_ptr& graph) { + ConstantPropagation(graph->block(), true); + EliminateDeadCode(graph); +} + +}} diff --git a/torch/csrc/jit/passes/constant_propagation.h 
b/torch/csrc/jit/passes/constant_propagation.h new file mode 100644 index 00000000000000..12df329c81ccfc --- /dev/null +++ b/torch/csrc/jit/passes/constant_propagation.h @@ -0,0 +1,11 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +TORCH_API void ConstantPropagation(std::shared_ptr& graph); +TORCH_API void ConstantPropagation(Block* block, bool recurse); +TORCH_API void ConstantPropagation(Node* n, bool recurse); + +}} diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index cb3757cffb0e34..cc8dcb8926dee0 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -177,16 +177,25 @@ struct GraphFuser { } } - bool allCatInputsHaveSameSize(Node * node) { - JIT_ASSERT(node->kind() == aten::cat); - std::vector inputs = node->inputs(); - if (!node->hasAttributes()) { - inputs.pop_back(); // Get rid of the dim argument - } + bool isFusableCatNode(Node * node) { + if (node->kind() != aten::cat) + return false; + if (!node->is_constant(attr::dim)) + return false; - auto expected = inputs.at(0)->type()->cast(); + auto tensors_node = node->namedInput(attr::tensors)->node(); + if (tensors_node->kind() != prim::ListConstruct) return false; + // NB: Note that technically other uses of the list aren't a big problem for us. + // It would be enough to place the prim::FusedConcat before the prim::ListConstruct, and + // allUsersAreThisConsumerOrOccurAfterIt would still be satisfied. However, I don't expect this + // to be necessary any time soon, and so we're simply assuming that we don't have to deal with that. + if (tensors_node->output()->uses().size() > 1) return false; + auto tensors = tensors_node->inputs(); + + // Our fusion code assumes that all inputs have the same shapes, so we need to check this too. + auto expected = tensors.at(0)->type()->cast(); if (!expected) return false; - return std::all_of(inputs.begin(), inputs.end(), [expected](Value *v) { + return std::all_of(tensors.begin(), tensors.end(), [&expected](Value *v) { auto actual = v->type()->cast(); return actual && actual->sizes() == expected->sizes(); }); @@ -197,15 +206,7 @@ struct GraphFuser { // because it is not a simple map, can be put in a fusion group // as long as no items in the group read the output of concat bool isFusableAsExitNode(Node * node) { - if(isFusable(node)) - return true; - // this concat fusion only works when all the inputs are the same size - // and we can statically infer the dimension along which we should concat - // otherwise they cannot partipate in the same map - if(node->kind() == aten::cat && node->is_constant(attr::dim) && allCatInputsHaveSameSize(node)) - return true; - - return false; + return isFusable(node) || isFusableCatNode(node); } // necessary condition for fusion. If all of the uses of producer are consumer @@ -241,8 +242,9 @@ struct GraphFuser { // we can move the consumer up into the producer. // but this requires better handling of merging fusion groups so it is not done now at::optional consumer_device = getDevice(consumer); + Node *real_consumer = consumer->kind() == aten::cat ? 
consumer->namedInput(attr::tensors)->node() : consumer; return isFusable(producer->node()) && - allUsersAreThisConsumerOrOccurAfterIt(consumer, producer) && + allUsersAreThisConsumerOrOccurAfterIt(real_consumer, producer) && consumer_device && consumer_device == getDevice(producer->node()) && (*consumer_device != kCPUDevice || sharedFusionCompiler().canCompileOnCPU()); } @@ -389,7 +391,24 @@ struct GraphFuser { Node * fuse(Node * consumer, Value * producer) { auto group = consumer; - if(group->kind() != prim::FusionGroup) { + if (consumer->kind() == aten::cat) { + Graph * graph = consumer->owningGraph(); + Node * list_construct = consumer->namedInput(attr::tensors)->node(); + int64_t dim = consumer->get(attr::dim).value(); + + Node * fused_cat = graph->create(prim::FusedConcat, list_construct->inputs())->i_(attr::dim, dim); + fused_cat->insertBefore(list_construct); + fused_cat->output()->copyMetadata(consumer->output()); + consumer->output()->replaceAllUsesWith(fused_cat->output()); + topological_index[fused_cat] = topological_index[list_construct]; + + // NB: this deletes the fused_cat node from the original graph + group = createSingletonFusionGroup(fused_cat); + consumer->destroy(); + if (list_construct->output()->uses().empty()) { + list_construct->destroy(); + } + } else if (consumer->kind() != prim::FusionGroup) { group = createSingletonFusionGroup(consumer); } if (producer->node()->kind() == prim::FusionGroup) { @@ -450,7 +469,6 @@ struct GraphFuser { } } - // TODO: Remove this restriction if we ever need to distribute across // multiple return operators Node * producer_for_chunk_node = producer_for_chunk->node(); JIT_ASSERT(producer_for_chunk_node->outputs().size() == 1); @@ -521,11 +539,14 @@ struct GraphFuser { std::pair scanNode(Node * consumer) { auto stage_guard = block->owningGraph()->setStageTemporary(consumer->stage()); if(isFusableAsExitNode(consumer)) { + value_list inputs; + auto consumer_inputs = consumer->kind() == aten::cat ? + consumer->namedInput(attr::tensors)->node()->inputs() : + consumer->inputs(); // handle inputs in reverse topological order as well... // otherwise in f(a,a+b) it will appear a is used twice if we consider // the f-a fusion before the f-(a+b) fusion first. 
- value_list inputs; - for(auto i : consumer->inputs()) { + for(auto i : consumer_inputs) { if (i->node()->owningBlock() == block) { inputs.push_back(i); JIT_ASSERT(topological_index.count(i->node()) > 0); diff --git a/torch/csrc/jit/passes/lower_grad_of.h b/torch/csrc/jit/passes/lower_grad_of.h index a0a881e3002ed9..0ec3589e3acd31 100644 --- a/torch/csrc/jit/passes/lower_grad_of.h +++ b/torch/csrc/jit/passes/lower_grad_of.h @@ -10,6 +10,6 @@ namespace torch { namespace jit { // outputs = // else: // outputs = undefineds -TORCH_API void LowerGradOf(Graph& graph); +TORCH_API void LowerGradOf(Graph& g); }} diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 63fb7030aa3ad1..ee9b76f417bd17 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -263,6 +263,39 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { default: break; // fall-through } + if (node->matches("aten::cat(Tensor[] tensors, int dim) -> Tensor", /*with_const=*/attr::dim)) { + auto list_node = node->namedInput(attr::tensors)->node(); + JIT_ASSERT(list_node->kind() == prim::ListConstruct); + auto tensors = list_node->inputs(); + if (tensors.size() > 0) { + auto input_types = fmap(tensors, [](Value *v) { return v->type()->cast(); }); + if (std::all_of(input_types.begin(), input_types.end(), + [](const TensorTypePtr& tp) { return tp != nullptr; })) { + std::vector sizes = input_types[0]->sizes(); + const int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); + const int64_t ndim = sizes.size(); + + if (dim < 0 || dim >= ndim) + goto cat_fail; + + sizes[dim] = 0; + for (auto & tp : input_types) { + auto & tp_sizes = tp->sizes(); + if (sizes.size() != tp_sizes.size()) + goto cat_fail; + for (int64_t i = 0; i < ndim; ++i) { + if (sizes[i] != tp_sizes[i] && i != dim) { + goto cat_fail; + } + } + sizes[dim] += tp_sizes[dim]; + } + node->output()->setType(input_types[0]->withSizes(sizes)); + return; + } + } + } +cat_fail: bool can_propagate_by_running = canPropagateShapeByRunningIt(node); auto maybe_tensor_types = gatherTensorTypes(node); diff --git a/torch/csrc/jit/python_arg_flatten.h b/torch/csrc/jit/python_arg_flatten.h index b5139032fde169..9894b802b2d29c 100644 --- a/torch/csrc/jit/python_arg_flatten.h +++ b/torch/csrc/jit/python_arg_flatten.h @@ -104,7 +104,7 @@ struct ParsedArgs { ParsedArgs flatten(py::handle obj); -PyObject* unflatten(at::ArrayRef outputs, +PyObject* unflatten(at::ArrayRef vars, const IODescriptor& structure); }}} // namespace torch::jit::python diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 81211085569953..b72fdb6b8860b1 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -451,10 +451,22 @@ void initPythonIRBindings(PyObject * module_) { .def("scalarType",[](Type& t) { return at::toString(t.expect()->scalarType()); }) - ; + .def("__eq__", [](std::shared_ptr& self, std::shared_ptr& other) { + return *self == *other; + }) + .def("isSubtypeOf", [](std::shared_ptr& self, std::shared_ptr other) { + return self->isSubtypeOf(other); + }); + py::class_>(m, "NumberType") + .def_static("get", &NumberType::get); + py::class_>(m, "IntType") + .def_static("get", &IntType::get); + py::class_>(m, "FloatType") + .def_static("get", &FloatType::get); py::class_>(m, "DynamicType") - .def(py::init([](){ return DynamicType::create(); })); + .def_static("get", &DynamicType::get); + py::class_>(m, "TupleType") .def(py::init([](std::vector a){ return 
TupleType::create(a); })) .def("elements", [](TupleType &self){ @@ -465,7 +477,9 @@ void initPythonIRBindings(PyObject * module_) { return types; }); py::class_>(m, "ListType") - .def_static("ofInts", &ListType::ofInts); + .def_static("ofInts", &ListType::ofInts) + .def_static("ofTensors", &ListType::ofTensors) + .def("getElementType", &ListType::getElementType); py::class_(m,"Use") .def_readonly("user",&Use::user) diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index 7439b2b5e334cc..0496af67412654 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -103,10 +103,10 @@ void pythonRecordSourceLocation(Node* n) { n->setSourceLocation(sl); } -void initPythonTracerBindings(PyObject* module_) { +void initPythonTracerBindings(PyObject* module) { setRecordSourceLocation(pythonRecordSourceLocation); - auto m = py::handle(module_).cast(); + auto m = py::handle(module).cast(); py::class_>(m, "TracingState", py::dynamic_attr()) // NB: no constructor; you have to get it from C++ code .def("__repr__", [](const TracingState& s) { diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 8fe747e59900f0..90d49e230792e9 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -231,6 +231,18 @@ RegisterOperators reg({ push(stack, std::move(vals)); return 0; }; + } else if (lt->getElementType()->isSubtypeOf(DynamicType::get())) { + return [=](Stack& stack) { + const size_t stack_size = stack.size(); + std::vector vals; + vals.reserve(num_inputs); + for (size_t i = stack_size - num_inputs; i < stack_size; ++i) { + vals.push_back(std::move(stack[i]).toTensor()); + } + drop(stack, num_inputs); + push(stack, std::move(vals)); + return 0; + }; } else { std::stringstream ss; ss << "unsupported list type: " << *lt->getElementType(); diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 0016f69b5ce07b..961dd3980933b3 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -351,27 +351,6 @@ Value* createNumber(Graph& g, const SourceRange& loc, const at::Tensor& val) { return output; } -Value* createStack(Graph& g, const SourceRange& loc, at::ArrayRef inputs) { - // bake in constant propagation for the all-constant case because it is - // common to see constant lists like [1, 2] passed to attributes - bool all_constant = std::all_of(inputs.begin(), inputs.end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - if(all_constant) { - auto values = fmap(inputs, [&](Value* v) { - return v->node()->t(attr::value); - }); - return insertConstant(g, at::stack(values), loc); - } - return g.insertNode(g.create(aten::stack, inputs) - ->i_(attr::dim, 0) - ->setSourceLocation(std::make_shared(loc)))->output(); -} - -static bool isTensorSubtype(Value* v) { - return v->type()->isSubtypeOf(DynamicType::get()); -} - at::optional> getIntListAttribute(at::optional N, Value* input) { auto list = constant_as>(input); if(list) @@ -455,51 +434,46 @@ at::optional> tryMatchSchema( } // check input types - std::vector flat_inputs; + std::vector matched_inputs; for(size_t i = 0; i < schema.arguments.size(); ++i) { - NamedValue v = *positional_inputs[i]; + Value* value = positional_inputs[i]->value; const auto& arg = schema.arguments[i]; // some functions that take lists of integers for fixed size arrays // also allow single ints to be passed in their place. 
// the single int is then repeated to the length of the list - if (isIntUsedAsIntList(v.value, arg)) { - std::vector repeated(*arg.N, v.value); - v.value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); + if (isIntUsedAsIntList(value, arg)) { + std::vector repeated(*arg.N, value); + value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); } - // Allow tuples that only contain integers to turn into lists of integers - if(*ListType::ofInts() == *arg.type && - v.value->type()->kind() == TypeKind::TupleType && - v.value->type()->isSubtypeOf(ListType::ofInts())) { - auto unpacked = createTupleUnpack(v.value); - v.value = graph.insertNode(graph.createList(IntType::get(), unpacked))->output(); + // Allow homogeneous tuples to be casted implicitly to lists of appropriate types + if (arg.type->kind() == TypeKind::ListType && + value->type()->kind() == TypeKind::TupleType && + value->type()->isSubtypeOf(arg.type)) { + auto unpacked = createTupleUnpack(value); + auto elem_type = arg.type->expect()->getElementType(); + value = graph.insertNode(graph.createList(elem_type, unpacked))->output(); } - if (v.value->node()->kind() == prim::None){ + if (value->node()->kind() == prim::None){ if (arg.type->isSubtypeOf(NumberType::get())) - v.value = insertConstant(graph, at::Scalar(NAN), loc); + value = insertConstant(graph, at::Scalar(NAN), loc); else - v.value = graph.insertNode(graph.createUndefined())->output(); + value = graph.insertNode(graph.createUndefined())->output(); } - if(!v.value->type()->isSubtypeOf(arg.type)) { + if(!value->type()->isSubtypeOf(arg.type)) { err() << "expected a value of type " << arg.type->str() << " for argument '" << arg.name << "' but found " - << v.value->type()->str() << "\n" - << v.loc; + << value->type()->str() << "\n" + << positional_inputs[i]->loc; return at::nullopt; } - // we only support tensor lists for builtins, where they must be flattened - if(arg.type->isSubtypeOf(ListType::ofTensors())) { - auto outputs = createTupleUnpack(v.value); - flat_inputs.insert(flat_inputs.end(), outputs.begin(), outputs.end()); - } else { - flat_inputs.push_back(v.value); - } + matched_inputs.push_back(value); } - return flat_inputs; + return matched_inputs; } @@ -513,20 +487,20 @@ static std::shared_ptr tryEmitBuiltin( at::ArrayRef attributes) { auto graph = method.graph(); - auto flat_inputs = tryMatchSchema(op->schema, loc, *graph, inputs, attributes, failure_messages); - if(!flat_inputs) + auto matched_inputs = tryMatchSchema(op->schema, loc, *graph, inputs, attributes, failure_messages); + if(!matched_inputs) return nullptr; // we successfully matched this schema, construct the node NodeKind kind(Symbol::aten(name)); - auto n = graph->insertNode(graph->create(kind, *flat_inputs, 0)) + auto n = graph->insertNode(graph->create(kind, *matched_inputs, 0)) ->setSourceLocation(std::make_shared(loc)); // special case for chunk when the chunks= is known // DO NOT ADD MORE SPECIAL CASES HERE, REFACTOR INTO A FUNCTION IF // NEEDED if(n->kind() == aten::chunk) { - auto value = constant_as((*flat_inputs)[1]); + auto value = constant_as((*matched_inputs)[1]); if(!value) { throw ErrorReport(loc) << "argument 'chunks' must be a constant"; } @@ -588,7 +562,7 @@ std::shared_ptr emitBuiltinCall( } static Value* ensureTensor(const SourceRange& range, Value* v) { - if(!isTensorSubtype(v)) { + if(!v->type()->isSubtypeOf(DynamicType::get())) { throw ErrorReport(range) << "expected a tensor value but found a " << v->type()->str(); } diff --git 
a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index 0b87cf56be6ad3..3c4dcb07a248ee 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -68,7 +68,7 @@ struct SugaredValue : public std::enable_shared_from_this { SourceRange loc, Method & m, // note: names for args will be 'argument 0', 'argument 1', etc.. - at::ArrayRef inputs, + at::ArrayRef inputs_, at::ArrayRef attributes, size_t n_binders) { // n_binders is always set to the number of variables an expression is @@ -89,7 +89,7 @@ struct SugaredValue : public std::enable_shared_from_this { throw ErrorReport(loc) << "cannot call a " << kind(); } - virtual ~SugaredValue() {} + virtual ~SugaredValue() = default; }; // most things in the environment are just simple value types diff --git a/torch/csrc/jit/script/tree.h b/torch/csrc/jit/script/tree.h index e3d69d2790682d..0b9bc7009e0162 100644 --- a/torch/csrc/jit/script/tree.h +++ b/torch/csrc/jit/script/tree.h @@ -89,7 +89,7 @@ struct Tree : std::enable_shared_from_this { throw std::runtime_error(ss.str()); } } - virtual ~Tree() {} + virtual ~Tree() = default; private: int kind_; diff --git a/torch/csrc/jit/stack.h b/torch/csrc/jit/stack.h index 2c74ae7e0a4c77..7a23aa55df538f 100644 --- a/torch/csrc/jit/stack.h +++ b/torch/csrc/jit/stack.h @@ -77,8 +77,8 @@ inline void pack(Stack & stack, T&& v) { } template<> -inline void pack(Stack & stack, std::vector&& ts) { - for(auto& t : ts) { +inline void pack(Stack & stack, std::vector&& v) { + for(auto& t : v) { stack.push_back(IValue(std::move(t))); } } diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index e4d2f98ba0ea0f..ef6d41005789f8 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -119,18 +119,20 @@ struct SymbolicVariable { return create(t("narrow"), { *this, insertConstant(dim), insertConstant(start), insertConstant(length) }, 1)[0]; } static SymbolicVariable cat(ArrayRef inputs, Value* dim) { - std::vector all_inputs = inputs; - all_inputs.push_back(dim); - return create(aten::cat, all_inputs)[0]; + Graph *g = dim->owningGraph(); + auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); + Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); + return create(aten::cat, {input_list, dim})[0]; } static SymbolicVariable cat(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); return SymbolicVariable::cat(inputs, inputs[0].insertConstant(dim)); } static SymbolicVariable stack(ArrayRef inputs, Value* dim) { - std::vector all_inputs = inputs; - all_inputs.push_back(dim); - return create(aten::stack, all_inputs)[0]; + Graph *g = dim->owningGraph(); + auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); + Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); + return create(aten::stack, {input_list, dim})[0]; } static SymbolicVariable stack(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index 8c9763f88353e5..d5d204f9465bd8 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -220,6 +220,9 @@ static void fusionTests() { testOne(1,2,0,2); + auto createFusedConcat = [](Graph & graph, at::ArrayRef inputs, int64_t dim) -> Value* { + return graph.insertNode(graph.create(prim::FusedConcat, inputs)->i_(attr::dim, dim))->output(); + }; auto testConcat = [&](int dim) 
{ Graph graph; @@ -227,7 +230,7 @@ static void fusionTests() { Var i1 = Var::asNewInput(graph); auto o0 = i0 * i1; o0.addAsOutput(); - Var::cat({i0, o0}, dim).addAsOutput(); + Var(createFusedConcat(graph, {i0, o0}, dim)).addAsOutput(); auto a = at::rand({3,4,5}, at::kCUDA); auto b = at::rand({4,3,5}, at::kCUDA).transpose(0,1); @@ -776,6 +779,9 @@ void argumentSpecTest() { REQUIRE(!(c == a)); REQUIRE(spec.count(c) == 0); + Stack stack = { var(CF, {1,2}, true), 3, var(CF, {1,2}, true) }; + ArgumentSpec with_const(true, stack); + REQUIRE(with_const.at(2).sizes().size() == 2); } void shapeAnalysisTest() { diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index aec6eb4ddc9447..a0e2f65e617754 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -38,9 +38,9 @@ void addInputs(Node *n, const char * name, const std::string& value) { b void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { badArgType(); } void addInputs(Node *n, const char * name, at::TensorList value) { - for (auto & t : value) { - n->addInput(getValueTrace(t)); - } + Graph *g = n->owningGraph(); + Node *list_node = g->appendNode(g->createList(DynamicType::get(), fmap(value, getValueTrace))); + n->addInput(list_node->output()); } void addInputs(Node *n, const char * name, at::IntList value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index ebcc91a908c213..ddb4dfad0154ad 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -46,31 +46,31 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { return out; } -TypePtr DynamicType::get() { +DynamicTypePtr DynamicType::get() { static auto value = DynamicType::create(); return value; } -TypePtr NumberType::get() { +NumberTypePtr NumberType::get() { static auto value = NumberType::create(); return value; } -TypePtr IntType::get() { +IntTypePtr IntType::get() { static auto value = IntType::create(); return value; } -TypePtr FloatType::get() { +FloatTypePtr FloatType::get() { static auto value = FloatType::create(); return value; } -TypePtr NoneType::get() { +NoneTypePtr NoneType::get() { static auto value = NoneType::create(); return value; } -TypePtr ListType::ofTensors() { +ListTypePtr ListType::ofTensors() { static auto value = ListType::create(DynamicType::get()); return value; } -TypePtr ListType::ofInts() { +ListTypePtr ListType::ofInts() { static auto value = ListType::create(IntType::get()); return value; } diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index 7b7d708a549b32..5d01cf0a1552c1 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -80,7 +80,7 @@ struct TORCH_API Type : std::enable_shared_from_this { JIT_ASSERT(T::Kind == kind()); return std::static_pointer_cast(shared_from_this()); } - virtual ~Type() {} + virtual ~Type() = default; }; inline bool operator!=(const Type & lhs, const Type & rhs) { @@ -104,7 +104,7 @@ struct TORCH_API DynamicType : public Type { } static const TypeKind Kind = TypeKind::DynamicType; // global singleton - static TypePtr get(); + static DynamicTypePtr get(); private: DynamicType() : Type(TypeKind::DynamicType) {} @@ -237,8 +237,8 @@ struct TORCH_API ListType : public Type { return elem; } // common cast List[Tensor] - static TypePtr ofTensors(); - static TypePtr ofInts(); + static ListTypePtr ofTensors(); + static ListTypePtr ofInts(); private: ListType(TypePtr elem) : Type(TypeKind::ListType), elem(elem) {} @@ -326,7 +326,7 @@ struct TORCH_API NumberType : public Type { } static const TypeKind Kind = 
TypeKind::NumberType; // global singleton - static TypePtr get(); + static NumberTypePtr get(); private: NumberType() : Type(TypeKind::NumberType) {} @@ -351,7 +351,7 @@ struct TORCH_API FloatType : public Type { } static const TypeKind Kind = TypeKind::FloatType; // global singleton - static TypePtr get(); + static FloatTypePtr get(); private: FloatType() : Type(TypeKind::FloatType) {} @@ -376,7 +376,7 @@ struct TORCH_API IntType : public Type { } static const TypeKind Kind = TypeKind::IntType; // global singleton - static TypePtr get(); + static IntTypePtr get(); private: IntType() : Type(TypeKind::IntType) {} @@ -401,7 +401,7 @@ struct NoneType : public Type { } static const TypeKind Kind = TypeKind::NoneType; // global singleton - static TypePtr get(); + static NoneTypePtr get(); private: NoneType() : Type(TypeKind::NoneType) {} diff --git a/torch/csrc/jit/variable_tensor_list.h b/torch/csrc/jit/variable_tensor_list.h index eeae2a66b17e5f..0916fe6ac051d2 100644 --- a/torch/csrc/jit/variable_tensor_list.h +++ b/torch/csrc/jit/variable_tensor_list.h @@ -6,10 +6,10 @@ namespace torch { namespace jit { // a wrapper to mark places where we expect all the at::Tensors to be // variables struct variable_tensor_list : public std::vector { - variable_tensor_list() {} + variable_tensor_list() = default; template variable_tensor_list(InputIt first, InputIt last) - : std::vector(first, last) {} + : std::vector(first, last) {} explicit variable_tensor_list(std::vector && tensor) : std::vector(std::move(tensor)) {} }; diff --git a/torch/csrc/utils/hash.h b/torch/csrc/utils/hash.h index 05a5a27b51223a..954a7b5b7d0814 100644 --- a/torch/csrc/utils/hash.h +++ b/torch/csrc/utils/hash.h @@ -32,7 +32,7 @@ namespace torch { // DEALINGS IN THE SOFTWARE. inline size_t hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); + return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); } //////////////////////////////////////////////////////////////////////////////// diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp index f8d5fd1ba1cd63..0160bdd2d8e506 100644 --- a/torch/csrc/utils/invalid_arguments.cpp +++ b/torch/csrc/utils/invalid_arguments.cpp @@ -16,7 +16,7 @@ std::string py_typename(PyObject *object) { struct Type { virtual bool is_matching(PyObject *object) = 0; - virtual ~Type() {}; + virtual ~Type() = default; }; struct SimpleType: public Type { diff --git a/torch/csrc/utils/invalid_arguments.h b/torch/csrc/utils/invalid_arguments.h index 138c3331113b7c..daaccfd877f377 100644 --- a/torch/csrc/utils/invalid_arguments.h +++ b/torch/csrc/utils/invalid_arguments.h @@ -7,7 +7,9 @@ namespace torch { std::string format_invalid_args( - PyObject *args, PyObject *kwargs, const std::string& name, + PyObject* given_args, + PyObject* given_kwargs, + const std::string& function_name, const std::vector& options); } // namespace torch diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index b00bd27c087495..0f2f51904c2554 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -90,8 +90,8 @@ struct PythonArgParser { private: [[noreturn]] - void print_error(PyObject* args, PyObject* kwargs, PyObject* dst[]); - PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* dst[]); + void print_error(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); + PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); 
  std::vector<FunctionSignature> signatures_;
  std::string function_name;
diff --git a/torch/csrc/utils/tensor_apply.h b/torch/csrc/utils/tensor_apply.h
index 47fbaa672c4262..5dfdef98c81db4 100644
--- a/torch/csrc/utils/tensor_apply.h
+++ b/torch/csrc/utils/tensor_apply.h
@@ -6,8 +6,8 @@ namespace torch { namespace utils {

at::Tensor & apply_(at::Tensor & self, PyObject* fn);
-at::Tensor & map_(at::Tensor & self, const at::Tensor & other, PyObject* fn);
-at::Tensor & map2_(at::Tensor & self, const at::Tensor & other1,
-                   const at::Tensor & other2, PyObject* fn);
+at::Tensor & map_(at::Tensor & self, const at::Tensor & other_, PyObject* fn);
+at::Tensor & map2_(at::Tensor & self, const at::Tensor & x_,
+                   const at::Tensor & y_, PyObject* fn);

}} // namespace torch::utils
diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp
index 3a8b4a7bbc1592..d03fd55f2accfc 100644
--- a/torch/csrc/utils/tensor_new.cpp
+++ b/torch/csrc/utils/tensor_new.cpp
@@ -139,8 +139,10 @@ ScalarType infer_scalar_type(PyObject *obj) {
  }
#ifdef USE_NUMPY
  if (PyArray_Check(obj)) {
-    auto array = (PyArrayObject*)obj;
-    return numpy_dtype_to_aten(PyArray_TYPE(array));
+    return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)obj));
+  }
+  if (PyArray_CheckScalar(obj)) {
+    return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)(PyArray_FromScalar(obj, NULL))));
  }
#endif
  if (PySequence_Check(obj)) {
diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py
index f8b26b121fd3e8..a2086ae95b899c 100644
--- a/torch/distributed/__init__.py
+++ b/torch/distributed/__init__.py
@@ -61,7 +61,8 @@ def init_process_group(backend, init_method='env://', **kwargs):
        group_name (str, optional): Group name. See description of init methods.

    To enable ``backend == mpi``, PyTorch needs to be built from source on a system that
-    supports MPI.
+    supports MPI. If you want to use Openmpi with CUDA-aware support, please use Openmpi
+    major version 2 and above.
    """
    world_size = kwargs.pop('world_size', -1)
diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py
index 77e6cf777f2784..c06a927add64b1 100644
--- a/torch/jit/annotations.py
+++ b/torch/jit/annotations.py
@@ -204,9 +204,9 @@ def as_ann(ann):

def ann_to_type(ann):
    if ann is None:
-        return DynamicType()
+        return DynamicType.get()
    elif ann is torch.Tensor:
-        return DynamicType()
+        return DynamicType.get()
    elif is_tuple(ann):
        return TupleType([ann_to_type(a) for a in ann.__args__])
    raise ValueError("The only supported annotations kinds are Tensor and Tuple[...]")
diff --git a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp
index cc176931d8c0c2..b23157581bdfc0 100644
--- a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp
+++ b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp
@@ -100,6 +100,14 @@ void DataChannelMPI::destroy() {}

bool DataChannelMPI::init() {
+#ifdef OMPI_MAJOR_VERSION
+  // OMPI_* is specific to Openmpi implementation.
+  // Openmpi v1.10 segfaults in MPI_Bcast with CUDA buffer.
+  if (int(OMPI_MAJOR_VERSION) < 2) {
+    throw std::runtime_error("Please use Openmpi major version 2 and above for distributed.");
+  }
+#endif /* OMPI_MAJOR_VERSION */
+
  int provided;
  MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
  if (provided != MPI_THREAD_MULTIPLE) {
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 17a7c09b012da6..454230ad9e6171 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -1350,6 +1350,41 @@ def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1):

# loss

+def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0,
+             reduction='elementwise_mean'):
+    r"""The Connectionist Temporal Classification loss.
+
+    See :class:`~torch.nn.CTCLoss` for details.
+
+    Args:
+        log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`,
+            `T = input length`, and `N = batch size`.
+            The logarithmized probabilities of the outputs
+            (e.g. obtained with :func:`torch.nn.functional.log_softmax`).
+        targets: :math:`(N, S)` or `(sum(target_lengths))`.
+            Targets (cannot be blank). In the second form, the targets are assumed to be concatenated.
+        input_lengths: :math:`(N)`.
+            Lengths of the inputs (must each be :math:`\leq T`)
+        target_lengths: :math:`(N)`.
+            Lengths of the targets
+        blank (int, optional):
+            Blank label. Default :math:`0`.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied,
+            'elementwise_mean': the output losses will be divided by the target lengths and
+            then the mean over the batch is taken. Default: 'elementwise_mean'
+
+    Example::
+
+        >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_()
+        >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long)
+        >>> input_lengths = torch.full((16,), 50, dtype=torch.long)
+        >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long)
+        >>> loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
+        >>> loss.backward()
+    """
+    return torch.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, _Reduction.get_enum(reduction))
+

def nll_loss(input, target, weight=None, size_average=None, ignore_index=-100,
             reduce=None, reduction='elementwise_mean'):
diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py
index 4d98f482768a63..7f67ca9fce804d 100644
--- a/torch/nn/modules/__init__.py
+++ b/torch/nn/modules/__init__.py
@@ -6,7 +6,7 @@
    Softmax, Softmax2d, LogSoftmax, ELU, SELU, Hardshrink, LeakyReLU, LogSigmoid, \
    Softplus, Softshrink, PReLU, Softsign, Softmin, Tanhshrink, RReLU, GLU
from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \
-    CosineEmbeddingLoss, HingeEmbeddingLoss, MarginRankingLoss, \
+    CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \
    MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \
    SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, PoissonNLLLoss
from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict
@@ -34,7 +34,7 @@
    'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'GLU', 'Hardshrink',
    'LeakyReLU', 'LogSigmoid', 'Softplus', 'Softshrink', 'PReLU', 'Softsign', 'Softmin',
    'Tanhshrink', 'RReLU', 'L1Loss', 'NLLLoss', 'KLDivLoss', 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss',
-    'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss',
+    'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'CTCLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss',
    'MultiLabelMarginLoss', 'MultiLabelSoftMarginLoss', 'MultiMarginLoss', 'SmoothL1Loss',
    'SoftMarginLoss', 'CrossEntropyLoss', 'Container', 'Sequential', 'ModuleList', 'ModuleDict',
    'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d',
diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 489e8998843f98..ec7d60d8125152 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1123,6 +1123,61 @@ def forward(self, anchor, positive, negative):
        return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p,
                                     eps=self.eps, swap=self.swap, reduction=self.reduction)
+
+class CTCLoss(_Loss):
+    r"""The Connectionist Temporal Classification loss.
+
+    Args:
+        blank (int, optional): blank label. Default :math:`0`.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied,
+            'elementwise_mean': the output losses will be divided by the target lengths and
+            then the mean over the batch is taken. Default: 'elementwise_mean'
+
+    Inputs:
+        log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`,
+            `T = input length`, and `N = batch size`.
+            The logarithmized probabilities of the outputs
+            (e.g. obtained with :func:`torch.nn.functional.log_softmax`).
+        targets: :math:`(N, S)` or `(sum(target_lengths))`.
+            Targets (cannot be blank). In the second form, the targets are assumed to be concatenated.
+        input_lengths: :math:`(N)`.
+            Lengths of the inputs (must each be :math:`\leq T`)
+        target_lengths: :math:`(N)`.
+            Lengths of the targets
+
+
+    Example::
+
+        >>> ctc_loss = nn.CTCLoss()
+        >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_()
+        >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long)
+        >>> input_lengths = torch.full((16,), 50, dtype=torch.long)
+        >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long)
+        >>> loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
+        >>> loss.backward()
+
+    Reference:
+        A. Graves et al.: Connectionist Temporal Classification:
+        Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
+        https://www.cs.toronto.edu/~graves/icml_2006.pdf
+
+    .. Note::
+        In order to use CuDNN, the following must be satisfied: :attr:`targets` must be
+        in concatenated format, all :attr:`input_lengths` must be `T`, :math:`blank=0`,
+        :attr:`target_lengths` must be :math:`\leq 256`, and the integer arguments must be of
+        type :class:`torch.IntTensor`.
+
+        The regular implementation uses the (more common in PyTorch) `torch.long` dtype.
+ """ + + def __init__(self, blank=0, reduction='elementwise_mean'): + super(CTCLoss, self).__init__(reduction=reduction) + self.blank = blank + + def forward(self, log_probs, targets, input_lengths, target_lengths): + return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction) + # TODO: L1HingeEmbeddingCriterion # TODO: MSECriterion weight # TODO: ClassSimplexCriterion diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 3ca44f35c4eff3..d19dadd0ceb935 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -70,6 +70,12 @@ def _get_const(value, desc, arg_name): return _parse_arg(value, desc) +def _unpack_list(list_value): + list_node = list_value.node() + assert list_node.kind() == "prim::ListConstruct" + return list_node.inputs() + + def parse_args(*arg_descriptors): def decorator(fn): def wrapper(g, *args): @@ -215,13 +221,18 @@ def reciprocal(g, self): return g.op("Div", _if_scalar_type_as(g, torch.ones(1), self), self) -# This syntax is Python 2 portable -def cat(g, *args): - dim = _get_const(args[-1], 'i', 'dim') - tensors = args[:-1] +@parse_args('v', 'i') +def cat(g, tensor_list, dim): + tensors = _unpack_list(tensor_list) return g.op("Concat", *tensors, axis_i=dim) +@parse_args('v', 'i') +def stack(g, tensor_list, dim): + unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in _unpack_list(tensor_list)] + return g.op("Concat", *unsqueezed, axis_i=dim) + + def mm(g, self, other): # Create a dummy C tensor. Only needed for API purposes, the value is # since beta = 0 @@ -349,11 +360,6 @@ def view(g, self, size): return g.op("Reshape", self, shape) -def stack(g, *args): - unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in args[:-1]] + [args[-1]] - return concat(g, *unsqueezed) - - @parse_args('v', 'i', 'i') def split(g, self, split_size, dim): size = self.type().sizes()[dim] @@ -555,9 +561,10 @@ def replication_pad(g, input, padding): @parse_args('v', 'is') def upsample_nearest2d(g, input, output_size): + height_scale = float(output_size[-2]) / input.type().sizes()[-2] + width_scale = float(output_size[-1]) / input.type().sizes()[-1] return g.op("Upsample", input, - height_scale_f=float(output_size[-2]) / input.type().sizes()[-2], - width_scale_f=float(output_size[-1]) / input.type().sizes()[-1], + scales_f=[1., 1., height_scale, width_scale], mode_s="nearest") @@ -565,10 +572,11 @@ def upsample_nearest2d(g, input, output_size): def upsample_bilinear2d(g, input, output_size, align_corners): if align_corners: return _unimplemented("upsample_bilinear2d", "align_corners == True") - w_scale = float(output_size[-1]) / input.type().sizes()[-1] - h_scale = float(output_size[-2]) / input.type().sizes()[-2] - return g.op("Upsample", input, width_scale_f=w_scale, - height_scale_f=h_scale, mode_s="bilinear") + height_scale = float(output_size[-2]) / input.type().sizes()[-2] + width_scale = float(output_size[-1]) / input.type().sizes()[-1] + return g.op("Upsample", input, + scales_f=[1., 1., height_scale, width_scale], + mode_s="bilinear") def gt(g, input, other): @@ -676,8 +684,10 @@ def index_select(g, self, dim, index): return g.op("Gather", self, index, axis_i=dim) -def index_put(g, *inputs): - return g.op("ATen", *inputs, operator_s='index_put') +def index_put(g, self, indices_list_value, values): + indices_list = list(_unpack_list(indices_list_value)) + args = [self] + indices_list + [values] + return g.op("ATen", *args, operator_s='index_put') def type_as(g, self, other): @@ -868,14 +878,17 @@ def topk(g, self, k, dim, 
largest, sorted, out=None): return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2) -@parse_args('v', 'is') def repeat(g, self, repeats): - if self.isTensor(): + if not _is_value(repeats): + repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) + const_repeats = _maybe_get_const(repeats, 'is') + + if self.isTensor() and not _is_value(const_repeats): sizes = self.type().sizes() - diff_dims = len(repeats) - len(sizes) + diff_dims = len(const_repeats) - len(sizes) if diff_dims > 0: self = view(g, self, [1] * diff_dims + sizes) - return g.op("Tile", self, g.op("Constant", value_t=torch.LongTensor(repeats))) + return g.op("Tile", self, repeats) def instance_norm(g, input, **kwargs): diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 4f9299d258ea3e..b770b900c4edd3 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -480,8 +480,14 @@ def _run_symbolic_function(g, n, inputs, env, operator_export_type=OperatorExpor raise RuntimeError("Unsupported prim::Constant kind: `{}`. Send a bug report.".format( n.kindOf("value"))) elif op_name == "ListConstruct": - unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] - return g.op("Concat", *unsqueezed, axis_i=0) + t = n.output().type() + # Tensor lists are used mostly for inputs to cat/stack. They need to be handled + # in those symbolics, and should become dead afterwards. + if t == torch._C.ListType.ofTensors(): + return None + elif t == torch._C.ListType.ofInts(): + unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] + return g.op("Concat", *unsqueezed, axis_i=0) elif op_name == "Undefined": # Undefined is not an ONNX operator; keep it as prim::Undefined # and let the exporter handle finally eliminating these diff --git a/torch/tensor.py b/torch/tensor.py index 6b587fcf903586..9784fd59c9d2fb 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -384,6 +384,8 @@ def __dir__(self): return sorted(keys) # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray` + __array_priority__ = 1000 # prefer Tensor ops over numpy ones + def __array__(self, dtype=None): if dtype is None: return self.cpu().numpy()
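Editor's note (not part of the patch): the new docstrings also describe a concatenated form for `targets` (a 1-D tensor of length `sum(target_lengths)` instead of `(N, S)`), but both inline examples only exercise the padded form. A minimal sketch of the concatenated form, mirroring the shapes used in the docstring examples; the variable names here are illustrative only:

    import torch
    import torch.nn.functional as F

    T, N, C = 50, 16, 20  # input length, batch size, alphabet size (0 is the blank)
    log_probs = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
    input_lengths = torch.full((N,), T, dtype=torch.long)
    target_lengths = torch.randint(10, 30, (N,), dtype=torch.long)
    # Second form: all targets concatenated into a single 1-D tensor;
    # labels must not use the blank index.
    targets = torch.randint(1, C, (int(target_lengths.sum()),), dtype=torch.long)

    loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
    loss.backward()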
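Editor's note (not part of the patch): the `__array_priority__ = 1000` line in torch/tensor.py leans on NumPy's convention of deferring a binary op to a non-ndarray operand that has a higher priority and implements the reflected method. A hedged sketch of the intended effect; the exact behaviour depends on the installed NumPy and PyTorch versions:

    import numpy as np
    import torch

    a = np.ones(3)
    t = torch.ones(3)

    # With the higher __array_priority__, np.ndarray.__add__ should return
    # NotImplemented, Python then falls back to Tensor.__radd__, and the mixed
    # expression is expected to stay a torch.Tensor rather than an ndarray.
    print(type(a + t))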