diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 491858a1a25b43..e280db56c42257 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -74,7 +74,7 @@ fi WERROR=1 python setup.py install # Add the test binaries so that they won't be git clean'ed away -git add -f build/bin build/lib +git add -f build/bin # Testing ATen install if [[ "$BUILD_ENVIRONMENT" != *cuda* ]]; then @@ -101,3 +101,11 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then make html popd fi + +# Test no-Python build +if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then + echo "Building libtorch" + # NB: Install outside of source directory (at the same level as the root + # pytorch folder) so that it doesn't get cleaned away prior to docker push. + WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$PWD/../cpp-build" +fi diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh index af5240f2caa88f..41b272eae63a8f 100755 --- a/.jenkins/pytorch/macos-build.sh +++ b/.jenkins/pytorch/macos-build.sh @@ -61,12 +61,6 @@ export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} python setup.py install -# this is a bit hacky, but not too bad. Bundle the test binaries into -# the installation directory, so they can catch a free ride on the 7z -# train. -mkdir -p ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch/test_binaries/build -mv build/{bin,lib} ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch/test_binaries/build/ - # Upload torch binaries when the build job is finished 7z a ${IMAGE_COMMIT_TAG}.7z ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* aws s3 cp ${IMAGE_COMMIT_TAG}.7z s3://ossci-macos-build/pytorch/${IMAGE_COMMIT_TAG}.7z --acl public-read diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 222fea663a5730..92ef7ad191adb0 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -50,13 +50,22 @@ test_python_all() { test_cpp_api() { # C++ API + # NB: Install outside of source directory (at the same level as the root + # pytorch folder) so that it doesn't get cleaned away prior to docker push. + # But still clean it before we perform our own build. 
+ # + CPP_BUILD="$PWD/../cpp-build" + rm -rf $CPP_BUILD + mkdir -p $CPP_BUILD + WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$CPP_BUILD" + python tools/download_mnist.py --quiet -d test/cpp/api/mnist # Unfortunately it seems like the test can't load from miniconda3 # without these paths being set export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$PWD/miniconda3/lib" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/miniconda3/lib" - ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch/test_binaries/build/bin/test_api + "$CPP_BUILD"/caffe2/bin/test_api } if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 6527e734490d7f..4530b3f9fa483b 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -108,13 +108,14 @@ test_torchvision() { test_libtorch() { if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then echo "Testing libtorch" + CPP_BUILD="$PWD/../cpp-build" if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then - ./build/bin/test_jit + "$CPP_BUILD"/caffe2/bin/test_jit else - ./build/bin/test_jit "[cpu]" + "$CPP_BUILD"/caffe2/bin/test_jit "[cpu]" fi python tools/download_mnist.py --quiet -d test/cpp/api/mnist - OMP_NUM_THREADS=2 ./build/bin/test_api + OMP_NUM_THREADS=2 "$CPP_BUILD"/caffe2/bin/test_api fi } diff --git a/aten/src/ATen/Backend.h b/aten/src/ATen/Backend.h new file mode 100644 index 00000000000000..ccb96feeed238a --- /dev/null +++ b/aten/src/ATen/Backend.h @@ -0,0 +1,58 @@ +#pragma once +#include + +namespace at { + +enum class Backend { CPU, CUDA, SparseCPU, SparseCUDA, Undefined, NumOptions }; + +constexpr Backend kCPU = Backend::CPU; +constexpr Backend kCUDA = Backend::CUDA; +constexpr Backend kSparseCPU = Backend::SparseCPU; +constexpr Backend kSparseCUDA = Backend::SparseCUDA; + +static inline Backend toSparse(Backend b) { + switch (b) { + case Backend::CPU: + return Backend::SparseCPU; + case Backend::CUDA: + return Backend::SparseCUDA; + case Backend::SparseCPU: + return Backend::SparseCPU; + case Backend::SparseCUDA: + return Backend::SparseCUDA; + default: + throw std::runtime_error("Unknown backend"); + } +} + +static inline Backend toDense(Backend b) { + switch (b) { + case Backend::CPU: + return Backend::CPU; + case Backend::CUDA: + return Backend::CUDA; + case Backend::SparseCPU: + return Backend::CPU; + case Backend::SparseCUDA: + return Backend::CUDA; + default: + throw std::runtime_error("Unknown backend"); + } +} + +static inline const char* toString(Backend b) { + switch (b) { + case Backend::CPU: + return "CPU"; + case Backend::CUDA: + return "CUDA"; + case Backend::SparseCPU: + return "SparseCPU"; + case Backend::SparseCUDA: + return "SparseCUDA"; + default: + return "UNKNOWN_BACKEND"; + } +} + +} // namespace at diff --git a/aten/src/ATen/Device.h b/aten/src/ATen/Device.h index b0a99b6a4c3e50..b777e76cde50ec 100644 --- a/aten/src/ATen/Device.h +++ b/aten/src/ATen/Device.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,8 @@ struct Device { } } - /// Constructs a new `Device` from a `DeviceType` and an optional device index. + /// Constructs a new `Device` from a `DeviceType` and an optional device + /// index. 
/* implicit */ Device(DeviceType type, int32_t index = -1) : type_(type), index_(index) { AT_CHECK( diff --git a/aten/src/ATen/ScalarType.h b/aten/src/ATen/ScalarType.h index 0af1d72d2b6b6b..5b1f7877df268a 100644 --- a/aten/src/ATen/ScalarType.h +++ b/aten/src/ATen/ScalarType.h @@ -1,168 +1,4 @@ #pragma once - -#include "ATen/ATenGeneral.h" -#include "ATen/core/ArrayRef.h" -#include "ATen/core/Half.h" - -#include -#include - -namespace at { - -// NB: Order matters for this macro; it is relied upon in -// _promoteTypesLookup and the serialization format. -#define AT_FORALL_SCALAR_TYPES(_) \ -_(uint8_t,Byte,i) /* 0 */ \ -_(int8_t,Char,i) /* 1 */ \ -_(int16_t,Short,i) /* 2 */ \ -_(int,Int,i) /* 3 */ \ -_(int64_t,Long,i) /* 4 */ \ -_(at::Half,Half,d) /* 5 */ \ -_(float,Float,d) /* 6 */ \ -_(double,Double,d) /* 7 */ - -#define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(_) \ -_(uint8_t,Byte,i) \ -_(int8_t,Char,i) \ -_(int16_t,Short,i) \ -_(int,Int,i) \ -_(int64_t,Long,i) \ -_(float,Float,d) \ -_(double,Double,d) - -enum class ScalarType { -#define DEFINE_ENUM(_1,n,_2) \ - n, - AT_FORALL_SCALAR_TYPES(DEFINE_ENUM) -#undef DEFINE_ENUM - Undefined, // 8 - NumOptions -}; - -enum class Backend { - CPU, - CUDA, - SparseCPU, - SparseCUDA, - Undefined, - NumOptions -}; - -constexpr Backend kCPU = Backend::CPU; -constexpr Backend kCUDA = Backend::CUDA; -constexpr Backend kSparseCPU = Backend::SparseCPU; -constexpr Backend kSparseCUDA = Backend::SparseCUDA; - -static inline Backend toSparse(Backend b) { - switch (b) { - case Backend::CPU: return Backend::SparseCPU; - case Backend::CUDA: return Backend::SparseCUDA; - case Backend::SparseCPU: return Backend::SparseCPU; - case Backend::SparseCUDA: return Backend::SparseCUDA; - default: throw std::runtime_error("Unknown backend"); - } -} - -static inline Backend toDense(Backend b) { - switch (b) { - case Backend::CPU: return Backend::CPU; - case Backend::CUDA: return Backend::CUDA; - case Backend::SparseCPU: return Backend::CPU; - case Backend::SparseCUDA: return Backend::CUDA; - default: throw std::runtime_error("Unknown backend"); - } -} - -static inline const char * toString(Backend b) { - switch(b) { - case Backend::CPU: return "CPU"; - case Backend::CUDA: return "CUDA"; - case Backend::SparseCPU: return "SparseCPU"; - case Backend::SparseCUDA: return "SparseCUDA"; - default: return "UNKNOWN_BACKEND"; - } -} - -#define DEFINE_CONSTANT(_,name,_2) \ -constexpr ScalarType k##name = ScalarType::name; - -AT_FORALL_SCALAR_TYPES(DEFINE_CONSTANT) -#undef DEFINE_CONSTANT - -static inline const char * toString(ScalarType t) { -#define DEFINE_CASE(_,name,_2) \ - case ScalarType:: name : return #name; - - switch(t) { - AT_FORALL_SCALAR_TYPES(DEFINE_CASE) - default: - return "UNKNOWN_SCALAR"; - } -#undef DEFINE_CASE -} - -static inline size_t elementSize(ScalarType t) { -#define CASE_ELEMENTSIZE_CASE(ctype,name,_2) \ - case ScalarType:: name : return sizeof(ctype); - - switch(t) { - AT_FORALL_SCALAR_TYPES(CASE_ELEMENTSIZE_CASE) - default: - AT_ERROR("Unknown ScalarType"); - } -#undef CASE_ELEMENTSIZE_CASE -} - -static inline bool isIntegralType(ScalarType t) { - return (t == ScalarType::Byte || - t == ScalarType::Char || - t == ScalarType::Int || - t == ScalarType::Long || - t == ScalarType::Short); -} - -static inline bool isFloatingType(ScalarType t) { - return (t == ScalarType::Double || - t == ScalarType::Float || - t == ScalarType::Half); -} - -static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { - // This is generated according to NumPy's promote_types - 
constexpr auto u1 = ScalarType::Byte; - constexpr auto i1 = ScalarType::Char; - constexpr auto i2 = ScalarType::Short; - constexpr auto i4 = ScalarType::Int; - constexpr auto i8 = ScalarType::Long; - constexpr auto f2 = ScalarType::Half; - constexpr auto f4 = ScalarType::Float; - constexpr auto f8 = ScalarType::Double; - constexpr auto ud = ScalarType::Undefined; - static constexpr ScalarType _promoteTypesLookup - [static_cast(ScalarType::NumOptions)] - [static_cast(ScalarType::NumOptions)] = { - /* u1 i1 i2 i4 i8 f2 f4 f8, ud */ - /* u1 */ { u1, i2, i2, i4, i8, f2, f4, f8, ud }, - /* i1 */ { i2, i1, i2, i4, i8, f2, f4, f8, ud }, - /* i2 */ { i2, i2, i2, i4, i8, f4, f4, f8, ud }, - /* i4 */ { i4, i4, i4, i4, i8, f8, f4, f8, ud }, - /* i8 */ { i8, i8, i8, i8, i8, f8, f4, f8, ud }, - /* f2 */ { f2, f2, f4, f8, f8, f2, f4, f8, ud }, - /* f4 */ { f4, f4, f4, f4, f4, f4, f4, f8, ud }, - /* f8 */ { f8, f8, f8, f8, f8, f8, f8, f8, ud }, - /* ud */ { ud, ud, ud, ud, ud, ud, ud, ud, ud }, - }; - return _promoteTypesLookup[static_cast(a)][static_cast(b)]; -} - -struct Tensor; -typedef ArrayRef IntList; -typedef ArrayRef TensorList; - -} // namespace at - -inline std::ostream& operator<<( - std::ostream& stream, - at::ScalarType scalar_type) { - return stream << at::toString(scalar_type); -} +#include // for BC reasons +#include +#include diff --git a/aten/src/ATen/core/ATenCoreTest.h b/aten/src/ATen/core/ATenCoreTest.h index ee8471f66fe258..a6769b10b93eed 100644 --- a/aten/src/ATen/core/ATenCoreTest.h +++ b/aten/src/ATen/core/ATenCoreTest.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace at { diff --git a/aten/src/ATen/core/Backtrace.h b/aten/src/ATen/core/Backtrace.h index ec4c17c6f6a531..9aa3ac826ce78c 100644 --- a/aten/src/ATen/core/Backtrace.h +++ b/aten/src/ATen/core/Backtrace.h @@ -4,7 +4,7 @@ #include #include -#include +#include namespace at { /// Utility to demangle a C++ symbol name. diff --git a/aten/src/ATen/core/DeviceType.h b/aten/src/ATen/core/DeviceType.h index f3b3dcb03cde9d..0a3d32bfe14317 100644 --- a/aten/src/ATen/core/DeviceType.h +++ b/aten/src/ATen/core/DeviceType.h @@ -3,7 +3,7 @@ // ATen/core (which would require a lot more build system hacking.) // If you modify me, keep me synchronized with that file. -#include +#include #include @@ -12,19 +12,21 @@ namespace at { // Underlying type declared to be int32_t for consistency with protobufs. enum class DeviceType : int32_t { CPU = 0, - CUDA = 1, // CUDA. - MKLDNN = 2, // Reserved for explicit MKLDNN - OPENGL = 3, // OpenGL - OPENCL = 4, // OpenCL - IDEEP = 5, // IDEEP. - HIP = 6, // AMD HIP + CUDA = 1, // CUDA. + MKLDNN = 2, // Reserved for explicit MKLDNN + OPENGL = 3, // OpenGL + OPENCL = 4, // OpenCL + IDEEP = 5, // IDEEP. + HIP = 6, // AMD HIP // Change the following number if you add more devices in the code. COMPILE_TIME_MAX_DEVICE_TYPES = 7, - ONLY_FOR_TEST = 20901701, // This device type is only for test. + ONLY_FOR_TEST = 20901701, // This device type is only for test. 
}; -AT_CORE_API std::string DeviceTypeName(at::DeviceType d, bool lower_case = false); +AT_CORE_API std::string DeviceTypeName( + at::DeviceType d, + bool lower_case = false); -} +} // namespace at AT_CORE_API std::ostream& operator<<(std::ostream& stream, at::DeviceType type); diff --git a/aten/src/ATen/core/Error.h b/aten/src/ATen/core/Error.h index 5b567dd0de3506..fffc2aef56d035 100644 --- a/aten/src/ATen/core/Error.h +++ b/aten/src/ATen/core/Error.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h index d89b496d7083b8..14ceeff51284b1 100644 --- a/aten/src/ATen/core/Half-inl.h +++ b/aten/src/ATen/core/Half-inl.h @@ -2,7 +2,7 @@ #include #include -#include +#include #ifdef __CUDACC__ #include diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index 385f18e78cab02..5b11711519ff58 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -9,7 +9,7 @@ /// If you are writing a compute bound kernel, you can use the CUDA half /// intrinsics directly on the Half type from device code. -#include +#include #include #include diff --git a/aten/src/ATen/core/CoreAPI.h b/aten/src/ATen/core/Macros.h similarity index 61% rename from aten/src/ATen/core/CoreAPI.h rename to aten/src/ATen/core/Macros.h index 0ee114d9f4cfdd..dcad67ddb68c8f 100644 --- a/aten/src/ATen/core/CoreAPI.h +++ b/aten/src/ATen/core/Macros.h @@ -1,3 +1,5 @@ +#pragma once + // You can use the definition AT_CORE_STATIC_WINDOWS to control whether // or not we apply __declspec. You will want to set this as // -DAT_CORE_STATIC_WINDOWS=1 when compiling code which links @@ -18,3 +20,11 @@ #else #define AT_CORE_API #endif + +// Disable the copy and assignment operator for a class. Note that this will +// disable the usage of the class in std containers. +#ifndef DISABLE_COPY_AND_ASSIGN +#define DISABLE_COPY_AND_ASSIGN(classname) \ + classname(const classname&) = delete; \ + classname& operator=(const classname&) = delete +#endif diff --git a/aten/src/ATen/core/ScalarType.h b/aten/src/ATen/core/ScalarType.h new file mode 100644 index 00000000000000..804f73d356e972 --- /dev/null +++ b/aten/src/ATen/core/ScalarType.h @@ -0,0 +1,123 @@ +#pragma once + +#include "ATen/core/ArrayRef.h" +#include "ATen/core/Half.h" + +#include +#include + +namespace at { + +// NB: Order matters for this macro; it is relied upon in +// _promoteTypesLookup and the serialization format. 
+#define AT_FORALL_SCALAR_TYPES(_) \ +_(uint8_t,Byte,i) /* 0 */ \ +_(int8_t,Char,i) /* 1 */ \ +_(int16_t,Short,i) /* 2 */ \ +_(int,Int,i) /* 3 */ \ +_(int64_t,Long,i) /* 4 */ \ +_(at::Half,Half,d) /* 5 */ \ +_(float,Float,d) /* 6 */ \ +_(double,Double,d) /* 7 */ + +#define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(_) \ +_(uint8_t,Byte,i) \ +_(int8_t,Char,i) \ +_(int16_t,Short,i) \ +_(int,Int,i) \ +_(int64_t,Long,i) \ +_(float,Float,d) \ +_(double,Double,d) + +enum class ScalarType { +#define DEFINE_ENUM(_1,n,_2) \ + n, + AT_FORALL_SCALAR_TYPES(DEFINE_ENUM) +#undef DEFINE_ENUM + Undefined, // 8 + NumOptions +}; + +#define DEFINE_CONSTANT(_,name,_2) \ +constexpr ScalarType k##name = ScalarType::name; + +AT_FORALL_SCALAR_TYPES(DEFINE_CONSTANT) +#undef DEFINE_CONSTANT + +static inline const char * toString(ScalarType t) { +#define DEFINE_CASE(_,name,_2) \ + case ScalarType:: name : return #name; + + switch(t) { + AT_FORALL_SCALAR_TYPES(DEFINE_CASE) + default: + return "UNKNOWN_SCALAR"; + } +#undef DEFINE_CASE +} + +static inline size_t elementSize(ScalarType t) { +#define CASE_ELEMENTSIZE_CASE(ctype,name,_2) \ + case ScalarType:: name : return sizeof(ctype); + + switch(t) { + AT_FORALL_SCALAR_TYPES(CASE_ELEMENTSIZE_CASE) + default: + AT_ERROR("Unknown ScalarType"); + } +#undef CASE_ELEMENTSIZE_CASE +} + +static inline bool isIntegralType(ScalarType t) { + return (t == ScalarType::Byte || + t == ScalarType::Char || + t == ScalarType::Int || + t == ScalarType::Long || + t == ScalarType::Short); +} + +static inline bool isFloatingType(ScalarType t) { + return (t == ScalarType::Double || + t == ScalarType::Float || + t == ScalarType::Half); +} + +static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { + // This is generated according to NumPy's promote_types + constexpr auto u1 = ScalarType::Byte; + constexpr auto i1 = ScalarType::Char; + constexpr auto i2 = ScalarType::Short; + constexpr auto i4 = ScalarType::Int; + constexpr auto i8 = ScalarType::Long; + constexpr auto f2 = ScalarType::Half; + constexpr auto f4 = ScalarType::Float; + constexpr auto f8 = ScalarType::Double; + constexpr auto ud = ScalarType::Undefined; + static constexpr ScalarType _promoteTypesLookup + [static_cast(ScalarType::NumOptions)] + [static_cast(ScalarType::NumOptions)] = { + /* u1 i1 i2 i4 i8 f2 f4 f8, ud */ + /* u1 */ { u1, i2, i2, i4, i8, f2, f4, f8, ud }, + /* i1 */ { i2, i1, i2, i4, i8, f2, f4, f8, ud }, + /* i2 */ { i2, i2, i2, i4, i8, f4, f4, f8, ud }, + /* i4 */ { i4, i4, i4, i4, i8, f8, f4, f8, ud }, + /* i8 */ { i8, i8, i8, i8, i8, f8, f4, f8, ud }, + /* f2 */ { f2, f2, f4, f8, f8, f2, f4, f8, ud }, + /* f4 */ { f4, f4, f4, f4, f4, f4, f4, f8, ud }, + /* f8 */ { f8, f8, f8, f8, f8, f8, f8, f8, ud }, + /* ud */ { ud, ud, ud, ud, ud, ud, ud, ud, ud }, + }; + return _promoteTypesLookup[static_cast(a)][static_cast(b)]; +} + +struct Tensor; +typedef ArrayRef IntList; +typedef ArrayRef TensorList; + +} // namespace at + +inline std::ostream& operator<<( + std::ostream& stream, + at::ScalarType scalar_type) { + return stream << at::toString(scalar_type); +} diff --git a/aten/src/ATen/core/SmallVector.h b/aten/src/ATen/core/SmallVector.h index 269b21b0d5cf37..483144794f46e1 100644 --- a/aten/src/ATen/core/SmallVector.h +++ b/aten/src/ATen/core/SmallVector.h @@ -21,7 +21,7 @@ #pragma once #include -#include +#include #include #include diff --git a/aten/src/ATen/core/TensorTypeId.cpp b/aten/src/ATen/core/TensorTypeId.cpp new file mode 100644 index 00000000000000..605d303ad62ee3 --- /dev/null +++ 
b/aten/src/ATen/core/TensorTypeId.cpp @@ -0,0 +1,5 @@ +#include "ATen/core/TensorTypeId.h" + +std::ostream& operator<<(std::ostream& str, at::TensorTypeId rhs) { + return str << rhs.underlyingId(); +} diff --git a/aten/src/ATen/core/TensorTypeId.h b/aten/src/ATen/core/TensorTypeId.h new file mode 100644 index 00000000000000..5fc411137e08b4 --- /dev/null +++ b/aten/src/ATen/core/TensorTypeId.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include +#include +#include "ATen/core/IdWrapper.h" + +namespace at { +class TensorTypeId; +} + +std::ostream& operator<<(std::ostream&, at::TensorTypeId); + +namespace at { + +namespace details { +using _tensorTypeId_underlyingType = uint8_t; +} + +/** + * Dynamic type ID of a Tensor argument. It represents something like + * CPUTensor, etc. + */ +class TensorTypeId final + : public at:: + IdWrapper { + public: + // Don't use this! + // Unfortunately, a default constructor needs to be defined because of + // https://reviews.llvm.org/D41223 + constexpr TensorTypeId() noexcept : IdWrapper(0) {} + + private: + constexpr explicit TensorTypeId( + details::_tensorTypeId_underlyingType id) noexcept + : IdWrapper(id) {} + + friend class TensorTypeIdCreator; + friend std::ostream& ::operator<<(std::ostream&, TensorTypeId); +}; + +} // namespace at + +AT_DEFINE_HASH_FOR_IDWRAPPER(at::TensorTypeId) diff --git a/aten/src/ATen/core/TensorTypeIdRegistration.cpp b/aten/src/ATen/core/TensorTypeIdRegistration.cpp new file mode 100644 index 00000000000000..af0b992e51c6ff --- /dev/null +++ b/aten/src/ATen/core/TensorTypeIdRegistration.cpp @@ -0,0 +1,62 @@ +#include +#include +#include + +namespace at { + +constexpr at::TensorTypeId TensorTypeIdCreator::max_id_; + +TensorTypeIds::TensorTypeIds() : creator_(), registry_() {} + +TensorTypeIds& TensorTypeIds::singleton() { + static TensorTypeIds singleton; + return singleton; +} + +TensorTypeIdCreator::TensorTypeIdCreator() : last_id_(0) {} + +at::TensorTypeId TensorTypeIdCreator::create() { + auto id = TensorTypeId(++last_id_); + + if (id == max_id_) { + // If this happens in prod, we have to change + // details::_tensorTypeId_underlyingType to uint16_t. + AT_ERROR( + "Tried to define more than ", + std::numeric_limits::max() - 1, + " tensor types, which is unsupported"); + } + + return id; +} + +TensorTypeIdRegistry::TensorTypeIdRegistry() : registeredTypeIds_(), mutex_() {} + +void TensorTypeIdRegistry::registerId(at::TensorTypeId id) { + std::lock_guard lock(mutex_); + registeredTypeIds_.emplace(id); +} + +void TensorTypeIdRegistry::deregisterId(at::TensorTypeId id) { + std::lock_guard lock(mutex_); + registeredTypeIds_.erase(id); +} + +at::TensorTypeId TensorTypeIds::createAndRegister() { + at::TensorTypeId id = creator_.create(); + registry_.registerId(id); + return id; +} + +void TensorTypeIds::deregister(at::TensorTypeId id) { + registry_.deregisterId(id); +} + +TensorTypeIdRegistrar::TensorTypeIdRegistrar() + : id_(TensorTypeIds::singleton().createAndRegister()) {} + +TensorTypeIdRegistrar::~TensorTypeIdRegistrar() { + TensorTypeIds::singleton().deregister(id_); +} + +} // namespace at diff --git a/aten/src/ATen/core/TensorTypeIdRegistration.h b/aten/src/ATen/core/TensorTypeIdRegistration.h new file mode 100644 index 00000000000000..a890c7990c4a41 --- /dev/null +++ b/aten/src/ATen/core/TensorTypeIdRegistration.h @@ -0,0 +1,99 @@ +#pragma once + +/** + * To register your own tensor types, do in a header file: + * AT_DECLARE_TENSOR_TYPE(MY_TENSOR) + * and in one (!) 
cpp file: + * AT_DEFINE_TENSOR_TYPE(MY_TENSOR) + * Both must be in the same namespace. + */ + +#include "ATen/core/Macros.h" +#include "ATen/core/TensorTypeId.h" + +#include +#include + +namespace at { + +class TensorTypeIdCreator final { + public: + TensorTypeIdCreator(); + + at::TensorTypeId create(); + + static constexpr at::TensorTypeId undefined() noexcept { + return TensorTypeId(0); + } + + private: + std::atomic last_id_; + + static constexpr at::TensorTypeId max_id_ = TensorTypeId( + std::numeric_limits::max()); + + DISABLE_COPY_AND_ASSIGN(TensorTypeIdCreator); +}; + +class TensorTypeIdRegistry final { + public: + TensorTypeIdRegistry(); + + void registerId(at::TensorTypeId id); + void deregisterId(at::TensorTypeId id); + + private: + std::unordered_set registeredTypeIds_; + std::mutex mutex_; + + DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistry); +}; + +class TensorTypeIds final { + public: + static TensorTypeIds& singleton(); + + at::TensorTypeId createAndRegister(); + void deregister(at::TensorTypeId id); + + static constexpr at::TensorTypeId undefined() noexcept; + + private: + TensorTypeIds(); + + TensorTypeIdCreator creator_; + TensorTypeIdRegistry registry_; + + DISABLE_COPY_AND_ASSIGN(TensorTypeIds); +}; + +inline constexpr at::TensorTypeId TensorTypeIds::undefined() noexcept { + return TensorTypeIdCreator::undefined(); +} + +class TensorTypeIdRegistrar final { + public: + TensorTypeIdRegistrar(); + ~TensorTypeIdRegistrar(); + + at::TensorTypeId id() const noexcept; + + private: + at::TensorTypeId id_; + + DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistrar); +}; + +inline at::TensorTypeId TensorTypeIdRegistrar::id() const noexcept { + return id_; +} + +} // namespace at + +#define AT_DECLARE_TENSOR_TYPE(TensorName) at::TensorTypeId TensorName(); + +#define AT_DEFINE_TENSOR_TYPE(TensorName) \ + at::TensorTypeId TensorName() { \ + static TensorTypeIdRegistrar registration_raii; \ + return registration_raii.id(); \ + } diff --git a/aten/src/ATen/core/UniqueVoidPtr.h b/aten/src/ATen/core/UniqueVoidPtr.h index 299c729e125a58..405d286308e087 100644 --- a/aten/src/ATen/core/UniqueVoidPtr.h +++ b/aten/src/ATen/core/UniqueVoidPtr.h @@ -1,6 +1,6 @@ #include -#include +#include namespace at { diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 1e977f0f493f14..123244d220665f 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -223,7 +223,13 @@ target_include_directories(caffe2 PRIVATE ${Caffe2_CPU_INCLUDE}) target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") # Set standard properties on the target aten_set_target_props(caffe2) + +if (MSVC) target_compile_options(caffe2 INTERFACE "-std=c++11") +else() +target_compile_options(caffe2 INTERFACE "$<$:-std=c++11>") +endif() + target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") if (MSVC AND NOT BUILD_SHARED_LIBS) # Note [Supporting both static and dynamic libraries on Window] diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 32b06b649be258..7d002028b14f36 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -26,6 +26,8 @@ // is automatically generated by the cmake script during build. #include "caffe2/core/macros.h" +#include "ATen/core/Macros.h" + namespace caffe2 { // Data type for caffe2 Index/Size. We use size_t to be safe here as well as for @@ -62,14 +64,6 @@ using std::vector; #define CAFFE2_USED __attribute__((__used__)) #endif //_MSC_VER -// Disable the copy and assignment operator for a class. 
Note that this will -// disable the usage of the class in std containers. -#ifndef DISABLE_COPY_AND_ASSIGN -#define DISABLE_COPY_AND_ASSIGN(classname) \ - classname(const classname&) = delete; \ - classname& operator=(const classname&) = delete -#endif - // Define enabled when building for iOS or Android devices #if !defined(CAFFE2_MOBILE) #if defined(__ANDROID__) diff --git a/caffe2/core/dispatch/CMakeLists.txt b/caffe2/core/dispatch/CMakeLists.txt index c028bfa2b93070..da3177cb8cc0b0 100644 --- a/caffe2/core/dispatch/CMakeLists.txt +++ b/caffe2/core/dispatch/CMakeLists.txt @@ -8,8 +8,6 @@ set(LIB_SOURCES LeftRight.cpp OpSchema.cpp OpSchemaRegistration.cpp - TensorTypeId.cpp - TensorTypeIdRegistration.cpp ) set(TEST_SOURCES diff --git a/caffe2/core/dispatch/TensorTypeId.cpp b/caffe2/core/dispatch/TensorTypeId.cpp deleted file mode 100644 index fe1ad1b7c88a95..00000000000000 --- a/caffe2/core/dispatch/TensorTypeId.cpp +++ /dev/null @@ -1,5 +0,0 @@ -#include "caffe2/core/dispatch/TensorTypeId.h" - -std::ostream& operator<<(std::ostream& str, c10::TensorTypeId rhs) { - return str << rhs.underlyingId(); -} diff --git a/caffe2/core/dispatch/TensorTypeId.h b/caffe2/core/dispatch/TensorTypeId.h deleted file mode 100644 index 244817904667b9..00000000000000 --- a/caffe2/core/dispatch/TensorTypeId.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include "ATen/core/IdWrapper.h" -#include -#include -#include -#include - -namespace c10 { -class TensorTypeId; -} - -std::ostream& operator<<(std::ostream&, c10::TensorTypeId); - -namespace c10 { - -namespace details { - using _tensorTypeId_underlyingType = uint8_t; -} - -/** - * Dynamic type ID of a Tensor argument. It represents something like CPUTensor, etc. - */ -class TensorTypeId final : public at::IdWrapper { -public: - // Don't use this! - // Unfortunately, a default constructor needs to be defined because of https://reviews.llvm.org/D41223 - constexpr TensorTypeId() noexcept: IdWrapper(0) {} -private: - constexpr explicit TensorTypeId(details::_tensorTypeId_underlyingType id) noexcept: IdWrapper(id) {} - - friend class TensorTypeIdCreator; - friend std::ostream& ::operator<<(std::ostream&, TensorTypeId); -}; - -} // namespace c10 - -AT_DEFINE_HASH_FOR_IDWRAPPER(c10::TensorTypeId) diff --git a/caffe2/core/dispatch/TensorTypeIdRegistration.cpp b/caffe2/core/dispatch/TensorTypeIdRegistration.cpp deleted file mode 100644 index 31b4c6b671aa29..00000000000000 --- a/caffe2/core/dispatch/TensorTypeIdRegistration.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "caffe2/core/dispatch/TensorTypeIdRegistration.h" -#include - -namespace c10 { - -constexpr TensorTypeId TensorTypeIdCreator::max_id_; - -TensorTypeIds::TensorTypeIds() -: creator_(), registry_() {} - -TensorTypeIds& TensorTypeIds::singleton() { - static TensorTypeIds singleton; - return singleton; -} - -TensorTypeIdCreator::TensorTypeIdCreator() -: last_id_(0) {} - -TensorTypeId TensorTypeIdCreator::create() { - auto id = TensorTypeId(++last_id_); - - if (id == max_id_) { - // If this happens in prod, we have to change details::_tensorTypeId_underlyingType to uint16_t. 
- throw std::logic_error("Tried to define more than " + c10::guts::to_string(std::numeric_limits::max()-1) + " tensor types, which is unsupported"); - } - - return id; -} - -TensorTypeIdRegistry::TensorTypeIdRegistry() -: registeredTypeIds_(), mutex_() {} - -void TensorTypeIdRegistry::registerId(TensorTypeId id) { - std::lock_guard lock(mutex_); - registeredTypeIds_.emplace(id); -} - -void TensorTypeIdRegistry::deregisterId(TensorTypeId id) { - std::lock_guard lock(mutex_); - registeredTypeIds_.erase(id); -} - -TensorTypeId TensorTypeIds::createAndRegister() { - TensorTypeId id = creator_.create(); - registry_.registerId(id); - return id; -} - -void TensorTypeIds::deregister(TensorTypeId id) { - registry_.deregisterId(id); -} - -TensorTypeIdRegistrar::TensorTypeIdRegistrar() -: id_(TensorTypeIds::singleton().createAndRegister()) { -} - -TensorTypeIdRegistrar::~TensorTypeIdRegistrar() { - TensorTypeIds::singleton().deregister(id_); -} - -} // namespace c10 diff --git a/caffe2/core/dispatch/TensorTypeIdRegistration.h b/caffe2/core/dispatch/TensorTypeIdRegistration.h deleted file mode 100644 index a7af6337acb2a5..00000000000000 --- a/caffe2/core/dispatch/TensorTypeIdRegistration.h +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once - -/** - * To register your own tensor types, do in a header file: - * C10_DECLARE_TENSOR_TYPE(MY_TENSOR) - * and in one (!) cpp file: - * C10_DEFINE_TENSOR_TYPE(MY_TENSOR) - * Both must be in the same namespace. - */ - -#include "caffe2/core/dispatch/TensorTypeId.h" -#include "caffe2/core/common.h" -#include -#include "caffe2/utils/flat_hash_map/flat_hash_map.h" - -namespace c10 { - -class TensorTypeIdCreator final { -public: - TensorTypeIdCreator(); - - TensorTypeId create(); - - static constexpr TensorTypeId undefined() noexcept { - return TensorTypeId(0); - } - -private: - std::atomic last_id_; - - static constexpr TensorTypeId max_id_ = TensorTypeId(std::numeric_limits::max()); - - DISABLE_COPY_AND_ASSIGN(TensorTypeIdCreator); -}; - -class TensorTypeIdRegistry final { -public: - TensorTypeIdRegistry(); - - void registerId(TensorTypeId id); - void deregisterId(TensorTypeId id); - -private: - ska::flat_hash_set registeredTypeIds_; - std::mutex mutex_; - - DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistry); -}; - -class TensorTypeIds final { -public: - static TensorTypeIds& singleton(); - - TensorTypeId createAndRegister(); - void deregister(TensorTypeId id); - - static constexpr TensorTypeId undefined() noexcept; - -private: - TensorTypeIds(); - - TensorTypeIdCreator creator_; - TensorTypeIdRegistry registry_; - - DISABLE_COPY_AND_ASSIGN(TensorTypeIds); -}; - -inline constexpr TensorTypeId TensorTypeIds::undefined() noexcept { - return TensorTypeIdCreator::undefined(); -} - -class TensorTypeIdRegistrar final { -public: - TensorTypeIdRegistrar(); - ~TensorTypeIdRegistrar(); - - TensorTypeId id() const noexcept; - -private: - TensorTypeId id_; - - DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistrar); -}; - -inline TensorTypeId TensorTypeIdRegistrar::id() const noexcept { - return id_; -} - -} // namespace c10 - -#define C10_DECLARE_TENSOR_TYPE(TensorName) \ - TensorTypeId TensorName(); \ - -#define C10_DEFINE_TENSOR_TYPE(TensorName) \ - TensorTypeId TensorName() { \ - static TensorTypeIdRegistrar registration_raii; \ - return registration_raii.id(); \ - } diff --git a/cmake/public/threads.cmake b/cmake/public/threads.cmake index 44c3f0ed9dc46e..f223f497c76f43 100644 --- a/cmake/public/threads.cmake +++ b/cmake/public/threads.cmake @@ -5,14 +5,12 @@ if(THREADS_FOUND AND NOT TARGET 
Threads::Threads) add_library(Threads::Threads INTERFACE IMPORTED) if(THREADS_HAVE_PTHREAD_ARG) - set_property( - TARGET Threads::Threads - PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread") + set_property(TARGET Threads::Threads + PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread") endif() if(CMAKE_THREAD_LIBS_INIT) - set_property( - TARGET Threads::Threads - PROPERTY INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set_property(TARGET Threads::Threads + PROPERTY INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") endif() -endif() \ No newline at end of file +endif() diff --git a/test/test_jit.py b/test/test_jit.py index 4e67cf69b30aab..08e80933b1ea24 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -107,6 +107,20 @@ def get_fn(file_name, script_path): return fn +# Python equivalents for the empty list construction builtins. We need +# these otherwise the tests won't execute in regular Python mode. +def _construct_empty_int_list(): + return [] + + +def _construct_empty_float_list(): + return [] + + +def _construct_empty_tensor_list(): + return [] + + class JitTestCase(TestCase): _do_cuda_memory_leak_check = True @@ -1816,6 +1830,26 @@ def capture_stdout(self): os.close(r) os.close(w) + def checkScriptRaisesRegex(self, script, inputs, exception, regex, + optimize=True, outputs=None, capture_output=False): + """ + Checks that a given function will throw the correct exception, + when executed with normal python, the string frontend, and the AST frontend + """ + # normal python + with self.assertRaisesRegex(exception, regex): + script(*inputs) + # string frontend + with self.assertRaisesRegex(exception, regex): + source = textwrap.dedent(inspect.getsource(script)) + cu = torch.jit.CompilationUnit(source, optimize) + ge = getattr(cu, script.__name__) + ge(*inputs) + # python AST frontend + with self.assertRaisesRegex(exception, regex): + ge = torch.jit.script(script, optimize) + ge(*inputs) + def checkScript(self, script, inputs, optimize=True, outputs=None, name='func', capture_output=False, frames_up=1): if isinstance(script, str): cu = torch.jit.CompilationUnit(script, optimize, _frames_up=frames_up) @@ -1948,6 +1982,11 @@ def func(x): x = torch.rand(10, dtype=torch.float, requires_grad=True) self.checkScript(func, [x], optimize=True) + def func2(x): + return x[5:] + + self.checkScript(func2, [x], optimize=True) + def test_gather(self): def func(x): return x[0] @@ -2059,17 +2098,6 @@ def foo3(x): canonical(foo3.graph)) def test_list_literal(self): - # Python equivalents for the empty list construction builtins. We need - # these otherwise the tests won't execute in regular Python mode. 
- def _construct_empty_int_list(): - return [] - - def _construct_empty_float_list(): - return [] - - def _construct_empty_tensor_list(): - return [] - def reassign(): x = [1] if True: @@ -2123,6 +2151,100 @@ def reassign_nested(): with self.assertRaisesRegex(RuntimeError, "previously has type"): self.checkScript(reassign_nested, (), optimize=True) + def test_list_gather(self): + def index(): + a = [1, 2, 3] + return a[1] + + self.checkScript(index, ()) + + def negative_index(): + a = [1, 2, 3] + return a[-1] + + self.checkScript(negative_index, ()) + + def bad_index(): + a = [1, 2, 3] + return a[4] + + self.checkScriptRaisesRegex(bad_index, (), IndexError, + "list index out of range") + + def bad_negative_index(): + a = [1, 2, 3] + return a[-5] + + self.checkScriptRaisesRegex(bad_negative_index, (), IndexError, + "list index out of range") + + def test_list_len(self): + def func(): + a = [1, 2, 3] + return len(a) == 3 + + self.checkScript(func, ()) + + def func2(): + a = _construct_empty_tensor_list() + return len(a) == 0 + + self.checkScript(func2, ()) + + def test_list_ops(self): + def test_equality(): + a = [1, 2, 3] + b = [1, 2, 3] + return a == b + + self.checkScript(test_equality, (), optimize=True) + + def test_non_equality(): + a = [1, 2, 3] + b = [3] + return a == b + + self.checkScript(test_non_equality, (), optimize=True) + + def test_list_add(): + a = [1, 2, 3] + b = [2] + c = a + b + return c == [1, 2, 3, 2] + + self.checkScript(test_list_add, (), optimize=True) + + def test_list_add_empty(): + a = [1, 2, 3] + b = _construct_empty_int_list() + c = a + b + return c == [1, 2, 3] + + self.checkScript(test_list_add_empty, (), optimize=True) + + def test_tensor_list_equality(): + t1 = torch.ones([1, 1]) + t2 = torch.ones([1, 1]) + x = [t1, t2] + y = [t2, t1] + return x == y + + self.checkScript(test_tensor_list_equality, (), optimize=True) + + def test_invalid_list_equality(): + t1 = torch.ones([2, 2]) + t2 = torch.ones([2, 2]) + x = [t1, t2] + y = [t2, t1] + # will throw since the tensors have more than one element + return x == y + + self.checkScriptRaisesRegex( + test_invalid_list_equality, + (), + RuntimeError, + "bool value of Tensor") + def test_func_call(self): script = ''' def add(a, b): diff --git a/tools/cpp_build/build_caffe2.sh b/tools/cpp_build/build_caffe2.sh new file mode 100755 index 00000000000000..6a50c14e05523e --- /dev/null +++ b/tools/cpp_build/build_caffe2.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" + +pushd $SCRIPTPATH +source ./build_common.sh + +echo "Building Caffe2" + +mkdir -p $CAFFE2_BUILDPATH +pushd $CAFFE2_BUILDPATH + +cmake -DUSE_CUDA:BOOL=$USE_CUDA \ + -DBUILD_TORCH=ON \ + -DUSE_OPENMP:BOOL=${USE_OPENMP:ON} \ + -DBUILD_CAFFE2=OFF \ + -DBUILD_ATEN=ON \ + -DBUILD_PYTHON=OFF \ + -DBUILD_BINARY=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DONNX_NAMESPACE=$ONNX_NAMESPACE \ + -DCMAKE_BUILD_TYPE:STRING=$BUILD_TYPE \ + -DCMAKE_INSTALL_PREFIX:STRING=$INSTALL_PREFIX \ + -DCMAKE_INSTALL_MESSAGE=NEVER \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ + -G "$GENERATE" \ + $PYTORCHPATH/ +$MAKE -j "$JOBS" install + +popd +popd diff --git a/tools/cpp_build/build_common.sh b/tools/cpp_build/build_common.sh new file mode 100755 index 00000000000000..be9ac2b271743d --- /dev/null +++ b/tools/cpp_build/build_common.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +BUILD_PATH="${1:-$SCRIPTPATH/build}" +INSTALL_PREFIX="$BUILD_PATH/install" +PYTORCHPATH="$SCRIPTPATH/../.." 
+ +USE_CUDA=0 +if [ -x "$(command -v nvcc)" ]; then + USE_CUDA=1 +fi + +CAFFE2_BUILDPATH="$BUILD_PATH/caffe2" +NANOPB_BUILDPATH="$BUILD_PATH/nanopb" + +# Build with Ninja if available. It has much cleaner output. +GENERATE="Unix Makefiles" +MAKE=make +if [ -x "$(command -v ninja)" ]; then + GENERATE=Ninja + MAKE=ninja +fi + +# Code is developed a lot more than released, so default to Debug. +BUILD_TYPE=${BUILD_TYPE:-Debug} + +# Try to build with as many threads as we have cores, default to 4 if the +# command fails. +set +e +if [ -n "$MAX_JOBS" ]; then # Use MAX_JOBS if it is set + JOBS=$MAX_JOBS +elif [[ "$(uname)" == "Linux" ]]; then + # https://stackoverflow.com/questions/6481005/how-to-obtain-the-number-of-cpus-cores-in-linux-from-the-command-line + JOBS="$(grep -c '^processor' /proc/cpuinfo)" +else # if [[ "$(uname)" == "Darwin"]] + # https://stackoverflow.com/questions/1715580/how-to-discover-number-of-logical-cores-on-mac-os-x + JOBS="$(sysctl -n hw.ncpu)" +fi +set -e +if [[ $? -ne 0 ]]; then + JOBS=4 +fi + +# Make sure an ONNX namespace is set +if [ -z "$ONNX_NAMESPACE" ]; then + ONNX_NAMESPACE="onnx_torch" +fi diff --git a/tools/cpp_build/build_libtorch.sh b/tools/cpp_build/build_libtorch.sh new file mode 100755 index 00000000000000..6dd9a589cf1074 --- /dev/null +++ b/tools/cpp_build/build_libtorch.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" + +pushd $SCRIPTPATH +source ./build_common.sh + +echo "Building Torch" + +mkdir -p $LIBTORCH_BUILDPATH +pushd $LIBTORCH_BUILDPATH + +cmake -DUSE_CUDA:BOOL=$USE_CUDA \ + -DNO_API:BOOL=${NO_API:-0} \ + -DCAFFE2_PATH=$PYTORCHPATH/ \ + -DCAFFE2_BUILD_PATH=$CAFFE2_BUILDPATH \ + -DONNX_NAMESPACE=$ONNX_NAMESPACE \ + -DNANOPB_BUILD_PATH=$NANOPB_BUILDPATH \ + -DINSTALL_PREFIX=$INSTALL_PREFIX \ + -DCMAKE_BUILD_TYPE:STRING=$BUILD_TYPE \ + -DCMAKE_INSTALL_PREFIX:STRING=$INSTALL_PREFIX \ + -DCMAKE_INSTALL_MESSAGE=NEVER \ + -Dnanopb_BUILD_GENERATOR:BOOL=OFF \ + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ + -DVERBOSE:BOOL=${VERBOSE:-0} \ + -G "$GENERATE" \ + $PYTORCHPATH/torch +$MAKE -j "$JOBS" + +popd +popd diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index fda98bbd24c14f..058712f6b33db0 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -389,7 +389,6 @@ if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) if (USE_CUDA) target_link_libraries(test_jit ${CUDA_LIBRARIES}) endif() - endif() if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) @@ -438,5 +437,4 @@ if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) -Wno-unused-but-set-parameter) endif() endif() - endif() diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 3362d36f11b6c5..9d371ae95a2d92 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -432,6 +432,7 @@ DEFINE_TO(double, toDouble) DEFINE_TO(int64_t, toInt) DEFINE_TO(Shared, toDoubleList) DEFINE_TO(Shared, toIntList) +DEFINE_TO(Shared, toTensorList) DEFINE_TO(Shared, toString) DEFINE_TO(at::Scalar, toScalar) DEFINE_TO(bool, toInt) diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 59b91cce49067d..6054811a7e3c6d 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -310,7 +310,127 @@ RegisterOperators reg({ }; \ }), +template +Operation listSelect(Node* node) { + return [=](Stack& stack) { + T list; + int64_t idx; + pop(stack, list, idx); + const int64_t list_size = 
list->elements().size(); + if (idx >= list_size) { + throw std::out_of_range("list index out of range"); + } + + if (idx < 0) { + // Handle negative indexing + idx = list_size + idx; + if (idx < 0) { + throw std::out_of_range("list index out of range"); + } + } + + auto element = list->elements().at(idx); + push(stack, std::move(element)); + return 0; + }; +} + +template +Operation listLen(Node* node) { + return [=](Stack& stack) { + T a; + pop(stack, a); + const int64_t size = a->elements().size(); + push(stack, size); + return 0; + }; +} + +template +Operation listEq(Node* node) { + return [=](Stack& stack) { + T a; + T b; + pop(stack, a, b); + if (a->elements() == b->elements()) { + push(stack, 1); + } else { + push(stack, 0); + } + return 0; + }; +} + +// Specialization for at::Tensor, since it doesn't define operator== +template <> +Operation listEq>(Node* node) { + return [=](Stack& stack) { + Shared a; + Shared b; + pop(stack, a, b); + if (a->elements().size() != b->elements().size()) { + push(stack, 0); + return 0; + } + + for (size_t i = 0; i < a->elements().size(); ++i) { + const auto& a_element = a->elements()[i]; + const auto& b_element = b->elements()[i]; + // This preserves Python's semantics, which uses eq() to compare two + // elements, then passes the result to bool(). + // see: https://docs.python.org/3.4/reference/datamodel.html#object.__ge__ + const auto cmp_result = a_element.eq(b_element); + if (!cmp_result.is_nonzero()) { + push(stack, 0); + return 0; + } + } + + push(stack, 1); + return 0; + }; +} + +template +Operation listAdd(Node* node) { + return [=](Stack& stack) { + TList a; + TList b; + pop(stack, a, b); + + std::vector ret; + const auto total_size = a->elements().size() + b->elements().size(); + ret.reserve(total_size); + for (const auto& a_element : a->elements()) { + ret.push_back(a_element); + } + for (const auto& b_element : b->elements()) { + ret.push_back(b_element); + } + + push(stack, ret); + return 0; + }; +} + RegisterOperators reg2({ + Operator("aten::select(int[] a, int b) -> int", listSelect>), + Operator("aten::select(float[] a, int b) -> float", listSelect>), + Operator("aten::select(Tensor[] a, int b) -> Tensor", listSelect>), + + Operator("aten::len(int[] a) -> int", listLen>), + Operator("aten::len(float[] a) -> int", listLen>), + Operator("aten::len(Tensor[] a) -> int", listLen>), + + Operator("aten::eq(int[] a, int[] b) -> int", listEq>), + Operator("aten::eq(float[] a, float[] b) -> int", listEq>), + Operator("aten::eq(Tensor[] a, Tensor[] b) -> int", listEq>), + + Operator("aten::add(int[] a, int[] b) -> int[]", listAdd, int64_t>), + Operator("aten::add(float[] a, float[] b) -> float[]", listAdd, double>), + Operator("aten::add(Tensor[] a, Tensor[] b) -> Tensor[]", listAdd, at::Tensor>), + + DEFINE_BINARY_OP(aten::add, a + b) DEFINE_BINARY_OP(aten::sub, a - b) DEFINE_BINARY_OP(aten::mul, a * b) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 73808fcb5863d1..b34524ae6b644b 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1359,10 +1359,7 @@ struct to_ir { return emitNone(tree->range()); } break; case TK_SLICE: { - const auto slice = Slice(tree); - return emitSlice( - slice.range(), - {slice.value(), slice.startOr(0), slice.endOr(-1)}); + return emitSlice(Slice(tree)); } break; case TK_GATHER: { const auto gather = Gather(tree); @@ -1380,7 +1377,8 @@ struct to_ir { auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); if (values.size() 
== 0) { throw ErrorReport(tree) << "Empty list literals not allowed. " - << "Use _constructEmptyFooList() instead"; + << "Use _construct_empty_foo_list() instead. " + << "`foo` can be `int`, `float` or `tensor`"; } const auto elem_type = values.at(0)->type(); for (auto v : values) { @@ -1424,28 +1422,33 @@ struct to_ir { // Desugars slice syntactic sugar tensor[begin:end] -> tensor.slice(begin, // end). - Value* emitSlice( - const SourceRange& loc, - TreeList&& inputs) { - const auto applyInputs = - Compound::create(TK_LIST, loc, std::move(inputs)); - const auto input_values = getNamedValues(applyInputs->trees(), - /*maybe_unpack*/false, - identity); + Value* emitSlice(const Slice& slice) { + const auto& loc = slice.range(); + TreeList inputs = {slice.value(), slice.startOr(0)}; + const auto applyInputs = Compound::create(TK_LIST, loc, std::move(inputs)); + const auto input_values = getNamedValues( + applyInputs->trees(), + /*maybe_unpack*/ false, + identity); + NamedValue tensor = input_values[0]; NamedValue begin = input_values[1]; - NamedValue end = input_values[2]; - NamedValue dim = NamedValue(loc, "dim", - graph->insertConstant(0, loc)); - NamedValue step = NamedValue(loc, "step", - graph->insertConstant(1, loc)); - - return emitBuiltinCall( - loc, method, "slice", {tensor, dim, begin, end, step}, {}, true) + NamedValue dim = NamedValue(loc, "dim", graph->insertConstant(0, loc)); + NamedValue step = NamedValue(loc, "step", graph->insertConstant(1, loc)); + + std::vector args = {tensor, dim, begin}; + const auto has_end = slice.end().present(); + if (has_end) { + // If the user specified an `end` index, pass it down + args.emplace_back(loc, "end", emitExpr(Expr(slice.end().get()), identity)); + } + + // Otherwise rely on the schema default argument + return emitBuiltinCall(loc, method, "slice", args, {step}, true) ->asValue(loc, method); } - // Desugars gather syntactic sugar tensor[idx] -> tensor.select(idx). + // Desugars gather syntactic sugar foo[i] Value* emitGather( const SourceRange& loc, TreeList&& inputs) { @@ -1454,15 +1457,21 @@ struct to_ir { auto input_values = getNamedValues(applyInputs->trees(), /*maybe_unpack*/false, identity); - NamedValue tensor = input_values[0]; - NamedValue dim = NamedValue( - loc, - "dim", - graph->insertConstant(0, loc)); + NamedValue gatherable = input_values[0]; NamedValue idx = input_values[1]; - - return emitBuiltinCall(loc, method, "select", {tensor, dim, idx}, {}, true) - ->asValue(loc, method); + if (gatherable.value->type()->kind() == TypeKind::ListType) { + // if it's a list, emit a regular index selection op + return emitBuiltinCall( + loc, method, "select", {gatherable, idx}, {}, true) + ->asValue(loc, method); + + } else { + // if it's a single tensor, map tensor[idx] -> tensor.select(0, idx) + NamedValue dim = NamedValue(loc, "dim", graph->insertConstant(0, loc)); + return emitBuiltinCall( + loc, method, "select", {gatherable, dim, idx}, {}, true) + ->asValue(loc, method); + } } };
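
A few self-contained sketches of the patterns this patch introduces follow, for trying them outside the tree.

The promotion table that moved into ATen/core/ScalarType.h is indexed in AT_FORALL_SCALAR_TYPES order (u1 i1 i2 i4 i8 f2 f4 f8, plus Undefined) and mirrors NumPy's promote_types. A usage sketch, assuming an ATen build that ships the header added by this patch (the operator<< overload it defines prints the enum name):

#include <ATen/core/ScalarType.h>
#include <iostream>

int main() {
  // Row/column order is the AT_FORALL_SCALAR_TYPES order above.
  std::cout << at::promoteTypes(at::kByte, at::kChar) << "\n"; // Short: smallest type holding both ranges
  std::cout << at::promoteTypes(at::kInt, at::kFloat) << "\n"; // Float
  std::cout << at::promoteTypes(at::kHalf, at::kInt) << "\n";  // Double, matching NumPy's promote_types
  return 0;
}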
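
The new TensorTypeIdRegistration machinery pairs a Meyers singleton (TensorTypeIds::singleton()) with a function-local RAII registrar, so each AT_DEFINE_TENSOR_TYPE'd name claims a unique id on first call and releases it at process exit. A minimal standard-library-only sketch of the same pattern; every name here (MiniTypeId, MiniRegistry, MINI_DEFINE_TENSOR_TYPE) is hypothetical:

#include <atomic>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <unordered_set>

using MiniTypeId = std::uint8_t;

class MiniRegistry {
 public:
  static MiniRegistry& singleton() {
    static MiniRegistry instance;  // Meyers singleton, like TensorTypeIds
    return instance;
  }
  MiniTypeId createAndRegister() {
    MiniTypeId id = ++last_id_;  // the real creator reserves 0 for undefined()
    if (id == 0) {               // wrapped around: id space exhausted
      throw std::logic_error("too many type ids");
    }
    std::lock_guard<std::mutex> lock(mutex_);
    ids_.insert(id);
    return id;
  }
  void deregister(MiniTypeId id) {
    std::lock_guard<std::mutex> lock(mutex_);
    ids_.erase(id);
  }

 private:
  MiniRegistry() = default;
  // What DISABLE_COPY_AND_ASSIGN expands to:
  MiniRegistry(const MiniRegistry&) = delete;
  MiniRegistry& operator=(const MiniRegistry&) = delete;

  std::atomic<MiniTypeId> last_id_{0};
  std::unordered_set<MiniTypeId> ids_;
  std::mutex mutex_;
};

// The function-local static registers exactly once, on first call,
// mirroring AT_DEFINE_TENSOR_TYPE's registration_raii.
#define MINI_DEFINE_TENSOR_TYPE(TensorName)                          \
  MiniTypeId TensorName() {                                          \
    static struct Raii {                                             \
      MiniTypeId id = MiniRegistry::singleton().createAndRegister(); \
      ~Raii() { MiniRegistry::singleton().deregister(id); }          \
    } raii;                                                          \
    return raii.id;                                                  \
  }

MINI_DEFINE_TENSOR_TYPE(MyCPUTensorId)
MINI_DEFINE_TENSOR_TYPE(MyCUDATensorId)

int main() {
  std::cout << int(MyCPUTensorId()) << " " << int(MyCUDATensorId()) << "\n";  // 1 2
  std::cout << int(MyCPUTensorId()) << "\n";  // still 1: ids are stable per name
  return 0;
}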
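
listSelect in register_prim_ops.cpp gives script lists Python indexing semantics: negative indices count back from the end, and anything still out of range throws, which is exactly what test_list_gather exercises. The same bounds logic, restated as a standalone sketch:

#include <iostream>
#include <stdexcept>
#include <vector>

int64_t select(const std::vector<int64_t>& list, int64_t idx) {
  const int64_t list_size = static_cast<int64_t>(list.size());
  if (idx >= list_size) {
    throw std::out_of_range("list index out of range");
  }
  if (idx < 0) {
    idx += list_size;  // e.g. a[-1] is the last element
    if (idx < 0) {
      throw std::out_of_range("list index out of range");
    }
  }
  return list.at(idx);
}

int main() {
  std::vector<int64_t> a{1, 2, 3};
  std::cout << select(a, 1) << " " << select(a, -1) << "\n";  // 2 3
  try {
    select(a, -5);  // mirrors test_list_gather's bad_negative_index case
  } catch (const std::out_of_range& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}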
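
listEq is specialized for tensor lists because at::Tensor defines no operator==; each pair is compared with eq() and the mask reduced with is_nonzero(), which throws the "bool value of Tensor" error test_invalid_list_equality expects whenever the mask holds more than one element. A sketch of that comparison, assuming an ATen build:

#include <ATen/ATen.h>
#include <vector>

bool tensor_lists_equal(const std::vector<at::Tensor>& a,
                        const std::vector<at::Tensor>& b) {
  if (a.size() != b.size()) {
    return false;
  }
  for (size_t i = 0; i < a.size(); ++i) {
    // eq() produces an element-wise mask; is_nonzero() turns a
    // one-element mask into a bool and throws for anything larger.
    if (!a[i].eq(b[i]).is_nonzero()) {
      return false;
    }
  }
  return true;
}

int main() {
  auto t = at::ones({1, 1});
  std::vector<at::Tensor> x{t, t}, y{t, t};
  // true: single-element masks reduce cleanly, as in test_tensor_list_equality
  return tensor_lists_equal(x, y) ? 0 : 1;
}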
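
Finally, the emitSlice rewrite forwards `end` only when the user wrote one and otherwise leans on aten::slice's schema default; the new func2 test (x[5:]) passes because the default sentinel is clamped to the dimension size, whereas the previously hard-coded end=-1 wraps around and excludes the last element. In eager ATen terms the lowering is roughly as follows (a sketch, not the emitted IR):

#include <ATen/ATen.h>

int main() {
  auto x = at::ones({10});
  // x[5:] with an explicit end would be x.slice(0, 5, end); with the
  // end omitted, the schema default keeps the last element, where the
  // old end=-1 would have wrapped to size-1 and dropped it.
  auto y = x.slice(/*dim=*/0, /*start=*/5);
  return y.size(0) == 5 ? 0 : 1;
}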