
Merge from upstream #218


Merged 48 commits on Sep 21, 2018.

Commits:
3da8d71  remove protobuf inclusion in core/logging.h (#11814)  (Yangqing, Sep 19, 2018)
1c86860  Expunge (transitive) caffe2_pb2 dependency from tensor_impl.h from co…  (ezyang, Sep 19, 2018)
a79f5d7  Add pretty printer for JIT IR (#10319)  (Sep 19, 2018)
1f34be4  Raise error when perf test result is NaN (#11588)  (Sep 19, 2018)
2c358ea  Caffe2: add plan name to logging (#11704)  (shen-pan, Sep 19, 2018)
e80d1d2  Revert D9924348: Expunge (transitive) caffe2_pb2 dependency from tens…  (Sep 19, 2018)
8aedc27  checking device types of input and weights at RNN (#10185)  (weiyangfb, Sep 19, 2018)
a26ad5a  Remove unnecessary check on device option pointer (#11845)  (Sep 19, 2018)
53b5f14  Remove inclusion of caffe2 pb (#11820)  (Yangqing, Sep 19, 2018)
77af40c  prioritize Accelerate over OpenBLAS (#11812)  (soumith, Sep 19, 2018)
b46f1b8  Open-source ThreadSafeActivationCleaningPredictor (#11779)  (salexspb, Sep 19, 2018)
8601b33  fix half grad assignment (#11781)  (Sep 19, 2018)
32494c2  OperatorDef <==> NodeProto Conversion (#11621)  (houseroad, Sep 19, 2018)
b3a2665  Code-reorg to have TORCH_ARG in its own header (#11787)  (goldsborough, Sep 19, 2018)
8c3a94e  Improve autograd profiler performance (#11773)  (apaszke, Sep 19, 2018)
fa32317  Add empty tensor tests to test_sparse (#11228)  (Sep 19, 2018)
5247250  Add env:// rendezvous test (#11782)  (pietern, Sep 19, 2018)
3b1a5a1  Refactor tests part 2 (#11811)  (ajyu, Sep 19, 2018)
ce55767  Add the missing header (#11864)  (houseroad, Sep 19, 2018)
c307907  Minor data loader doc improvements  (ssnl, Sep 19, 2018)
cf5a21e  Add back proto opt disable feature that was lost during refactor (#11…  (bwasti, Sep 19, 2018)
24e958a  Move bernoulli into ATen (#10273)  (ssnl, Sep 19, 2018)
cedd12d  Explicitly qualify references to CPU. (#11819)  (ezyang, Sep 19, 2018)
b06e35b  Back out "Revert D9924348: Expunge (transitive) caffe2_pb2 dependency…  (ezyang, Sep 19, 2018)
f4d2503  Fix Array.h when compiled with C++17 (#11816)  (ezyang, Sep 19, 2018)
6302e40  Delete unnecessary include from allocator.cc/event_cpu.h  (ezyang, Sep 19, 2018)
ae1a972  Fix #11752: correct numerical issue with log_softmax (#11866)  (sytrus-in-github, Sep 20, 2018)
6831d64  Fix the symbolic for embedding_bag in ONNX_ATEN_FALLBACK (#11840)  (houseroad, Sep 20, 2018)
1091c5e  Throw error on indexing a 0 dim tensor (#11679)  (Sep 20, 2018)
c22dcc2  Show build output in verbose mode of C++ extensions (#11724)  (goldsborough, Sep 20, 2018)
aa8cd73  Enable build_test on windows (#11802)  (mingzhe09088, Sep 20, 2018)
c64331f  Add test for verifying combine_spatial_bn values in DPM (#11710)  (Sep 20, 2018)
83740ea  Avoid using PyThreadState.frame as it is not a public member. (#11855)  (xuhdev, Sep 20, 2018)
23dd5b4  Back out "Open-source ThreadSafeActivationCleaningPredictor"  (salexspb, Sep 20, 2018)
8f4601f  renable test_scalar_fusion  (zou3519, Sep 20, 2018)
1c77f9e  Support torch.distributed.barrier in gloo backend  (pietern, Sep 20, 2018)
0927386  Workaround CUDA logging on some embedded platforms (#11851)  (soumith, Sep 20, 2018)
8770128  fix link to previous versions (#11894)  (soumith, Sep 20, 2018)
9cd0ae5  Remove deprecated factory functions from Type.  (ezyang, Sep 20, 2018)
24ec813  Defer lazyInitCUDA() until needed (#11893)  (pietern, Sep 20, 2018)
d8f6be6  Remove torch/legacy (#11823)  (cpuhrsch, Sep 20, 2018)
068eac2  Jit fuse clamp (#11574)  (t-vi, Sep 20, 2018)
6100c0e  Introduce ExtensionVersioner for C++ extensions (#11725)  (goldsborough, Sep 20, 2018)
b91b15d  Implementing Matrix Norm for torch.norm (#11261)  (yya007, Sep 20, 2018)
c7751f4  MIOpen bug fixes and performance enhancements (#11766)  (Sep 20, 2018)
4f7cf5c  Merge remote-tracking branch 'rocm_upstream/upstream' into ifu  (iotamudelta, Sep 20, 2018)
0655821  Skip as it fails on CI.  (iotamudelta, Sep 21, 2018)
613dacb  Skip failing test.  (iotamudelta, Sep 21, 2018)
2 changes: 1 addition & 1 deletion .jenkins/pytorch/macos-test.sh
@@ -15,7 +15,7 @@ if [ ! -d "${PYTORCH_ENV_DIR}/miniconda3" ]; then
 fi
 export PATH="${PYTORCH_ENV_DIR}/miniconda3/bin:$PATH"
 source ${PYTORCH_ENV_DIR}/miniconda3/bin/activate
-conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja
+conda install -y mkl mkl-include numpy pyyaml setuptools cmake cffi ninja future six
 if [ -z "${IN_CIRCLECI}" ]; then
   rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch*
 fi
16 changes: 14 additions & 2 deletions .jenkins/pytorch/perf_test/compare_with_baseline.py
@@ -1,5 +1,6 @@
 import sys
 import json
+import math
 import numpy
 import argparse

@@ -35,14 +36,25 @@
 print("population mean: ", mean)
 print("population sigma: ", sigma)

+# Let the test pass if baseline number is NaN (which happened in
+# the past when we didn't have logic for catching NaN numbers)
+if math.isnan(mean) or math.isnan(sigma):
+    mean = sys.maxsize
+    sigma = 0.001
+
 sample_stats_data = json.loads(args.sample_stats)

-sample_mean = sample_stats_data['mean']
-sample_sigma = sample_stats_data['sigma']
+sample_mean = float(sample_stats_data['mean'])
+sample_sigma = float(sample_stats_data['sigma'])

 print("sample mean: ", sample_mean)
 print("sample sigma: ", sample_sigma)

+if math.isnan(sample_mean):
+    raise Exception('''Error: sample mean is NaN''')
+elif math.isnan(sample_sigma):
+    raise Exception('''Error: sample sigma is NaN''')
+
 z_value = (sample_mean - mean) / sigma

 print("z-value: ", z_value)
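Note on the guard above: a NaN baseline is replaced by an effectively infinite mean (sys.maxsize) with a tiny sigma, so any finite sample passes, while a NaN sample aborts the run outright. A minimal standalone sketch of the same z-score guard, written in C++ for consistency with the other examples in this section (names and output are illustrative, not taken from the script):

#include <cmath>
#include <cstdint>
#include <iostream>
#include <stdexcept>

// Z-score of a sample against a baseline, replicating the script's NaN
// handling: NaN baseline -> trivially passes, NaN sample -> hard error.
double guarded_z(double mean, double sigma, double sample_mean) {
  if (std::isnan(mean) || std::isnan(sigma)) {
    // NaN baseline: substitute a huge mean so any finite sample passes.
    mean = static_cast<double>(INT64_MAX);
    sigma = 0.001;
  }
  if (std::isnan(sample_mean)) {
    throw std::runtime_error("Error: sample mean is NaN");
  }
  return (sample_mean - mean) / sigma;
}

int main() {
  std::cout << guarded_z(10.0, 0.5, 11.0) << "\n";          // 2.0
  std::cout << guarded_z(std::nan(""), 0.5, 11.0) << "\n";  // huge negative, passes
}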
3 changes: 3 additions & 0 deletions .jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh
@@ -20,6 +20,9 @@ test_gpu_speed_mnist () {
   SAMPLE_ARRAY=()
   NUM_RUNS=$1

+  # Needs warm up to get accurate number
+  python main.py --epochs 1 --no-log
+
   for (( i=1; i<=$NUM_RUNS; i++ )) do
     runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)
     echo $runtime
2 changes: 1 addition & 1 deletion .jenkins/pytorch/win-test.sh
@@ -45,7 +45,7 @@ curl https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe -O
 call C:\\Jenkins\\Miniconda3\\Scripts\\activate.bat C:\\Jenkins\\Miniconda3
 call conda install -y -q numpy mkl cffi pyyaml boto3

-pip install ninja
+pip install ninja future

 call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\VC\\Auxiliary\\Build\\vcvarsall.bat" x86_amd64

2 changes: 1 addition & 1 deletion README.md
@@ -239,7 +239,7 @@ You can then build the documentation by running ``make <format>`` from the
 ### Previous Versions

 Installation instructions and binaries for previous PyTorch versions may be found
-on [our website](http://pytorch.org/previous-versions/).
+on [our website](http://pytorch.org/previous-versions).


 ## Getting Started
4 changes: 2 additions & 2 deletions aten/src/ATen/CPUApplyUtils.h
@@ -207,7 +207,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
   for (size_t i = 0; i < tensors.size() - 1; i++) {
     oss << tensors[i].sizes() << ", ";
   }
-  oss << "and " << tensors[tensors.size() - 1]
+  oss << "and " << tensors[tensors.size() - 1].sizes()
       << " to have the same number of elements, but got ";
   for (size_t i = 0; i < tensors.size() - 1; i++) {
     oss << tensors[i].numel() << ", ";
@@ -220,7 +220,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
 inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
   checkBackend("CPU_tensor_apply", tensors, Backend::CPU);
   if (!_all_equal_numel(tensors))
-    throw std::runtime_error(_all_equal_numel_error(tensors));
+    AT_ERROR(_all_equal_numel_error(tensors));
   // An empty tensor has no elements
   for (auto& t : tensors)
     if (t.numel() == 0)
32 changes: 0 additions & 32 deletions aten/src/ATen/Declarations.cwrap
@@ -3218,38 +3218,6 @@
       kwarg_only: True
     - double p
 ]]
-[[
-  name: _bernoulli_
-  backends:
-    - CPU
-    - CUDA
-  cname: bernoulli
-  return: self
-  variants: function
-  arguments:
-    - THTensor* self
-    - arg: THGenerator* generator
-      default: nullptr
-      kwarg_only: True
-    - double p
-]]
-[[
-  name: _th_bernoulli
-  types:
-    - Float
-    - Double
-  return: argument 0
-  variants: function
-  cname: bernoulli_Tensor
-  arguments:
-    - arg: THTensor* output
-      output: True
-      resize: self
-    - arg: THGenerator* generator
-      default: nullptr
-      kwarg_only: True
-    - THTensor* self
-]]
 [[
   name: _dirichlet_grad
   types:
9 changes: 9 additions & 0 deletions aten/src/ATen/core/DeviceType.h
@@ -8,6 +8,7 @@
 #include <ATen/core/Macros.h>

 #include <ostream>
+#include <functional>

 namespace at {

@@ -32,3 +33,11 @@ AT_CORE_API std::string DeviceTypeName(
 AT_CORE_API std::ostream& operator<<(std::ostream& stream, at::DeviceType type);

 } // namespace at
+
+namespace std {
+template <> struct hash<at::DeviceType> {
+  std::size_t operator()(const at::DeviceType &k) const {
+    return std::hash<int>()(static_cast<int>(k));
+  }
+};
+} // namespace std
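This std::hash specialization is what lets at::DeviceType serve directly as an unordered_map key; the static-context registry added later in this PR relies on it. A self-contained sketch of the same pattern, using a stand-in enum rather than the real ATen header:

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Stand-in for at::DeviceType; the real enum lives in ATen/core/DeviceType.h.
enum class DeviceType : int { CPU = 0, CUDA = 1 };

namespace std {
template <> struct hash<DeviceType> {
  std::size_t operator()(const DeviceType& k) const {
    // Hash the underlying integer value, exactly as the PR does.
    return std::hash<int>()(static_cast<int>(k));
  }
};
} // namespace std

int main() {
  // With the specialization in place, the enum works as a hash-map key.
  std::unordered_map<DeviceType, std::string> names;
  names[DeviceType::CPU] = "cpu";
  names[DeviceType::CUDA] = "cuda";
  std::cout << names[DeviceType::CPU] << "\n";  // prints "cpu"
}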
8 changes: 3 additions & 5 deletions aten/src/ATen/core/Tensor.h
@@ -441,12 +441,10 @@ struct AT_API Tensor {
   Tensor & atan_();
   Tensor baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const;
   Tensor & baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
-  Tensor bernoulli(const Tensor & p, Generator * generator=nullptr) const;
-  Tensor bernoulli(double p, Generator * generator=nullptr) const;
-  Tensor bernoulli() const;
+  Tensor bernoulli(Generator * generator=nullptr) const;
   Tensor & bernoulli_(const Tensor & p, Generator * generator=nullptr);
-  Tensor & bernoulli_(double p, Generator * generator=nullptr);
-  Tensor & bernoulli_();
+  Tensor & bernoulli_(double p=0.5, Generator * generator=nullptr);
+  Tensor bernoulli(double p, Generator * generator=nullptr) const;
   Tensor bincount(const Tensor & weights={}, int64_t minlength=0) const;
   Tensor bmm(const Tensor & mat2) const;
   Tensor ceil() const;
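With this change the no-argument overload folds into bernoulli(Generator*), and the in-place double overload gains a default p=0.5. A hedged sketch of call sites under the reworked API (assumes a built ATen; dtype and device defaults are the library's):

#include <ATen/ATen.h>

int main() {
  at::Tensor probs = at::rand({3, 3});  // entries in [0, 1)
  // Sample using the tensor's own values as per-element probabilities.
  at::Tensor draws = probs.bernoulli();
  // In-place fill with fair coin flips: p now defaults to 0.5.
  at::Tensor coins = at::empty({3, 3});
  coins.bernoulli_();
  // Explicit probability is still available.
  coins.bernoulli_(0.25);
  return 0;
}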
14 changes: 4 additions & 10 deletions aten/src/ATen/core/TensorMethods.h
@@ -605,23 +605,17 @@ inline Tensor Tensor::baddbmm(const Tensor & batch1, const Tensor & batch2, Scal
 inline Tensor & Tensor::baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) {
   return type().baddbmm_(*this, batch1, batch2, beta, alpha);
 }
-inline Tensor Tensor::bernoulli(const Tensor & p, Generator * generator) const {
-  return type().bernoulli(*this, p, generator);
-}
-inline Tensor Tensor::bernoulli(double p, Generator * generator) const {
-  return type().bernoulli(*this, p, generator);
-}
-inline Tensor Tensor::bernoulli() const {
-  return type().bernoulli(*this);
+inline Tensor Tensor::bernoulli(Generator * generator) const {
+  return type().bernoulli(*this, generator);
 }
 inline Tensor & Tensor::bernoulli_(const Tensor & p, Generator * generator) {
   return type().bernoulli_(*this, p, generator);
 }
 inline Tensor & Tensor::bernoulli_(double p, Generator * generator) {
   return type().bernoulli_(*this, p, generator);
 }
-inline Tensor & Tensor::bernoulli_() {
-  return type().bernoulli_(*this);
+inline Tensor Tensor::bernoulli(double p, Generator * generator) const {
+  return type().bernoulli(*this, p, generator);
 }
 inline Tensor Tensor::bincount(const Tensor & weights, int64_t minlength) const {
   return type().bincount(*this, weights, minlength);
21 changes: 2 additions & 19 deletions aten/src/ATen/core/Type.h
@@ -381,8 +381,6 @@ struct AT_API Type {
   virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim) const = 0;
   virtual bool allclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0;
   virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim) const = 0;
-  AT_DEPRECATED(virtual Tensor arange(Scalar start, Scalar end, Scalar step) const = 0);
-  AT_DEPRECATED(virtual Tensor arange(Scalar end) const = 0);
   virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim) const = 0;
   virtual Tensor argmax(const Tensor & self) const = 0;
   virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim) const = 0;
@@ -397,12 +395,10 @@
   virtual Tensor & atan_(Tensor & self) const = 0;
   virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0;
   virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0;
-  virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator) const = 0;
-  virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator) const = 0;
-  virtual Tensor bernoulli(const Tensor & self) const = 0;
+  virtual Tensor bernoulli(const Tensor & self, Generator * generator) const = 0;
   virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator) const = 0;
   virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator) const = 0;
-  virtual Tensor & bernoulli_(Tensor & self) const = 0;
+  virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator) const = 0;
   virtual Tensor bincount(const Tensor & self, const Tensor & weights, int64_t minlength) const = 0;
   virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const = 0;
   virtual Tensor ceil(const Tensor & self) const = 0;
@@ -430,7 +426,6 @@
   virtual Tensor div(const Tensor & self, Scalar other) const = 0;
   virtual Tensor & div_(Tensor & self, Scalar other) const = 0;
   virtual Tensor dot(const Tensor & self, const Tensor & tensor) const = 0;
-  AT_DEPRECATED(virtual Tensor empty(IntList size) const = 0);
   virtual Tensor erf(const Tensor & self) const = 0;
   virtual Tensor & erf_(Tensor & self) const = 0;
   virtual Tensor erfc(const Tensor & self) const = 0;
@@ -441,13 +436,11 @@
   virtual Tensor & expm1_(Tensor & self) const = 0;
   virtual Tensor expand(const Tensor & self, IntList size, bool implicit) const = 0;
   virtual Tensor expand_as(const Tensor & self, const Tensor & other) const = 0;
-  AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m) const = 0);
   virtual Tensor flatten(const Tensor & self, int64_t start_dim, int64_t end_dim) const = 0;
   virtual Tensor & fill_(Tensor & self, Scalar value) const = 0;
   virtual Tensor & fill_(Tensor & self, const Tensor & value) const = 0;
   virtual Tensor floor(const Tensor & self) const = 0;
   virtual Tensor & floor_(Tensor & self) const = 0;
-  AT_DEPRECATED(virtual Tensor full(IntList size, Scalar fill_value) const = 0);
   virtual Tensor ger(const Tensor & self, const Tensor & vec2) const = 0;
   virtual std::tuple<Tensor,Tensor> gesv(const Tensor & self, const Tensor & A) const = 0;
   virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0;
@@ -469,7 +462,6 @@
   virtual bool is_signed(const Tensor & self) const = 0;
   virtual bool is_sparse(const Tensor & self) const = 0;
   virtual std::tuple<Tensor,Tensor> kthvalue(const Tensor & self, int64_t k, int64_t dim, bool keepdim) const = 0;
-  AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, int64_t steps) const = 0);
   virtual Tensor log(const Tensor & self) const = 0;
   virtual Tensor & log_(Tensor & self) const = 0;
   virtual Tensor log10(const Tensor & self) const = 0;
@@ -479,7 +471,6 @@
   virtual Tensor log2(const Tensor & self) const = 0;
   virtual Tensor & log2_(Tensor & self) const = 0;
   virtual Tensor logdet(const Tensor & self) const = 0;
-  AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps) const = 0);
   virtual Tensor log_softmax(const Tensor & self, int64_t dim) const = 0;
   virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim) const = 0;
   virtual Tensor matmul(const Tensor & self, const Tensor & other) const = 0;
@@ -504,16 +495,9 @@
   virtual Tensor mvlgamma(const Tensor & self, int64_t p) const = 0;
   virtual Tensor & mvlgamma_(Tensor & self, int64_t p) const = 0;
   virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0;
-  AT_DEPRECATED(virtual Tensor ones(IntList size) const = 0);
   virtual Tensor permute(const Tensor & self, IntList dims) const = 0;
   virtual Tensor pin_memory(const Tensor & self) const = 0;
   virtual Tensor pinverse(const Tensor & self, double rcond) const = 0;
-  AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator) const = 0);
-  AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator) const = 0);
-  AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator) const = 0);
-  AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator) const = 0);
-  AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * generator) const = 0);
-  AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step) const = 0);
   virtual Tensor repeat(const Tensor & self, IntList repeats) const = 0;
   virtual Tensor reshape(const Tensor & self, IntList shape) const = 0;
   virtual Tensor reshape_as(const Tensor & self, const Tensor & other) const = 0;
@@ -581,7 +565,6 @@
   virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0;
   virtual Tensor view_as(const Tensor & self, const Tensor & other) const = 0;
   virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const = 0;
-  AT_DEPRECATED(virtual Tensor zeros(IntList size) const = 0);
   virtual Tensor norm(const Tensor & self, Scalar p) const = 0;
   virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim) const = 0;
   virtual Tensor clone(const Tensor & self) const = 0;
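The removed AT_DEPRECATED factories on Type correspond to the free factory functions in the at:: namespace, which remain the supported spelling. A brief hedged sketch of the replacement call sites (assumes a built ATen of this era; defaults are the library's):

#include <ATen/ATen.h>

int main() {
  // Free functions replace the removed Type::zeros/ones/arange/etc.
  at::Tensor z = at::zeros({2, 3});
  at::Tensor o = at::ones({2, 3});
  at::Tensor a = at::arange(10);
  return 0;
}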
22 changes: 22 additions & 0 deletions aten/src/ATen/core/context_base.cpp
@@ -0,0 +1,22 @@
+#include <ATen/core/context_base.h>
+
+namespace caffe2 {
+
+// TODO: rename context.h -> context_cpu.h & context_base.h -> context.h
+StaticContextMap& GetStaticContexts() {
+  static StaticContextMap static_contexts;
+  return static_contexts;
+}
+
+void set_static_context(at::DeviceType t, BaseStaticContext* ptr) {
+  auto& static_contexts = GetStaticContexts();
+  static_contexts[t] = ptr;
+}
+
+BaseStaticContext* get_static_context(at::DeviceType t) {
+  auto* ptr = GetStaticContexts()[t];
+  AT_ASSERTM(ptr, "StaticContext for ", t, " is not registered yet.");
+  return ptr;
+}
+
+} // namespace caffe2
25 changes: 25 additions & 0 deletions aten/src/ATen/core/context_base.h
@@ -10,6 +10,7 @@
 #include <ATen/core/Error.h>
 #include <ATen/core/UniqueVoidPtr.h>
 #include <ATen/core/typeid.h>
+#include <ATen/core/ATenGeneral.h>

 namespace caffe2 {
 class Event;
@@ -184,3 +185,27 @@ class AT_CORE_API BaseContext {
 };

 } // namespace at
+
+namespace caffe2 {
+
+using at::BaseContext;
+using at::BaseStaticContext;
+
+using StaticContextMap = std::unordered_map<at::DeviceType, BaseStaticContext*>;
+AT_API StaticContextMap& GetStaticContexts();
+AT_API void set_static_context(at::DeviceType t, BaseStaticContext* ptr);
+AT_API BaseStaticContext* get_static_context(at::DeviceType t);
+
+template <at::DeviceType t>
+struct StaticContextFunctionRegisterer {
+  explicit StaticContextFunctionRegisterer(BaseStaticContext* ptr) {
+    set_static_context(t, ptr);
+  }
+};
+
+#define REGISTER_STATIC_CONTEXT(t, f) \
+  namespace {                         \
+  static StaticContextFunctionRegisterer<t> g_static_context_##d(f); \
+  }
+
+} // namespace caffe2
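GetStaticContexts() hands out a function-local static map, and REGISTER_STATIC_CONTEXT expands to a global registerer object whose constructor fills the slot for one device type at static-initialization time (the PR's macro builds the variable name via token pasting). A self-contained sketch of the pattern, with all names as stand-ins for illustration; the real interfaces live in ATen/core/context_base.h. It also leans on the std::hash specialization shown earlier:

#include <cassert>
#include <cstddef>
#include <functional>
#include <unordered_map>

enum class DeviceType : int { CPU = 0, CUDA = 1 };

namespace std {
template <> struct hash<DeviceType> {
  std::size_t operator()(const DeviceType& k) const {
    return std::hash<int>()(static_cast<int>(k));
  }
};
} // namespace std

struct BaseStaticContext { virtual ~BaseStaticContext() = default; };

using StaticContextMap = std::unordered_map<DeviceType, BaseStaticContext*>;

StaticContextMap& GetStaticContexts() {
  static StaticContextMap m;  // function-local static avoids init-order issues
  return m;
}

void set_static_context(DeviceType t, BaseStaticContext* p) {
  GetStaticContexts()[t] = p;
}

// Constructor side effect performs the registration at load time.
template <DeviceType t>
struct StaticContextFunctionRegisterer {
  explicit StaticContextFunctionRegisterer(BaseStaticContext* p) {
    set_static_context(t, p);
  }
};

#define REGISTER_STATIC_CONTEXT(t, f) \
  static StaticContextFunctionRegisterer<t> g_static_context(f);

struct CPUStaticContext : BaseStaticContext {};
static CPUStaticContext g_cpu_ctx;
REGISTER_STATIC_CONTEXT(DeviceType::CPU, &g_cpu_ctx)

int main() {
  // By the time main runs, the CPU slot is already populated.
  assert(GetStaticContexts()[DeviceType::CPU] == &g_cpu_ctx);
}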
10 changes: 10 additions & 0 deletions aten/src/ATen/cpu/vec256/vec256_base.h
@@ -438,4 +438,14 @@ interleave2(const Vec256<T>& a, const Vec256<T>& b) {
       Vec256<T>::loadu(static_cast<void*>(buffer2)));
 }

+template <typename src_T, typename dst_T>
+void convert(const src_T *src, dst_T *dst, int64_t n) {
+#pragma unroll
+  for (int64_t i = 0; i < n; i++) {
+    *dst = static_cast<dst_T>(*src);
+    src++;
+    dst++;
+  }
+}
+
 }}}
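This is the scalar fallback: a plain element-wise static_cast that specific type pairs can shadow with SIMD specializations, as vec256_int.h does below. A minimal usage sketch of the generic template, copied here for illustration (in ATen it lives in the vec256 headers):

#include <cstdint>
#include <iostream>

// Generic fallback: element-wise cast from src_T to dst_T.
template <typename src_T, typename dst_T>
void convert(const src_T* src, dst_T* dst, int64_t n) {
  for (int64_t i = 0; i < n; i++) {
    *dst++ = static_cast<dst_T>(*src++);
  }
}

int main() {
  int32_t src[4] = {1, 2, 3, 4};
  float dst[4];
  convert(src, dst, 4);         // element-wise static_cast
  std::cout << dst[2] << "\n";  // prints 3
}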
32 changes: 32 additions & 0 deletions aten/src/ATen/cpu/vec256/vec256_int.h
@@ -208,6 +208,38 @@ struct Vec256<int32_t> : public Vec256i {
   }
 };

+template <>
+void convert(const int32_t *src, float *dst, int64_t n) {
+  int64_t i;
+  // int32_t and float have same size
+#pragma unroll
+  for (i = 0; i <= (n - Vec256<int32_t>::size); i += Vec256<int32_t>::size) {
+    auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
+    auto output_vec = _mm256_cvtepi32_ps(input_vec);
+    _mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec);
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = static_cast<float>(src[i]);
+  }
+}
+
+template <>
+void convert(const int32_t *src, double *dst, int64_t n) {
+  int64_t i;
+  // int32_t has half the size of double
+#pragma unroll
+  for (i = 0; i <= (n - Vec256<double>::size); i += Vec256<double>::size) {
+    auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
+    auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
+    _mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec);
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = static_cast<double>(src[i]);
+  }
+}
+
 template <>
 struct Vec256<int16_t> : public Vec256i {
   static constexpr int size = 16;
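The float specialization converts eight int32 lanes per iteration with AVX2 and leaves the remainder to the scalar tail loop; the double variant reads 128-bit lanes because each int32 widens to a 64-bit double. A standalone illustration of the core intrinsics (assumes AVX2 hardware; compile with -mavx2):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(32) int32_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  alignas(32) float dst[8];
  // Load 8 packed int32 lanes, convert each lane to float, store back.
  __m256i iv = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
  __m256 fv = _mm256_cvtepi32_ps(iv);
  _mm256_storeu_ps(dst, fv);
  printf("%.1f %.1f\n", dst[0], dst[7]);  // 1.0 8.0
}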