
Commit 578f601

Merge remote-tracking branch 'upstream/master'
2 parents 3a0a260 + a38b572 commit 578f601

26 files changed: +566 -285 lines changed

.jenkins/pytorch/build.sh

Lines changed: 1 addition & 9 deletions
@@ -74,7 +74,7 @@ fi
 WERROR=1 python setup.py install
 
 # Add the test binaries so that they won't be git clean'ed away
-git add -f build/bin
+git add -f build/bin build/lib
 
 # Testing ATen install
 if [[ "$BUILD_ENVIRONMENT" != *cuda* ]]; then
@@ -101,11 +101,3 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
 make html
 popd
 fi
-
-# Test no-Python build
-if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then
-  echo "Building libtorch"
-  # NB: Install outside of source directory (at the same level as the root
-  # pytorch folder) so that it doesn't get cleaned away prior to docker push.
-  WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$PWD/../cpp-build"
-fi

.jenkins/pytorch/macos-build.sh

Lines changed: 6 additions & 0 deletions
@@ -61,6 +61,12 @@ export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID}
 
 python setup.py install
 
+# this is a bit hacky, but not too bad. Bundle the test binaries into
+# the installation directory, so they can catch a free ride on the 7z
+# train.
+mkdir -p ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch/test_binaries/build
+mv build/{bin,lib} ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch/test_binaries/build/
+
 # Upload torch binaries when the build job is finished
 7z a ${IMAGE_COMMIT_TAG}.7z ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch*
 aws s3 cp ${IMAGE_COMMIT_TAG}.7z s3://ossci-macos-build/pytorch/${IMAGE_COMMIT_TAG}.7z --acl public-read

.jenkins/pytorch/macos-test.sh

Lines changed: 1 addition & 10 deletions
@@ -50,22 +50,13 @@ test_python_all() {
 test_cpp_api() {
   # C++ API
 
-  # NB: Install outside of source directory (at the same level as the root
-  # pytorch folder) so that it doesn't get cleaned away prior to docker push.
-  # But still clean it before we perform our own build.
-  #
-  CPP_BUILD="$PWD/../cpp-build"
-  rm -rf $CPP_BUILD
-  mkdir -p $CPP_BUILD
-  WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$CPP_BUILD"
-
   python tools/download_mnist.py --quiet -d test/cpp/api/mnist
 
   # Unfortunately it seems like the test can't load from miniconda3
   # without these paths being set
   export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$PWD/miniconda3/lib"
   export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/miniconda3/lib"
-  "$CPP_BUILD"/caffe2/bin/test_api
+  ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch/test_binaries/build/bin/test_api
 }
 
 if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then

.jenkins/pytorch/test.sh

Lines changed: 3 additions & 4 deletions
@@ -108,14 +108,13 @@ test_torchvision() {
 test_libtorch() {
   if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then
     echo "Testing libtorch"
-    CPP_BUILD="$PWD/../cpp-build"
     if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-      "$CPP_BUILD"/caffe2/bin/test_jit
+      ./build/bin/test_jit
     else
-      "$CPP_BUILD"/caffe2/bin/test_jit "[cpu]"
+      ./build/bin/test_jit "[cpu]"
    fi
    python tools/download_mnist.py --quiet -d test/cpp/api/mnist
-    OMP_NUM_THREADS=2 "$CPP_BUILD"/caffe2/bin/test_api
+    OMP_NUM_THREADS=2 ./build/bin/test_api
   fi
 }

caffe2/core/tensor.h

Lines changed: 55 additions & 19 deletions
@@ -231,6 +231,17 @@ class Tensor {
 
   virtual ~Tensor() noexcept {}
 
+  /**
+   * @brief Extend the outer-most dimension of this tensor
+   * to dimension of `num`.
+   */
+  void ExtendTo(TIndex num, float growthPct, BaseContext* context) {
+    CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1);
+    CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0);
+    CAFFE_ENFORCE(context != nullptr, "Context must be provided.");
+    Extend(num - dims_[0], growthPct, context);
+  }
+
   /**
    * @brief Extends the outer-most dimension of this tensor by num elements,
    * preserving the existing data.
@@ -242,6 +253,8 @@ class Tensor {
    */
   void Extend(TIndex num, float growthPct, BaseContext* context) {
     CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1);
+    CAFFE_ENFORCE_GE_WITH_CALLER(
+        num, 0, "`num` must be non-negative for Extend");
     auto newDims = dims_;
     newDims[0] += num;
     if (!data_) {
@@ -261,30 +274,17 @@ class Tensor {
     auto newCapacity = dims_;
     newCapacity[0] = std::max<size_t>(
         newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100));
-    Reserve(newCapacity, context);
-    dims_ = newDims;
-    size_ = newSize;
-  }
-
-  template <class T>
-  void Reserve(const std::vector<T>& newCapacity, BaseContext* context) {
-    auto newSize = std::accumulate(
-        newCapacity.begin(),
-        newCapacity.end(),
-        static_cast<TIndex>(1),
-        std::multiplies<TIndex>());
-    if (newSize * meta_.itemsize() <= capacity_) {
-      return;
-    }
     auto oldData = std::move(data_);
     auto oldSize = size_;
     auto oldDims = dims_;
     Resize(newCapacity);
     auto* newData = raw_mutable_data(meta_);
+    CAFFE_ENFORCE(
+        context != nullptr, "Context must be provided to Extend the tensor");
     context->CopyItemsSameDevice(meta_, oldSize, oldData.get(), newData);
-    dims_ = oldDims;
-    size_ = oldSize;
     reserved_ = true;
+    dims_ = newDims;
+    size_ = newSize;
   }
 
   /**
@@ -293,7 +293,7 @@ class Tensor {
    * This method guarantees that no re-allocations are carried out, which means
    * that the extra capacity after the end of the shurnk tensor is maintained.
    */
-  void Shrink(TIndex outer_dim) {
+  void ShrinkTo(TIndex outer_dim) {
     CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D");
     CAFFE_ENFORCE_WITH_CALLER(
         outer_dim <= dims_[0],
@@ -306,6 +306,38 @@ class Tensor {
         std::multiplies<TIndex>());
   }
 
+  /**
+   * @brief Reserve space for the underlying tensor.
+   *
+   * This must be called after Resize(), since we only specify the first
+   * dimension This does not copy over the old data to the newly allocated space
+   */
+  template <class T>
+  void ReserveSpace(const T& outer_dim) {
+    CAFFE_ENFORCE(
+        size_ != -1, "size should be initialized before calling ReserveSpace");
+    auto newCapacity = dims_;
+    newCapacity[0] = outer_dim;
+    auto newSize = std::accumulate(
+        newCapacity.begin(),
+        newCapacity.end(),
+        static_cast<TIndex>(1),
+        std::multiplies<TIndex>());
+    if (newSize * meta_.itemsize() <= capacity_) {
+      return;
+    }
+    // Old data is discarded
+    data_.reset();
+    auto oldSize = size_;
+    auto oldDims = dims_;
+    Resize(newCapacity);
+    // Allocate new memory and don't copy over the data
+    raw_mutable_data(meta_);
+    dims_ = oldDims;
+    size_ = oldSize;
+    reserved_ = true;
+  }
+
   /**
    * @brief Resizes a tensor.
   *
@@ -389,7 +421,7 @@ class Tensor {
     capacity_ = 0;
     // If reserved is true and we changed tensor memory then it is fine
     // to switch it to false, if Resize is called from Reserve and it triggers
-    // FreeMemory() then reserved_ will be set to true at end of Reserve()
+    // FreeMemory() then reserved_ will be set to true at end of ReserveSpace()
     reserved_ = false;
   }
 
@@ -740,6 +772,10 @@ class Tensor {
   TypeMeta meta_;
   std::shared_ptr<void> data_;
   size_t capacity_ = 0;
+  // we decide to keep reserved and it will
+  // live in Tensor after the split
+  // The logic is that if Extend() or ReserveSpace() were ever called,
+  // then subsequent Resize()s will not free up Storage.
   bool reserved_ = false;
   DeviceType device_type_ = CPU;
   // In case of chunk load we store how much data was already loaded
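
For orientation (not part of the diff): the commit renames Shrink() to ShrinkTo(), replaces the old internal Reserve() with a public ReserveSpace(), and adds ExtendTo() on top of Extend(). A minimal sketch of how these members fit together; the helper name and the 50% growth figure are illustrative only, and it assumes CPUContext is usable wherever a BaseContext* is expected:

#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {

// Illustrative helper only; assumes `t` has already been Resize()d and
// filled, and that `ctx` is a live CPUContext.
void GrowAndTrimOuterDim(Tensor* t, TIndex extra_rows, CPUContext* ctx) {
  // Extend(): grow the outer dimension by `extra_rows`, copying the old
  // rows over; growthPct = 50 requests 50% slack capacity on reallocation.
  t->Extend(extra_rows, 50.f, ctx);

  // ExtendTo(): the same operation expressed as an absolute target for the
  // outer dimension (must be >= the current one).
  t->ExtendTo(t->dim(0) + extra_rows, 50.f, ctx);

  // ShrinkTo() (renamed from Shrink()): reduce the outer dimension without
  // releasing capacity, so a later Extend() can reuse it.
  t->ShrinkTo(t->dim(0) - extra_rows);

  // ReserveSpace(): pre-allocate capacity for a larger outer dimension.
  // Unlike Extend(), it discards the existing data rather than copying it.
  t->ReserveSpace(t->dim(0) + 2 * extra_rows);
}

}  // namespace caffe2

The reserved_ flag set by Extend() and ReserveSpace() is what keeps subsequent Resize()s from freeing the underlying storage, per the new comment on the member above.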

caffe2/experiments/operators/tt_pad_op.h

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ class TTPadGradientOp final : public Operator<Context> {
     auto dim1 = G.dim(1);
 
     if (old_dim0 < new_dim0) {
-      output->Shrink(old_dim0);
+      output->ShrinkTo(old_dim0);
     }
 
     return true;

caffe2/mobile/contrib/ulp2/ulp_neon.cc

Lines changed: 1 addition & 1 deletion
@@ -537,7 +537,7 @@ void run2b1bConvIm2ColGEMM(QConvState* state,
   } else {
     CAFFE_ENFORCE_EQ(Y->dim32(0), divRoundUp(X.dim32(0) * OH * OW, kGEMMTileSize) * kGEMMTileSize);
     CAFFE_ENFORCE_EQ(Y->dim32(1), OC);
-    Y->Shrink(X.dim32(0) * OH * OW);
+    Y->ShrinkTo(X.dim32(0) * OH * OW);
     Y->Reshape(std::vector<TIndex>{{TIndex(X.dim(0)), TIndex(OH), TIndex(OW), TIndex(OC)}});
   }
 }

caffe2/operators/dataset_ops.cc

Lines changed: 1 addition & 1 deletion
@@ -1004,7 +1004,7 @@ class TrimDatasetOp : public Operator<CPUContext> {
     // trim each column to the offset
     for (int col = 0; col < walker.fields().size(); ++col) {
       auto newOuterSize = walker.fields().at(col).offset();
-      Output(col)->Shrink(newOuterSize);
+      Output(col)->ShrinkTo(newOuterSize);
     }
     return true;
   }

caffe2/operators/group_norm_op.cc

Lines changed: 73 additions & 20 deletions
@@ -18,29 +18,64 @@ namespace caffe2 {
 namespace {
 
 template <typename T>
-inline T Cube(const T& x) {
-  return x * x * x;
+void GroupNormForwardNCHW(
+    const int N,
+    const int G,
+    const int D,
+    const int HxW,
+    const T* X,
+    const T* mu,
+    const T* rsig,
+    const T* gamma,
+    const T* beta,
+    T* Y) {
+  const int C = G * D;
+  EigenArrayMap<T>(Y, D * HxW, N * G) =
+      (ConstEigenArrayMap<T>(X, D * HxW, N * G).rowwise() -
+       ConstEigenVectorArrayMap<T>(mu, N * G).transpose())
+          .rowwise() *
+      ConstEigenVectorArrayMap<T>(rsig, N * G).transpose();
+  T* Y_ptr = Y;
+  const int stride = C * HxW;
+  ConstEigenVectorArrayMap<T> gamma_arr(gamma, C);
+  ConstEigenVectorArrayMap<T> beta_arr(beta, C);
+  for (int i = 0; i < N; ++i) {
+    EigenArrayMap<T> Y_arr(Y_ptr, HxW, C);
+    Y_arr = (Y_arr.rowwise() * gamma_arr.transpose()).rowwise() +
+        beta_arr.transpose();
+    Y_ptr += stride;
+  }
 }
 
-template <typename T, StorageOrder kOrder>
-void GroupNormForward(
-    const std::array<int, 4>& dims,
+template <typename T>
+void GroupNormForwardNHWC(
+    const int N,
+    const int G,
+    const int D,
+    const int HxW,
     const T* X,
     const T* mu,
     const T* rsig,
     const T* gamma,
     const T* beta,
     T* Y) {
-  constexpr int kGDim = kOrder == StorageOrder::NCHW ? 1 : 2;
-  constexpr int kDDim = kOrder == StorageOrder::NCHW ? 2 : 3;
-  const int size = dims[0] * dims[1] * dims[2] * dims[3];
-  std::array<int, 4> index = {0, 0, 0, 0};
-  for (int i = 0; i < size; ++i) {
-    const int i_mu = index[0] * dims[kGDim] + index[kGDim];
-    const int i_gamma = index[kGDim] * dims[kDDim] + index[kDDim];
-    Y[i] = gamma[i_gamma] * (X[i] - mu[i_mu]) * rsig[i_mu] + beta[i_gamma];
-    math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
+  const int C = G * D;
+  const T* X_ptr = X;
+  T* Y_ptr = Y;
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < HxW; ++j) {
+      EigenArrayMap<T>(Y_ptr, D, G) =
+          (ConstEigenArrayMap<T>(X_ptr, D, G).rowwise() -
+           ConstEigenVectorArrayMap<T>(mu + i * G, G).transpose())
+              .rowwise() *
+          ConstEigenVectorArrayMap<T>(rsig + i * G, G).transpose();
+      X_ptr += C;
+      Y_ptr += C;
+    }
   }
+  EigenArrayMap<T> Y_arr(Y, C, N * HxW);
+  Y_arr = (Y_arr.colwise() * ConstEigenVectorArrayMap<T>(gamma, C)).colwise() +
+      ConstEigenVectorArrayMap<T>(beta, C);
 }
 
 template <typename T, StorageOrder kOrder>
@@ -97,8 +132,8 @@ void GroupNormBackward(
   for (int i = 0; i < size; ++i) {
     const int i_mu = index[0] * dims[kGDim] + index[kGDim];
     const int i_gamma = index[kGDim] * dims[kDDim] + index[kDDim];
-    const T u =
-        (db[i_mu] * mu[i_mu] - ds[i_mu]) * (X[i] - mu[i_mu]) * Cube(rsig[i_mu]);
+    const T u = (db[i_mu] * mu[i_mu] - ds[i_mu]) * (X[i] - mu[i_mu]) *
+        math::utils::Cube(rsig[i_mu]);
     const T v = db[i_mu] * rsig[i_mu];
     dX[i] = gamma[i_gamma] * dY[i] * rsig[i_mu] + (u - v) * denom;
     dgamma[i_gamma] += dY[i] * (X[i] - mu[i_mu]) * rsig[i_mu];
@@ -138,11 +173,29 @@ bool GroupNormOp<T, Context>::RunOnDeviceImpl(
 
   // Computes Y = gamma * (X - mu) * rsig + beta.
   if (order_ == StorageOrder::NCHW) {
-    GroupNormForward<T, StorageOrder::NCHW>(
-        dims, X_data, mu_data, rsig_data, gamma_data, beta_data, Y_data);
+    GroupNormForwardNCHW<T>(
+        N,
+        G,
+        D,
+        HxW,
+        X_data,
+        mu_data,
+        rsig_data,
+        gamma_data,
+        beta_data,
+        Y_data);
   } else {
-    GroupNormForward<T, StorageOrder::NHWC>(
-        dims, X_data, mu_data, rsig_data, gamma_data, beta_data, Y_data);
+    GroupNormForwardNHWC<T>(
+        N,
+        G,
+        D,
+        HxW,
+        X_data,
+        mu_data,
+        rsig_data,
+        gamma_data,
+        beta_data,
+        Y_data);
   }
   return true;
 }
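
Context for the change (not part of the diff): the single index-walking GroupNormForward<T, kOrder> is replaced by layout-specific, Eigen-vectorized kernels, and the local Cube() helper is dropped in favor of math::utils::Cube. Both layouts still compute Y = gamma * (X - mu) * rsig + beta element-wise; a naive NCHW reference with hypothetical names, for comparison with the vectorized kernel above:

// Scalar reference for the NCHW kernel: N batches, G groups, D channels per
// group, HxW spatial positions; mu/rsig are per (n, g), gamma/beta per channel.
template <typename T>
void GroupNormForwardNCHWReference(
    int N, int G, int D, int HxW,
    const T* X, const T* mu, const T* rsig,
    const T* gamma, const T* beta, T* Y) {
  for (int n = 0; n < N; ++n) {
    for (int g = 0; g < G; ++g) {
      const T m = mu[n * G + g];    // group mean
      const T r = rsig[n * G + g];  // group inverse standard deviation
      for (int d = 0; d < D; ++d) {
        const int c = g * D + d;    // absolute channel index
        const T* x = X + ((n * G + g) * D + d) * HxW;
        T* y = Y + ((n * G + g) * D + d) * HxW;
        for (int s = 0; s < HxW; ++s) {
          // Same formula the op documents: Y = gamma * (X - mu) * rsig + beta.
          y[s] = gamma[c] * (x[s] - m) * r + beta[c];
        }
      }
    }
  }
}

The NHWC variant applies the same per-group normalization, but because channels are innermost there, gamma and beta can be folded in with a single column-wise Eigen expression at the end.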
