Skip to content

Commit 67097e9

Browse files
committed
Merge remote-tracking branch 'upstream/master' into ifu
2 parents 0fbc1c0 + a5d7abe commit 67097e9

File tree

89 files changed

+1689
-482
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

89 files changed

+1689
-482
lines changed

.jenkins/pytorch/build.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ fi
66

77
# TODO: move this to Docker
88
# TODO: add both NCCL and MPI in CI test by fixing these test first
9-
# sudo apt-get update
10-
# sudo apt-get install libnccl-dev libnccl2
9+
sudo apt-get update
10+
sudo apt-get install libnccl-dev libnccl2
1111
# sudo apt-get install openmpi-bin libopenmpi-dev
1212

1313
# Required environment variable: $BUILD_ENVIRONMENT

CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ option(USE_DISTRIBUTED "Use THD (distributed)" OFF)
144144
# Used when building Caffe2 through setup.py
145145
option(BUILDING_WITH_TORCH_LIBS "Tell cmake if Caffe2 is being built alongside torch libs" OFF)
146146

147-
if (ANDROID OR IOS)
147+
if (ANDROID OR IOS)
148148
set(BUILD_ATEN_MOBILE ON)
149149
endif()
150150

@@ -213,7 +213,9 @@ if(NOT MSVC)
213213
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-overflow")
214214
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing")
215215
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations")
216-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow")
216+
if (CMAKE_COMPILER_IS_GNUCXX AND NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0))
217+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow")
218+
endif()
217219
# These flags are not available in GCC-4.8.5. Set only when using clang.
218220
# Compared against https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Option-Summary.html
219221
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")

aten/src/ATen/Retainable.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22

33
#include <atomic>
44

5+
#include "ATen/core/ATenGeneral.h"
6+
57
namespace at {
68

79
// base class for refcounted things, allows for collects of generic
810
// refcounted objects that include tensors
9-
struct Retainable {
11+
struct AT_API Retainable {
1012
Retainable(): refcount(1), weak_refcount(1) {}
1113
void retain() {
1214
++refcount;

aten/src/ATen/core/ATenGeneral.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@
44

55
// TODO: Merge the *_API macros.
66
#define AT_API AT_CORE_API
7+
#define AT_EXPORT AT_CORE_EXPORT
8+
#define AT_IMPORT AT_CORE_IMPORT

aten/src/ATen/core/Error.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ namespace at {
1919
namespace detail {
2020

2121
// Obtains the base name from a full path.
22-
std::string StripBasename(const std::string& full_path);
22+
AT_CORE_API std::string StripBasename(const std::string& full_path);
2323

2424
inline std::ostream& _str(std::ostream& ss) {
2525
return ss;
@@ -56,7 +56,7 @@ inline std::string str(const char* c_str) {
5656
}
5757

5858
/// Represents a location in source code (for debugging).
59-
struct SourceLocation {
59+
struct AT_CORE_API SourceLocation {
6060
const char* function;
6161
const char* file;
6262
uint32_t line;

aten/src/ATen/core/Macros.h

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,33 @@
1212

1313
#ifdef _WIN32
1414
#if !defined(AT_CORE_STATIC_WINDOWS)
15-
// TODO: unfiy the controlling macros.
16-
#if defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
17-
#define AT_CORE_API __declspec(dllexport)
18-
#else // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
19-
#define AT_CORE_API __declspec(dllimport)
20-
#endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
15+
#define AT_CORE_EXPORT __declspec(dllexport)
16+
#define AT_CORE_IMPORT __declspec(dllimport)
2117
#else // !defined(AT_CORE_STATIC_WINDOWS)
22-
#define AT_CORE_API
18+
#define AT_CORE_EXPORT
19+
#define AT_CORE_IMPORT
2320
#endif // !defined(AT_CORE_STATIC_WINDOWS)
2421
#else // _WIN32
2522
#if defined(__GNUC__)
26-
#define AT_CORE_API __attribute__((__visibility__("default")))
23+
#define AT_CORE_EXPORT __attribute__((__visibility__("default")))
24+
#else // defined(__GNUC__)
25+
#define AT_CORE_EXPORT
2726
#endif // defined(__GNUC__)
27+
#define AT_CORE_IMPORT AT_CORE_EXPORT
2828
#endif // _WIN32
2929

30+
// AT_CORE_API is a macro that, depending on whether you are building the
31+
// main library or not, resolves to either AT_CORE_EXPORT or
32+
// AT_CORE_IMPORT.
33+
//
34+
35+
// TODO: unify the controlling macros.
36+
#if defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
37+
#define AT_CORE_API AT_CORE_EXPORT
38+
#else // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
39+
#define AT_CORE_API AT_CORE_IMPORT
40+
#endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
41+
3042
// Disable the copy and assignment operator for a class. Note that this will
3143
// disable the usage of the class in std containers.
3244
#define AT_DISABLE_COPY_AND_ASSIGN(classname) \

aten/src/ATen/core/intrusive_ptr.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include <ATen/core/ATenGeneral.h>
34
#include <ATen/core/Error.h>
45
#include <atomic>
56
#include <stdexcept>
@@ -32,7 +33,7 @@ namespace c10 {
3233
// tells us if the object was allocated by us. If it wasn't, no
3334
// intrusive_ptr for you!
3435

35-
class intrusive_ptr_target {
36+
class AT_CORE_API intrusive_ptr_target {
3637
// Note [Weak references for intrusive refcounting]
3738
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3839
// Here's the scheme:
@@ -113,7 +114,7 @@ class intrusive_ptr_target {
113114

114115
namespace detail {
115116
template <class TTarget>
116-
struct intrusive_target_default_null_type final {
117+
struct AT_CORE_EXPORT intrusive_target_default_null_type final {
117118
static constexpr TTarget* singleton() noexcept {
118119
return nullptr;
119120
}
@@ -126,7 +127,7 @@ class weak_intrusive_ptr;
126127
template <
127128
class TTarget,
128129
class NullType = detail::intrusive_target_default_null_type<TTarget>>
129-
class intrusive_ptr final {
130+
class AT_CORE_EXPORT intrusive_ptr final {
130131
private:
131132
static_assert(
132133
std::is_base_of<intrusive_ptr_target, TTarget>::value,
@@ -415,7 +416,7 @@ inline bool operator!=(
415416
template <
416417
typename TTarget,
417418
class NullType = detail::intrusive_target_default_null_type<TTarget>>
418-
class weak_intrusive_ptr final {
419+
class AT_CORE_EXPORT weak_intrusive_ptr final {
419420
private:
420421
static_assert(
421422
std::is_base_of<intrusive_ptr_target, TTarget>::value,
@@ -797,13 +798,13 @@ namespace std {
797798
// To allow intrusive_ptr and weak_intrusive_ptr inside std::unordered_map or
798799
// std::unordered_set, we need std::hash
799800
template <class TTarget, class NullType>
800-
struct hash<c10::intrusive_ptr<TTarget, NullType>> {
801+
struct AT_CORE_EXPORT hash<c10::intrusive_ptr<TTarget, NullType>> {
801802
size_t operator()(const c10::intrusive_ptr<TTarget, NullType>& x) const {
802803
return std::hash<TTarget*>()(x.get());
803804
}
804805
};
805806
template <class TTarget, class NullType>
806-
struct hash<c10::weak_intrusive_ptr<TTarget, NullType>> {
807+
struct AT_CORE_EXPORT hash<c10::weak_intrusive_ptr<TTarget, NullType>> {
807808
size_t operator()(const c10::weak_intrusive_ptr<TTarget, NullType>& x) const {
808809
return std::hash<TTarget*>()(x._unsafe_get_target());
809810
}

aten/src/ATen/native/cuda/Embedding.cu

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,18 @@ __global__ void embedding_backward_feature_kernel
4747
if(batch_start + tid < n)
4848
indices_batch[tid] = (int)indices[batch_start + tid];
4949

50+
int batch_end = batch_start + blockDim.x*blockDim.y < n ?
51+
batch_start + blockDim.x*blockDim.y : n;
52+
5053
// Loop over the batch of <= 1024 loaded indices in chunks of blockDim.y = 32
51-
for(int chunk_start = batch_start; chunk_start < n; chunk_start += blockDim.y)
54+
for(int chunk_start = batch_start; chunk_start < batch_end; chunk_start += blockDim.y)
5255
{
5356
// This does double duty: it makes sure indices_batch is ready, and it makes sure match-group
5457
// leaders are done with their accumulates before other warps start loading again.
5558
__syncthreads();
5659

57-
int n_this_chunk = (n - chunk_start) < blockDim.y ? (n - chunk_start) : blockDim.y;
60+
int n_this_chunk = (batch_end - chunk_start) < blockDim.y ?
61+
(batch_end - chunk_start) : blockDim.y;
5862

5963
int src_row = chunk_start + threadIdx.y;
6064
int dst_row = indices_batch[src_row - batch_start]; // This warp's target row in grad_weight

caffe2/CMakeLists.txt

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ target_link_libraries(caffe2_protos PUBLIC protobuf::libprotobuf)
207207
# Compile exposed libraries.
208208
list(APPEND Caffe2_CPU_SRCs $<TARGET_OBJECTS:c10>)
209209
add_library(caffe2 ${Caffe2_CPU_SRCS})
210+
target_compile_options(caffe2 PRIVATE "-fvisibility=hidden")
210211
caffe2_interface_library(caffe2_protos caffe2_protos_whole)
211212
target_link_libraries(caffe2 PRIVATE caffe2_protos_whole)
212213
if (${CAFFE2_LINK_LOCAL_PROTOBUF})
@@ -229,13 +230,6 @@ else()
229230
target_compile_options(caffe2 INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-std=c++11>")
230231
endif()
231232

232-
# Note(jiayq): This is not complete yet, but in the end we will need to deal with
233-
# explicit hidden visibility.
234-
# This line is here so that when testing build, we can enable it to properly test
235-
# annotation of public symbols. When finally doing proper build with all symbols
236-
# annotated, we will enable this line and have it wrapped with gcc/clang checks.
237-
# target_compile_options(caffe2 PRIVATE "-fvisibility=hidden")
238-
239233
target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
240234
if (MSVC AND NOT BUILD_SHARED_LIBS)
241235
# Note [Supporting both static and dynamic libraries on Window]

caffe2/core/nomnigraph/README.md

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,22 @@ Do not create `OperatorDef`s in the transformation itself! This is an anti-patte
6767

6868
Below is a subset of selected API calls that are quite useful. Lower level manipulation calls are omitted.
6969

70+
### Graph transformation API
71+
Nomnigraph provides a ReplaceSubgraph API to perform graph transformation operations without having to write custom subgraph matching logic. The main header file is [SubgraphMatcher.h](include/nomnigraph/Transformations/SubgraphMatcher.h).
72+
73+
The ReplaceSubgraph API takes in
74+
- A subgraph pattern to be matched
75+
- A graph to be scanned for matching patterns
76+
- A ReplaceGraph lambda function that takes in a matched subgraph; callers should implement specific graph transformation operation in the lambda.
77+
78+
The ReplaceSubgraph implementation takes care of the pattern matching part and also provides tools for callers to implement graph transformation logic with less effort.
79+
80+
Example usage of the API can be found in [subgraph_matcher_test.cc](tests/subgraph_matcher_test.cc).
81+
82+
Example usage of the API for NNGraph can be found in [neural_net_test.cc](tests/neural_net_test.cc).
83+
7084
### Graph API
85+
Nomnigraph's core graph APIs provide a generic graph data structure and basic graph manipulation abilities. The main header file is [Graph.h](include/nomnigraph/Graph/Graph.h).
7186

7287
```cpp
7388
auto g = Graph<T>(); // Constructor
@@ -91,6 +106,9 @@ T d = n->data(); // Get the data stored at the node
91106
```
92107

93108
### NN API
109+
NN (NeuralNet) extends core Graph with functionalities specific to neural network computation graph. The main header file is [NeuralNet.h](include/nomnigraph/Representations/NeuralNet.h).
110+
111+
Type checking & data accessing
94112

95113
```cpp
96114
repr::NNModule nn = ...;
@@ -101,13 +119,18 @@ repr::NNGraph::NodeRef n; // Canonical node of the neural network
101119
bool b = repr::nn::is<repr::Tensor>(n); // Checks the type stored on the node. (Works with parent types.)
102120

103121
repr::Conv* c = repr::nn::get<repr::Conv>(n); // Returns a pointer to the NeuralNetOperator or NeuralNetData in the node
122+
```
104123

124+
Iterate through nodes in an NNGraph.
125+
```cpp
105126
auto pairs = dataIterator(nn); // A useful paradigm for iterating through nodes and corresponding data in no particular order.
106127
auto nodeRefs = nodeIterator(nn); // Iterate through nodes in no particular order.
107128
// See https://github.com/pytorch/pytorch/blob/master/caffe2/opt/mobile.cc#L106-L109
129+
```
108130

109131

110-
///// These functions make it easy to check attributes on nodes. /////
132+
These functions make it easy to check attributes on nodes.
133+
```cpp
111134
// -- Tensor node functions --
112135
bool b = hasProducer(tensorNode); // Checks for producers.
113136
auto n = getProducer(tensorNode); // Returns the producer of the tensor
@@ -118,8 +141,10 @@ std::vector<NNGraph::NodeRef> consumers = getConsumers(tensorNode); // Returns a
118141
bool b = hasInputs(n); // Checks if there are any input tensors.
119142
std::vector<NNGraph::NodeRef> getInputs(n); // Returns a vector of all the input tensor nodes.
120143
std::vector<NNGraph::NodeRef> getOutputs(n); // Returns a vector of all the output tensor nodes.
144+
```
121145
122-
///// These functions are less commonly useful /////
146+
These functions are less commonly useful.
147+
```cpp
123148
coalesceInsertedDataDependencies(&nn); // Fixes up all the inserted dependencies in the dataflow graph.
124149
125150
insertOp<repr::Relu>(nn.dataFlow, n1, n2); // Inserts an operator into the dataflow graph and creates a new blob to do so.

caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,36 @@ class BatchNormalization : public NeuralNetOperator {
531531
bool IsTest;
532532
};
533533

534+
class Clip : public NeuralNetOperator {
535+
public:
536+
Clip(float min, float max)
537+
: NeuralNetOperator(NNKind::Clip), Min(min), Max(max) {}
538+
539+
~Clip() {}
540+
541+
NOMNIGRAPH_DEFINE_NN_RTTI(Clip);
542+
543+
float getMin() const {
544+
return Min;
545+
}
546+
547+
float getMax() const {
548+
return Max;
549+
}
550+
551+
void setMin(float min) {
552+
Min = min;
553+
}
554+
555+
void setMax(float max) {
556+
Max = max;
557+
}
558+
559+
private:
560+
float Min;
561+
float Max;
562+
};
563+
534564
class FC : public NeuralNetOperator {
535565
public:
536566
FC() : NeuralNetOperator(NNKind::FC) {}
@@ -638,6 +668,28 @@ class Flatten : public NeuralNetOperator {
638668
private:
639669
};
640670

671+
class CopyToOpenCL : public NeuralNetOperator {
672+
public:
673+
CopyToOpenCL() : NeuralNetOperator(NNKind::CopyToOpenCL) {}
674+
675+
~CopyToOpenCL() {}
676+
677+
NOMNIGRAPH_DEFINE_NN_RTTI(CopyToOpenCL);
678+
679+
private:
680+
};
681+
682+
class CopyFromOpenCL : public NeuralNetOperator {
683+
public:
684+
CopyFromOpenCL() : NeuralNetOperator(NNKind::CopyFromOpenCL) {}
685+
686+
~CopyFromOpenCL() {}
687+
688+
NOMNIGRAPH_DEFINE_NN_RTTI(CopyFromOpenCL);
689+
690+
private:
691+
};
692+
641693
class NCHW2NHWC : public NeuralNetOperator {
642694
public:
643695
NCHW2NHWC() : NeuralNetOperator(NNKind::NCHW2NHWC) {}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
Relu, Conv, ConvRelu, ConvTranspose, AveragePool, AveragePoolRelu, MaxPool,
2-
MaxPoolRelu, Sum, SumRelu, Send, Receive, BatchNormalization, FC,
2+
MaxPoolRelu, Sum, SumRelu, Send, Receive, BatchNormalization, Clip, FC,
33
GivenTensorFill, Concat, Softmax, ChannelShuffle, Add, Reshape, Flatten,
4-
NCHW2NHWC, NHWC2NCHW
4+
CopyToOpenCL, CopyFromOpenCL, NCHW2NHWC, NHWC2NCHW

caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ case NNKind::Receive:
3737
case NNKind::BatchNormalization:
3838
return "BatchNormalization";
3939

40+
case NNKind::Clip:
41+
return "Clip";
42+
4043
case NNKind::FC:
4144
return "FC";
4245

@@ -61,6 +64,12 @@ case NNKind::Reshape:
6164
case NNKind::Flatten:
6265
return "Flatten";
6366

67+
case NNKind::CopyToOpenCL:
68+
return "CopyToOpenCL";
69+
70+
case NNKind::CopyFromOpenCL:
71+
return "CopyFromOpenCL";
72+
6473
case NNKind::NCHW2NHWC:
6574
return "NCHW2NHWC";
6675

0 commit comments

Comments
 (0)