diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index bc5e6c973e5e36..928c5fa0e47387 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -157,6 +157,7 @@ if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then export HCC_AMDGPU_TARGET=gfx900 ########## HIPIFY Caffe2 operators + ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_pytorch_amd.py" ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py" fi @@ -190,7 +191,6 @@ else fi - ############################################################################### # Configure and make ############################################################################### diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index bfbd40b259fbc3..b0a08a922d9623 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -30,7 +30,6 @@ cmake --version pip install -r requirements.txt || true if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then - export MAX_JOBS=4 # This is necessary in order to cross compile (or else we'll have missing GPU device). export HCC_AMDGPU_TARGET=gfx900 @@ -48,6 +47,7 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then sudo apt-get install libc++abi1 python tools/amd_build/build_pytorch_amd.py + python tools/amd_build/build_caffe2_amd.py USE_ROCM=1 python setup.py install --user exit 0 fi diff --git a/CMakeLists.txt b/CMakeLists.txt index 23a5080a88d0a9..edbd4381c70bab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,7 +55,8 @@ endif() include(CMakeDependentOption) option(BUILD_TORCH "Build Torch" OFF) option(BUILD_CAFFE2 "Build Caffe2" ON) -option(BUILD_ATEN "Build ATen" OFF) +option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) +option(BUILD_ATEN_MOBILE "Build ATen for Android and iOS" OFF) option(BUILD_BINARY "Build C++ binaries" ON) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) @@ -75,7 +76,6 @@ cmake_dependent_option( "BUILD_TEST" OFF) option(USE_ACL "Use ARM Compute Library" OFF) option(USE_ASAN "Use Address Sanitizer" OFF) -option(USE_ATEN "Use ATen" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_ROCM "Use ROCm" OFF) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) @@ -145,8 +145,8 @@ option(USE_DISTRIBUTED_MW "Use THD (distributed) master worker" OFF) # Used when building Caffe2 through setup.py option(BUILDING_WITH_TORCH_LIBS "Tell cmake if Caffe2 is being built alongside torch libs" OFF) -if (USE_ATEN) - set(BUILD_ATEN ${USE_ATEN}) +if (ANDROID OR IOS) + set(BUILD_ATEN_MOBILE ON) endif() # ---[ CMake scripts + modules diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 2f2ffdce186d39..ee025265a982e7 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -1,8 +1,4 @@ -if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - if (NOT BUILD_ATEN) - return() - endif() -else() +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) cmake_minimum_required(VERSION 3.0 FATAL_ERROR) project(ATen CXX C) include(CMakeDependentOption) @@ -14,9 +10,10 @@ else() USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) - - # Flag for shared dependencies - set(BUILD_ATEN ON) +else() + if (BUILD_ATEN_MOBILE) + return() + endif() endif() # Find modules diff --git a/aten/src/ATen/ATenGeneral.h b/aten/src/ATen/ATenGeneral.h index fd6eeb3937e0ee..6fd55e20ff7b36 100644 --- a/aten/src/ATen/ATenGeneral.h +++ b/aten/src/ATen/ATenGeneral.h @@ -3,4 +3,4 @@ #include "ATen/core/Macros.h" // TODO: Merge the *_API macros. 
-#define AT_API AT_CORE_API \ No newline at end of file +#define AT_API AT_CORE_API diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 9005fa189f9e66..f6d296dfe79e45 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -13,23 +13,6 @@ IF(NOT MSVC) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-absolute-value") ENDIF(NOT MSVC) -################################################################################ -# Helper functions -################################################################################ - -function(filter_list output input) - unset(result) - foreach(filename ${${input}}) - foreach(pattern ${ARGN}) - if("${filename}" MATCHES "${pattern}") - list(APPEND result "${filename}") - endif() - endforeach() - endforeach() - set(${output} ${result} PARENT_SCOPE) -endfunction() - - # Can be compiled standalone IF(NOT AT_INSTALL_BIN_DIR OR NOT AT_INSTALL_LIB_DIR OR NOT AT_INSTALL_INCLUDE_DIR OR NOT AT_INSTALL_SHARE_DIR) SET(AT_INSTALL_BIN_DIR "bin" CACHE PATH "AT install binary subdirectory") diff --git a/aten/src/ATen/cuda/ATenCUDAGeneral.h b/aten/src/ATen/cuda/ATenCUDAGeneral.h index 3beda18f31e764..366adf0f2396fe 100644 --- a/aten/src/ATen/cuda/ATenCUDAGeneral.h +++ b/aten/src/ATen/cuda/ATenCUDAGeneral.h @@ -1,7 +1,7 @@ #pragma once #ifdef _WIN32 -# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) +# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) || defined(CAFFE2_BUILD_MAIN_LIB) # define AT_CUDA_API __declspec(dllexport) # else # define AT_CUDA_API __declspec(dllimport) diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 257c8caf3e6fb3..755272915a826b 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -127,7 +127,7 @@ auto ConvParams::use_miopen(const at::Tensor& input) const -> bool { auto ConvParams::use_mkldnn(const at::Tensor& input) const -> bool { #if AT_MKLDNN_ENABLED() - return input.type().backend() == kCPU && + return input.type().backend() == at::Backend::CPU && input.type().scalarType() == kFloat && // only on CPU Float Tensors !is_dilated() && // doesn't support dilation !transposed && // or transposed tensors diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index c2ad676c56329d..c41992832ebcb2 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -29,13 +29,16 @@ using namespace at::native::detail; // counting_iterator => index to fill struct cnt_to_dst_idx_functor : public thrust::unary_function { - const int64_t last_dim_size; - const int64_t last_dim_start_slice; - const int64_t last_dim_to_fill_size; + int64_t last_dim_size; + int64_t last_dim_start_slice; + int64_t last_dim_to_fill_size; cnt_to_dst_idx_functor(int64_t last_dim_size, int64_t last_dim_start_slice) : last_dim_size(last_dim_size), last_dim_start_slice(last_dim_start_slice), last_dim_to_fill_size(last_dim_size - last_dim_start_slice) {} + + __host__ __device__ + cnt_to_dst_idx_functor & operator=(const cnt_to_dst_idx_functor&) = default; __host__ __device__ __forceinline__ int64_t operator()(const int64_t& i) const diff --git a/aten/src/ATen/native/miopen/BatchNorm.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp similarity index 100% rename from aten/src/ATen/native/miopen/BatchNorm.cpp rename to aten/src/ATen/native/miopen/BatchNorm_miopen.cpp diff --git a/aten/src/ATen/native/miopen/Conv.cpp 
b/aten/src/ATen/native/miopen/Conv_miopen.cpp similarity index 100% rename from aten/src/ATen/native/miopen/Conv.cpp rename to aten/src/ATen/native/miopen/Conv_miopen.cpp diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 44c536e7e5d701..bb9f7b92bd691a 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -514,7 +514,7 @@ void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); } -THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize) +void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize) { if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) { @@ -531,7 +531,7 @@ THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, co } -THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize) +void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize) { if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) { diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in index 3ba4ed9719b5ee..af0227abdd0b1c 100644 --- a/aten/src/THC/THCGeneral.h.in +++ b/aten/src/THC/THCGeneral.h.in @@ -23,7 +23,7 @@ #endif #ifdef _WIN32 -# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) +# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) || defined(CAFFE2_BUILD_MAIN_LIB) # define THC_API THC_EXTERNC __declspec(dllexport) # define THC_CLASS __declspec(dllexport) # else diff --git a/aten/src/THC/THCSleep.cu b/aten/src/THC/THCSleep.cu index d30576212e9b7b..a6ebbdb1f42742 100644 --- a/aten/src/THC/THCSleep.cu +++ b/aten/src/THC/THCSleep.cu @@ -12,7 +12,7 @@ __global__ void spin_kernel(int64_t cycles) } } -THC_API void THC_sleep(THCState* state, int64_t cycles) +void THC_sleep(THCState* state, int64_t cycles) { dim3 grid(1); dim3 block(1); diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 6ab3362d6ab20a..7c1a84cc19c687 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -7,7 +7,7 @@ include(../cmake/Codegen.cmake) add_subdirectory(utils) # ---[ ATen build -if(BUILD_ATEN) +if (NOT BUILD_ATEN_MOBILE) set(__caffe2_CMAKE_POSITION_INDEPENDENT_CODE ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(AT_LINK_STYLE INTERFACE) @@ -49,7 +49,7 @@ if(BUILD_ATEN) IF(USE_ROCM) # Set the HIP Variables - set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) + set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${ATen_CUDA_SRCS}) set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) ENDIF(USE_ROCM) else() @@ -340,6 +340,12 @@ if(USE_CUDA) target_compile_options(caffe2_gpu PUBLIC "-DAT_CORE_STATIC_WINDOWS=1") endif() + # NB: This must be target_compile_definitions, not target_compile_options, + # as the latter is not respected by nvcc + if (MSVC) + target_compile_definitions(caffe2_gpu PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") + endif() + # Set standard properties on the target aten_set_target_props(caffe2_gpu) @@ -351,21 +357,19 @@ endif() # ---[ Caffe2 HIP sources. 
if(USE_ROCM) # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs. - if(BUILD_ATEN) - # Get Compile Definitions from the directory (FindHIP.cmake bug) - get_directory_property(MY_DEFINITIONS COMPILE_DEFINITIONS) - if(MY_DEFINITIONS) - foreach(_item ${MY_DEFINITIONS}) - LIST(APPEND HIP_HCC_FLAGS "-D${_item}") - endforeach() - endif() - - # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs. - hip_include_directories(${Caffe2_HIP_INCLUDES}) + # Get Compile Definitions from the directory (FindHIP.CMake bug) + get_directory_property(MY_DEFINITIONS COMPILE_DEFINITIONS) + if(MY_DEFINITIONS) + foreach(_item ${MY_DEFINITIONS}) + LIST(APPEND HIP_HCC_FLAGS "-D${_item}") + endforeach() endif() - IF(BUILD_CAFFE2) - set_source_files_properties(${Caffe2_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - ENDIF() + + # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs. + hip_include_directories(${Caffe2_HIP_INCLUDES}) + + filter_list(__caffe2_hip_srcs_cpp Caffe2_HIP_SRCS "\\.(cc|cpp|cu)$") + set_source_files_properties(${__caffe2_hip_srcs_cpp} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) # FindHIP.CMake checks if the SHARED flag is set and adds extra logic accordingly. hip_add_library(caffe2_hip ${Caffe2_HIP_SRCS}) @@ -444,7 +448,7 @@ if(BUILD_CAFFE2) set(__aten_test_dir "test/aten") endif() # Todo - Set up ATen tests for ROCm in an upcoming PR -if(BUILD_ATEN AND NOT USE_ROCM) +if(NOT USE_ROCM) foreach(test_src ${ATen_CPU_TEST_SRCS}) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") diff --git a/caffe2/README.md b/caffe2/README.md index a1166b8e4f9451..afd8fab339c310 100644 --- a/caffe2/README.md +++ b/caffe2/README.md @@ -6,7 +6,7 @@ Caffe2 is a lightweight, modular, and scalable deep learning framework. Building ## Questions and Feedback -Please use Github issues (https://github.com/caffe2/caffe2/issues) to ask questions, report bugs, and request new features. +Please use Github issues (https://github.com/pytorch/pytorch/issues) to ask questions, report bugs, and request new features. ### Further Resources on [Caffe2.ai](http://caffe2.ai) diff --git a/caffe2/contrib/aten/CMakeLists.txt b/caffe2/contrib/aten/CMakeLists.txt index 5bc2341e3d2884..92eb671e019cb7 100644 --- a/caffe2/contrib/aten/CMakeLists.txt +++ b/caffe2/contrib/aten/CMakeLists.txt @@ -1,4 +1,4 @@ -if(BUILD_ATEN) +if(NOT BUILD_ATEN_MOBILE) # Add source generated by Codegen.cmake and pass to parent list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc) list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc) diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 8f5c79a74c6a9e..048d634df80dfa 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -94,19 +94,6 @@ using std::vector; #define CAFFE2_NORETURN __attribute__((noreturn)) #endif -/** - * Macro for marking functions as having public visibility. - * Ported from folly/CPortability.h - */ -#ifndef __GNUC_PREREQ -#if defined __GNUC__ && defined __GNUC_MINOR__ -#define __GNUC_PREREQ(maj, min) \ - ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) -#else -#define __GNUC_PREREQ(maj, min) 0 -#endif -#endif - // Defines CAFFE2_EXPORT and CAFFE2_IMPORT. On Windows, this corresponds to // different declarations (dllexport and dllimport). On Linux/Mac, it just // resolves to the same "default visibility" setting. 
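The comment above, the CAFFE2_BUILD_MAIN_LIB checks added to ATenCUDAGeneral.h and THCGeneral.h.in earlier in this diff, and the many CAFFE2_API annotations added further down all follow the same shared-library export idiom. A minimal sketch of that pattern is shown here for reference; the names MYLIB_API, MYLIB_BUILD_MAIN_LIB, Graph, and addOp are placeholders for illustration, not the exact definitions from caffe2/core/common.h.

```cpp
// Sketch of the export/import macro pattern (placeholder names, not the
// real caffe2 definitions).
#if defined(_WIN32)
  // While building the library itself, the build defines a marker such as
  // MYLIB_BUILD_MAIN_LIB (analogous to CAFFE2_BUILD_MAIN_LIB or the
  // per-target *_EXPORTS macros), so symbols are exported; code that merely
  // consumes the installed headers sees dllimport instead.
  #if defined(MYLIB_BUILD_MAIN_LIB)
    #define MYLIB_API __declspec(dllexport)
  #else
    #define MYLIB_API __declspec(dllimport)
  #endif
#elif defined(__GNUC__)
  // On GCC/Clang the same macro simply requests default visibility for both
  // the producer and the consumer of the library.
  #define MYLIB_API __attribute__((__visibility__("default")))
#else
  #define MYLIB_API
#endif

// Classes and free functions that must be reachable across the DLL/shared
// library boundary carry the annotation, which is what the CAFFE2_API edits
// in caffe2/core, caffe2/opt, and caffe2/onnx below accomplish.
class MYLIB_API Graph {
 public:
  int nodeCount() const;
};

MYLIB_API int addOp(int x);
```

Which branch a consumer gets depends only on whether the export marker is defined at compile time; that is also why the MSVC-only hunk above uses target_compile_definitions rather than target_compile_options for -DCAFFE2_BUILD_MAIN_LIB, since nvcc only forwards the former.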
@@ -120,11 +107,7 @@ using std::vector; #endif #else #if defined(__GNUC__) -#if __GNUC_PREREQ(4, 9) -#define CAFFE2_EXPORT [[gnu::visibility("default")]] -#else #define CAFFE2_EXPORT __attribute__((__visibility__("default"))) -#endif #else #define CAFFE2_EXPORT #endif diff --git a/caffe2/core/graph.h b/caffe2/core/graph.h index 1bd0d4fa9616da..ac037d5f0867ae 100644 --- a/caffe2/core/graph.h +++ b/caffe2/core/graph.h @@ -16,7 +16,7 @@ namespace transform { /** * Graph representation of an operator. */ -struct Node { +struct CAFFE2_API Node { public: // Empty constructor for resize Node() {} @@ -45,7 +45,7 @@ struct Node { /** * Graph representation of a Netdef. */ -struct Graph { +struct CAFFE2_API Graph { public: /** * Given a subgraph, gets all of the parents of the subgraph, as well as @@ -155,7 +155,7 @@ struct Graph { // Adds an operator def to a netdef. // Returns the ptr, if you want to add anything extra (such as device_option) -OperatorDef* AddOp( +CAFFE2_API OperatorDef* AddOp( NetDef* netdef_ptr, string op_type, std::vector inputs, @@ -168,12 +168,12 @@ OperatorDef* AddOp( * For example, if we wanted to match an operator to Conv or FC, we can give: * "Conv|FC" as the type() of that op. */ -bool MatchStrings(string p, string s); +CAFFE2_API bool MatchStrings(string p, string s); /** * This ensures that each named arg that exists in the pattern exists in g_op, * is equal in value. */ -bool MatchArguments(const OperatorDef& p_op, const OperatorDef& g_op); +CAFFE2_API bool MatchArguments(const OperatorDef& p_op, const OperatorDef& g_op); } // namespace caffe2 diff --git a/caffe2/core/net_simple.h b/caffe2/core/net_simple.h index 99060ddb0bcaf9..550326d700f0dd 100644 --- a/caffe2/core/net_simple.h +++ b/caffe2/core/net_simple.h @@ -16,7 +16,7 @@ namespace caffe2 { // This is the very basic structure you need to run a network - all it // does is simply to run everything in sequence. If you want more fancy control // such as a DAG-like execution, check out other better net implementations. 
-class SimpleNet : public NetBase { +class CAFFE2_API SimpleNet : public NetBase { public: SimpleNet(const std::shared_ptr& net_def, Workspace* ws); bool SupportsAsync() override { diff --git a/caffe2/core/nomnigraph/Representations/NeuralNet.cc b/caffe2/core/nomnigraph/Representations/NeuralNet.cc index a60ddb127d545b..c31de031f853c6 100644 --- a/caffe2/core/nomnigraph/Representations/NeuralNet.cc +++ b/caffe2/core/nomnigraph/Representations/NeuralNet.cc @@ -199,12 +199,13 @@ NNNodeMatchCriteria matchAnyNode() { [](NNGraph::NodeRef /* unused */) { return true; }, "matchAnyNode"); } -NNMatchGraph::NodeRef operatorTree( +NNMatchGraph::NodeRef operatorSubgraph( NNMatchGraph& g, const NNNodeMatchCriteria& root, const std::vector& childrenCriteria, int count) { - return tree(g, matchAnyNode(), {tree(g, root, childrenCriteria)}, count); + return subgraph( + g, matchAnyNode(), {subgraph(g, root, childrenCriteria)}, count); } } // namespace nn diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index 425c6ffe3fec46..4f072545d4e6ed 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -412,6 +412,10 @@ class Graph { return result; } + size_t getEdgesCount() const { + return (size_t)edges_.size(); + } + private: std::list> nodes_; std::list> edges_; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index ac4e1fa61328e1..98e1bcba123642 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -487,9 +487,9 @@ using NNSubgraphMatcher = nom::matcher::SubgraphMatcher; // This helper method makes it easy to create matching criteria in NNGraph. -// For example, operatorTree(opMatch, ...) will refer to a tree like this: +// For example, operatorSubgraph(opMatch, ...) will refer to a tree like this: // ... 
-> opMatch -> opMatch_Output -NNMatchGraph::NodeRef operatorTree( +NNMatchGraph::NodeRef operatorSubgraph( NNMatchGraph& g, const NNNodeMatchCriteria& root, const std::vector& childrenCriteria = {}, diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h index 020454bf72d33c..9e0f44c896ac1e 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -5,6 +5,7 @@ #include #include +#include #include namespace nom { @@ -56,7 +57,7 @@ template using MatchNodeRef = typename MatchGraph::NodeRef; template -MatchNodeRef tree( +MatchNodeRef subgraph( MatchGraph& graph, const NodeMatchCriteria& root, const std::vector>& children, @@ -96,19 +97,20 @@ std::string debugString(MatchNodeRef rootCriteriaRef) { } template -class SubtreeMatchResult { +class SubgraphMatchResult { public: - static SubtreeMatchResult notMatched( + static SubgraphMatchResult notMatched( const std::string& debugMessage) { - return SubtreeMatchResult(false, debugMessage); + return SubgraphMatchResult(false, debugMessage); } - static SubtreeMatchResult notMatched() { - return SubtreeMatchResult(false, "Debug message is not enabled"); + static SubgraphMatchResult notMatched() { + return SubgraphMatchResult( + false, "Debug message is not enabled"); } - static SubtreeMatchResult matched() { - return SubtreeMatchResult(true, ""); + static SubgraphMatchResult matched() { + return SubgraphMatchResult(true, "Matched"); } bool isMatch() const { @@ -120,7 +122,7 @@ class SubtreeMatchResult { } private: - SubtreeMatchResult(bool isMatch, const std::string& debugMessage) + SubgraphMatchResult(bool isMatch, const std::string& debugMessage) : isMatch_(isMatch), debugMessage_(debugMessage) {} const bool isMatch_; @@ -141,32 +143,103 @@ struct SubgraphMatcher { return NodeMatcherClass::isMatch(node, criteria); } - // Check if there can be a sub-tree that matches the given criteria that + // Check if there can be a subgraph that matches the given criteria that // is rooted at the given rootNode. // The flag invertGraphTraversal specify if we should follow out edges or // in edges. The default is true which is useful for a functional // intepretation of a dataflow graph. - static SubtreeMatchResult isSubtreeMatch( + static SubgraphMatchResult isSubgraphMatch( + typename GraphType::NodeRef root, + const MatchNodeRef& rootCriteriaRef, + bool invertGraphTraversal = true, + bool debug = false) { + std::unordered_map< + MatchNodeRef, + typename GraphType::NodeRef> + matchedNodes; + return isSubgraphMatchInternal( + matchedNodes, root, rootCriteriaRef, invertGraphTraversal, debug); + } + + // Utility to transform a graph by looking for subgraphs that match + // a given pattern and then allow callers to mutate the graph based on + // subgraphs that are found. + // The current implementation doesn't handle any graph transformation + // itself. Callers should be responsible for all intended mutation, including + // deleting nodes in the subgraphs found by this algorithm. + // Note: if the replaceFunction lambda returns false, the entire procedure + // is aborted. This maybe useful in certain cases when we want to terminate + // the subgraph search early. 
+ // invertGraphTraversal flag: see documentation in isSubgraphMatch + static void replaceSubgraph( + GraphType& graph, + const MatchNodeRef& criteria, + const std::function& + replaceFunction, + bool invertGraphTraversal = true) { + for (auto nodeRef : graph.getMutableNodes()) { + // Make sure the node is still in the graph. + if (!graph.hasNode(nodeRef)) { + continue; + } + if (isSubgraphMatch(nodeRef, criteria, invertGraphTraversal).isMatch()) { + if (!replaceFunction(graph, nodeRef)) { + // If replaceFunction returns false, it means that we should abort + // the entire procedure. + break; + } + } + } + } + + private: + static SubgraphMatchResult isSubgraphMatchInternal( + std::unordered_map< + MatchNodeRef, + typename GraphType::NodeRef>& matchedNodes, typename GraphType::NodeRef root, const MatchNodeRef& rootCriteriaRef, bool invertGraphTraversal = true, bool debug = false) { auto rootCriteriaNode = rootCriteriaRef->data(); + + if (rootCriteriaNode.getCount() == 1) { + auto matchedNodeEntry = matchedNodes.find(rootCriteriaRef); + if (matchedNodeEntry != matchedNodes.end()) { + // If rootCriteriaRef has been matched before (without multiplicity), + // we should look up the corresponding matched node in the graph + // and verify if it is the same. + auto matchedNode = matchedNodeEntry->second; + if (matchedNode == root) { + return SubgraphMatchResult::matched(); + } else if (debug) { + std::ostringstream debugMessage; + debugMessage << "Subgraph root at " << root << " is not the same as " + << matchedNode << " which previously matched criteria " + << debugString(rootCriteriaRef); + return SubgraphMatchResult::notMatched(debugMessage.str()); + } else { + return SubgraphMatchResult::notMatched(); + } + } + } + if (!isNodeMatch(root, rootCriteriaNode.getCriteria())) { if (debug) { std::ostringstream debugMessage; - debugMessage << "Subtree root at " << root + debugMessage << "Subgraph root at " << root << " does not match criteria " << debugString(rootCriteriaRef); - return SubtreeMatchResult::notMatched(debugMessage.str()); + return SubgraphMatchResult::notMatched(debugMessage.str()); } else { - return SubtreeMatchResult::notMatched(); + return SubgraphMatchResult::notMatched(); } } if (rootCriteriaNode.isNonTerminal()) { // This is sufficient to be a match if this criteria specifies a non // terminal node. - return SubtreeMatchResult::matched(); + matchedNodes[rootCriteriaRef] = root; + return SubgraphMatchResult::matched(); } auto& edges = invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); @@ -176,7 +249,7 @@ struct SubgraphMatcher { int numChildrenCriteria = outEdges.size(); // The current algorithm implies that the ordering of the children is - // important. The children nodes will be matched with the children subtree + // important. The children nodes will be matched with the children subgraph // criteria in the given order. int currentEdgeIdx = 0; @@ -200,7 +273,8 @@ struct SubgraphMatcher { auto edge = edges[currentEdgeIdx]; auto child = invertGraphTraversal ? edge->tail() : edge->head(); - if (!isSubtreeMatch(child, childrenCriteriaRef, invertGraphTraversal) + if (!isSubgraphMatchInternal( + matchedNodes, child, childrenCriteriaRef, invertGraphTraversal) .isMatch()) { if (!isStarCount) { // If the current criteria isn't a * pattern, this indicates a @@ -213,10 +287,10 @@ struct SubgraphMatcher { childrenCriteriaRef) << ". 
We expected " << expectedCount << " matches but only found " << countMatch << "."; - return SubtreeMatchResult::notMatched( + return SubgraphMatchResult::notMatched( debugMessage.str()); } else { - return SubtreeMatchResult::notMatched(); + return SubgraphMatchResult::notMatched(); } } else { // Otherwise, we should move on to the next children criteria. @@ -236,9 +310,9 @@ struct SubgraphMatcher { << " matches for child criteria " << debugString(childrenCriteriaRef) << " but only found " << countMatch; - return SubtreeMatchResult::notMatched(debugMessage.str()); + return SubgraphMatchResult::notMatched(debugMessage.str()); } else { - return SubtreeMatchResult::notMatched(); + return SubgraphMatchResult::notMatched(); } } } @@ -247,48 +321,17 @@ struct SubgraphMatcher { // Fails because there are unmatched edges. if (debug) { std::ostringstream debugMessage; - debugMessage << "Unmatched children for subtree root at " << root + debugMessage << "Unmatched children for subgraph root at " << root << ". There are " << numEdges << " children, but only found " << currentEdgeIdx << " matches for the children criteria."; - return SubtreeMatchResult::notMatched(debugMessage.str()); + return SubgraphMatchResult::notMatched(debugMessage.str()); } else { - return SubtreeMatchResult::notMatched(); - } - } - return SubtreeMatchResult::matched(); - } - - // Utility to transform a graph by looking for subtrees that match - // a given pattern and then allow callers to mutate the graph based on - // subtrees that are found. - // The current implementation doesn't handle any graph transformation - // itself. Callers should be responsible for all intended mutation, including - // deleting nodes in the subtrees found by this algorithm. - // Note: if the replaceFunction lambda returns false, the entire procedure - // is aborted. This maybe useful in certain cases when we want to terminate - // the subtree search early. - // invertGraphTraversal flag: see documentation in isSubtreeMatch - static void replaceSubtree( - GraphType& graph, - const MatchNodeRef& criteria, - const std::function< - bool(GraphType& g, typename GraphType::NodeRef subtreeRoot)>& - replaceFunction, - bool invertGraphTraversal = true) { - for (auto nodeRef : graph.getMutableNodes()) { - // Make sure the node is still in the graph. - if (!graph.hasNode(nodeRef)) { - continue; - } - if (isSubtreeMatch(nodeRef, criteria, invertGraphTraversal).isMatch()) { - if (!replaceFunction(graph, nodeRef)) { - // If replaceFunction returns false, it means that we should abort - // the entire procedure. 
- break; - } + return SubgraphMatchResult::notMatched(); } } + matchedNodes[rootCriteriaRef] = root; + return SubgraphMatchResult::matched(); } }; diff --git a/caffe2/core/nomnigraph/tests/neural_net_test.cc b/caffe2/core/nomnigraph/tests/neural_net_test.cc index bdafce3b364cc8..34dd9840309eac 100644 --- a/caffe2/core/nomnigraph/tests/neural_net_test.cc +++ b/caffe2/core/nomnigraph/tests/neural_net_test.cc @@ -44,23 +44,23 @@ TEST(NeuralNetGraph, ReplaceGraph) { auto mg = NNMatchGraph(); // clang-format off - auto pattern = tree(mg, + auto pattern = subgraph(mg, matchNodeType(), { - operatorTree(mg, + operatorSubgraph(mg, matchNodeType(), { - tree(mg, matchNodeType(), {}, 2, true) + subgraph(mg, matchNodeType(), {}, 2, true) }), }); // clang-format on - EXPECT_FALSE(NNSubgraphMatcher::isSubtreeMatch(sum, pattern).isMatch()); + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, pattern).isMatch()); EXPECT_FALSE( - NNSubgraphMatcher::isSubtreeMatch(reluOutput, pattern).isMatch()); - EXPECT_FALSE(NNSubgraphMatcher::isSubtreeMatch(input1, pattern).isMatch()); + NNSubgraphMatcher::isSubgraphMatch(reluOutput, pattern).isMatch()); + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, pattern).isMatch()); - EXPECT_TRUE(NNSubgraphMatcher::isSubtreeMatch(relu, pattern).isMatch()); + EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, pattern).isMatch()); - NNSubgraphMatcher::replaceSubtree( + NNSubgraphMatcher::replaceSubgraph( graph, pattern, [](NNGraph& g, NNGraph::NodeRef relu) { auto sumOutput = getInputs(relu)[0]; auto sum = getProducer(sumOutput); diff --git a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc index adcc56e6027141..ced26d69beb30b 100644 --- a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc +++ b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc @@ -41,11 +41,11 @@ TestMatchGraph::NodeRef Tree( const Criteria& root, const std::vector& children = {}, int count = 1) { - return tree(graph, root, children, count, false); + return subgraph(graph, root, children, count, false); } TestMatchGraph::NodeRef NonTerminal(const Criteria& root, int count = 1) { - return tree(graph, root, {}, count, true); + return subgraph(graph, root, {}, count, true); } Criteria any() { @@ -202,11 +202,11 @@ TestGraph::NodeRef getInNode(TestGraph::NodeRef node, int index) { return node->getInEdges()[index]->tail(); } -bool isSubtreeMatch( +bool isSubgraphMatch( TestGraph::NodeRef nodeRef, const TestMatchGraph::NodeRef& criteria, bool invertGraphTraversal = true) { - return TestMatcher::isSubtreeMatch(nodeRef, criteria, invertGraphTraversal) + return TestMatcher::isSubgraphMatch(nodeRef, criteria, invertGraphTraversal) .isMatch(); } } // namespace matcher @@ -254,32 +254,32 @@ TEST(SubgraphMatcher, IsSubtreeMatch) { reset(); auto subtree = Tree(any(), {Tree(any()), Tree(any())}); - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n4, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n4, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n5, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n2, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n5, subtree, false)); reset(); subtree = Tree(Criteria("5"), {Tree(any()), Tree(any())}); - EXPECT_FALSE(isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n5, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n2, subtree, false)); + 
EXPECT_TRUE(isSubgraphMatch(n5, subtree, false)); reset(); subtree = Tree(any(), {Tree(any()), Tree(Criteria("4"))}); - EXPECT_TRUE(isSubtreeMatch(n2, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n5, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n2, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n5, subtree, false)); reset(); // Accepts non terminal node subtree = Tree(any(), {NonTerminal(any()), NonTerminal(any())}); - EXPECT_TRUE(isSubtreeMatch(n1, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n5, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n3, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n4, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n6, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n7, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n1, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n2, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n5, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n3, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n4, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n6, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n7, subtree, false)); } // Test subtree matching in which * (repeated) matching of children is allowed. @@ -304,11 +304,11 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { reset(); auto subtree = Tree(any(), {Tree(Criteria("2"))}); - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), {Tree(Criteria("2"), {}, TestMatchNode::kStarCount)}); - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); reset(); // clang-format off @@ -318,7 +318,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("4"), {}, 2), Tree(Criteria("5"), {}, 3) }); - EXPECT_TRUE(isSubtreeMatch(n1, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -328,7 +328,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("5"), {}, 4) }); // Failes because exepected 4 matches of n5 but found 3. - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -337,7 +337,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("4"), {}, 2), Tree(Criteria("5"), {}, TestMatchNode::kStarCount) }); - EXPECT_TRUE(isSubtreeMatch(n1, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -346,7 +346,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("4"), {}, 2), Tree(Criteria("5"), {}, TestMatchNode::kStarCount) }); - EXPECT_TRUE(isSubtreeMatch(n1, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -354,7 +354,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("3"), {}, TestMatchNode::kStarCount), }); // Fails because there are unmatched edges. - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -365,21 +365,192 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { }); // Fails because the count is wrong; we have 2 edges to node N4 while // the pattern expects only 1. 
- EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); // clang-format on } +TEST(SubgraphMatcher, DagMatching) { + reset(); + + // clang-format off + auto n4match = Tree(Criteria("4"), { + Tree(Criteria("5")) + }); + auto subgraph = Tree(Criteria("1"), { + Tree(Criteria("2"), { + n4match + }), + Tree(Criteria("3"), { + n4match + }), + }); + // clang-format on + + { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + auto n3 = graph.createNode("3"); + auto n4 = graph.createNode("4"); + auto n5 = graph.createNode("5"); + + graph.createEdge(n1, n2); + graph.createEdge(n1, n3); + graph.createEdge(n2, n4); + graph.createEdge(n3, n4); + graph.createEdge(n4, n5); + + /* N1 + / \ + N2 N3 + \ / + N4 + | + N5 + */ + + EXPECT_TRUE(isSubgraphMatch(n1, subgraph, false)); + } + + { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + auto n3 = graph.createNode("3"); + auto n4A = graph.createNode("4"); + auto n4B = graph.createNode("4"); + auto n5 = graph.createNode("5"); + + graph.createEdge(n1, n2); + graph.createEdge(n1, n3); + graph.createEdge(n2, n4A); + graph.createEdge(n3, n4B); + graph.createEdge(n4A, n5); + graph.createEdge(n4B, n5); + + /* N1 + / \ + N2 N3 + / \ + N4A N4B + \ / + N5 + */ + + // This should fail because n4A and n4B are not the same node. + EXPECT_FALSE(isSubgraphMatch(n1, subgraph, false)); + } +} + +TEST(SubgraphMatcher, DagMatchingMultiEdges) { + reset(); + + // clang-format off + auto n2match = Tree(Criteria("2")); + auto subgraph = Tree(Criteria("1"), { + n2match, + n2match + }); + // clang-format on + + { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + + graph.createEdge(n1, n2); + graph.createEdge(n1, n2); + + EXPECT_TRUE(isSubgraphMatch(n1, subgraph, false)); + } + + { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2A = graph.createNode("2"); + auto n2B = graph.createNode("2"); + + graph.createEdge(n1, n2A); + graph.createEdge(n1, n2B); + + EXPECT_FALSE(isSubgraphMatch(n1, subgraph, false)); + } +} + +TEST(SubgraphMatcher, DagMatchingRandomLargeGraph) { + reset(); + // clang-format off + auto n4match = Tree(any(), { + NonTerminal(any(), 1) + }); + auto subtree = Tree(any(), { + Tree(any(), { + n4match + }), + Tree(any(), { + n4match + }), + }); + // clang-format on + /* N1 + / \ + N2 N3 + \ / + N4 + | + N5 + */ + + // Look for the diamond pattern in a random large graph. + TestGraph graph; + std::vector::NodeRef> nodes; + + // Here we create a test graph and then randomly embed the above + // pattern into the graph repeatedly (numPatterns times). + // The actual number of match will be less than numPatterns because the + // embedded patterns can overlap which become unmatched subgraphs. 
+ const int numNodes = 50000; + const int numPatterns = 5000; + + for (int i = 0; i < numNodes; i++) { + auto node = graph.createNode("Node"); + nodes.emplace_back(node); + } + + TestRandom random(517); + for (int i = 0; i < numPatterns; i++) { + std::vector nodeIdx; + for (int k = 0; k < 5; k++) { + nodeIdx.emplace_back(random.nextInt() % numNodes); + } + graph.createEdge(nodes[nodeIdx[0]], nodes[nodeIdx[1]]); + graph.createEdge(nodes[nodeIdx[0]], nodes[nodeIdx[2]]); + graph.createEdge(nodes[nodeIdx[1]], nodes[nodeIdx[3]]); + graph.createEdge(nodes[nodeIdx[2]], nodes[nodeIdx[3]]); + graph.createEdge(nodes[nodeIdx[3]], nodes[nodeIdx[4]]); + } + EXPECT_EQ(graph.getEdgesCount(), 5 * numPatterns); + + int countMatch = 0; + for (auto node : graph.getMutableNodes()) { + if (isSubgraphMatch(node, subtree, false)) { + countMatch++; + } + } + EXPECT_EQ(countMatch, 1072); +} + TEST(SubgraphMatcher, IsSubtreeMatchRealistic) { reset(); auto graph = DataFlowTestGraph(); auto subtree = DataFlowTestGraphCriteria(); - EXPECT_FALSE(isSubtreeMatch(graph.opF, subtree)); - EXPECT_FALSE(isSubtreeMatch(graph.opC, subtree)); - EXPECT_FALSE(isSubtreeMatch(graph.opB, subtree)); - EXPECT_FALSE(isSubtreeMatch(graph.dataOut, subtree)); + EXPECT_FALSE(isSubgraphMatch(graph.opF, subtree)); + EXPECT_FALSE(isSubgraphMatch(graph.opC, subtree)); + EXPECT_FALSE(isSubgraphMatch(graph.opB, subtree)); + EXPECT_FALSE(isSubgraphMatch(graph.dataOut, subtree)); - EXPECT_TRUE(isSubtreeMatch(graph.opG, subtree)); + EXPECT_TRUE(isSubgraphMatch(graph.opG, subtree)); } TEST(SubgraphMatcher, ReplaceSubtreeRealistic) { @@ -387,7 +558,7 @@ TEST(SubgraphMatcher, ReplaceSubtreeRealistic) { auto graph = DataFlowTestGraph(); auto subtree = DataFlowTestGraphCriteria(); - TestMatcher::replaceSubtree( + TestMatcher::replaceSubgraph( graph.graph, subtree, [](TestGraph& g, TestGraph::NodeRef opG) { auto opFused = g.createNode("opFused"); diff --git a/caffe2/core/nomnigraph/tests/test_util.h b/caffe2/core/nomnigraph/tests/test_util.h index 2c447b556acb01..f5693c03d36a2e 100644 --- a/caffe2/core/nomnigraph/tests/test_util.h +++ b/caffe2/core/nomnigraph/tests/test_util.h @@ -34,6 +34,23 @@ struct NNEquality { } }; +// Very simple random number generator used to generate platform independent +// random test data. +class TestRandom { + public: + TestRandom(unsigned int seed) : seed_(seed){}; + + unsigned int nextInt() { + seed_ = A * seed_ + C; + return seed_; + } + + private: + static const unsigned int A = 1103515245; + static const unsigned int C = 12345; + unsigned int seed_; +}; + /** Our test graph looks like this: * +-------+ * | entry | diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index a10ce5d0b2f500..b1f31af6e33d8a 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -799,7 +799,7 @@ typedef Registry< Workspace*>* (*RegistryFunction)(); CAFFE2_API std::map* gDeviceTypeRegistry(); -struct DeviceTypeRegisterer { +struct CAFFE2_API DeviceTypeRegisterer { explicit DeviceTypeRegisterer(int32_t type, RegistryFunction func) { if (gDeviceTypeRegistry()->count(type)) { std::cerr << "Device type " << type @@ -923,7 +923,7 @@ struct StaticLinkingProtector { // specific engines that only implement a subset of the features required by // the original operator schema. // TODO(jiayq): make more feature-complete exception message. 
-class UnsupportedOperatorFeature : public std::exception { +class CAFFE2_API UnsupportedOperatorFeature : public std::exception { public: UnsupportedOperatorFeature(const string& msg) : msg_(msg) {} const char* what() const noexcept override { @@ -961,40 +961,40 @@ using PerOpEnginePrefType = CaffeMap>; // {device_type -> EnginePrefType} using GlobalEnginePrefType = CaffeMap; -void SetPerOpEnginePref(const PerOpEnginePrefType& per_op_engine_pref); -void SetGlobalEnginePref(const GlobalEnginePrefType& global_engine_pref); -void SetEnginePref( +CAFFE2_API void SetPerOpEnginePref(const PerOpEnginePrefType& per_op_engine_pref); +CAFFE2_API void SetGlobalEnginePref(const GlobalEnginePrefType& global_engine_pref); +CAFFE2_API void SetEnginePref( const PerOpEnginePrefType& per_op_engine_pref, const GlobalEnginePrefType& global_engine_pref); -void SetOpEnginePref( +CAFFE2_API void SetOpEnginePref( const std::string& op_type, const CaffeMap& op_pref); -TensorShape GetTensorShapeOfBlob(const Blob* b); +CAFFE2_API TensorShape GetTensorShapeOfBlob(const Blob* b); -TensorShapes InferBlobShapesAndTypes( +CAFFE2_API TensorShapes InferBlobShapesAndTypes( CaffeMap& blob_desc, const vector& nets); -TensorShapes InferBlobShapesAndTypesFromWorkspace( +CAFFE2_API TensorShapes InferBlobShapesAndTypesFromWorkspace( Workspace* ws, const vector& nets); -TensorShapes InferBlobShapesAndTypesFromMap( +CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap( const CaffeMap>& blob_dimensions, const vector& nets); -TensorShapes InferBlobShapesAndTypesFromMap( +CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap( const CaffeMap>& blob_dimensions, const CaffeMap& blob_types, const vector& nets); -std::map> ValidateTensorDevices( +CAFFE2_API std::map> ValidateTensorDevices( OperatorBase& op, const OperatorDef& op_def); // Get a set of registered operator names -std::set GetRegisteredOperators(); +CAFFE2_API std::set GetRegisteredOperators(); } // namespace caffe2 diff --git a/caffe2/core/operator_gradient.h b/caffe2/core/operator_gradient.h index 4072065f515dfd..f08778fd22eac2 100644 --- a/caffe2/core/operator_gradient.h +++ b/caffe2/core/operator_gradient.h @@ -14,7 +14,7 @@ namespace caffe2 { * a sparse blob, its gradient name should be written into indice_ for * the sparse indices and value_ for the values. */ -struct GradientWrapper { +struct CAFFE2_API GradientWrapper { string dense_; string indices_; string values_; @@ -33,7 +33,7 @@ struct GradientWrapper { /** * A struct that holds the gradient operators and related gradient maps. */ -struct GradientOpsMeta { +struct CAFFE2_API GradientOpsMeta { vector ops_; vector g_input_; @@ -44,7 +44,7 @@ struct GradientOpsMeta { : ops_(ops), g_input_(v) {} }; -class GradientMakerBase { +class CAFFE2_API GradientMakerBase { public: GradientMakerBase( const OperatorDef& def, @@ -256,7 +256,7 @@ class GradientMakerBase { * that the gradient computation should not flow through it at all, and throws * an error if it is called. */ -class NoGradient : public GradientMakerBase { +class CAFFE2_API NoGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; vector GetGradientDefs() override { return vector(); @@ -321,7 +321,7 @@ CAFFE_DECLARE_REGISTRY( /** * @brief Gets the GradientOpsMeta for the given operator def. 
*/ -GradientOpsMeta GetGradientForOp( +CAFFE2_API GradientOpsMeta GetGradientForOp( const OperatorDef& def, const vector& g_output); diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index f142c182dfe748..86c6827e3039a1 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -40,7 +40,7 @@ struct CAFFE2_API ExportedStatValue { using ExportedStatList = std::vector; using ExportedStatMap = std::unordered_map; -ExportedStatMap toMap(const ExportedStatList& stats); +CAFFE2_API ExportedStatMap toMap(const ExportedStatList& stats); /** * @brief Holds a map of atomic counters keyed by name. diff --git a/caffe2/core/transform.h b/caffe2/core/transform.h index 63f7e26467332a..9c10ca58c0d9b1 100644 --- a/caffe2/core/transform.h +++ b/caffe2/core/transform.h @@ -31,7 +31,7 @@ namespace caffe2 { * own transform, write your implementations for PatternRule, ValidatorRule, and * ReplaceRule. */ -class Transform { +class CAFFE2_API Transform { public: Transform() {} @@ -148,7 +148,7 @@ class Transform { }; // Creates a Transform based on a key, which should be defined in registry. -unique_ptr CreateTransform(string key); +CAFFE2_API unique_ptr CreateTransform(string key); CAFFE_DECLARE_REGISTRY(TransformRegistry, Transform); #define REGISTER_TRANSFORM(name, ...) \ @@ -156,14 +156,14 @@ CAFFE_DECLARE_REGISTRY(TransformRegistry, Transform); // Create a Transform object from registry, // and immediately apply it to a Netdef. -NetDef ApplyTransform(const string& key, const NetDef& netdef); +CAFFE2_API NetDef ApplyTransform(const string& key, const NetDef& netdef); // Create a Transform object from registry, apply it to a NetDef. // Will only return the transformed net if it is faster than the old net. // This will run the init net first, will run the two nets warmup_runs times. // Then, we will take the average time of main_runs runs, and only keep the // transformed net if it is faster by a factor of improvement_threshold. -NetDef ApplyTransformIfFaster( +CAFFE2_API NetDef ApplyTransformIfFaster( const string& key, const NetDef& netdef, const NetDef& init_netdef, diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 6e718a7d75a276..64642ca7ea41e5 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -674,25 +674,54 @@ Caffe2Ops Caffe2Backend::CreateGemm( auto trans_a = onnx_node->attributes.get("transA", 0L); auto trans_b = onnx_node->attributes.get("transB", 0L); - auto broadcast = onnx_node->attributes.get("broadcast", 0L); + // Support broadcast by default when opset_version > 6. + auto broadcast = + onnx_node->attributes.get("broadcast", + (ctx.opset_version() > 6) ? 1L : 0L); + + // If the c's shape information is available and c is a 1d tensor(except + // c is a scalar), use FC aggressively. + auto check_fc = [&]() -> bool { + const auto input_c_vi_iter = ctx.value_infos().find(node.input(2)); + + if (input_c_vi_iter == ctx.value_infos().end()) { + return false; + } - bool use_fc = false; - if ((!trans_a) && trans_b) { - if (broadcast) { - use_fc = true; - } else { - const auto input_c_vi_iter = ctx.value_infos().find(node.input(2)); - if (input_c_vi_iter != ctx.value_infos().end() && - input_c_vi_iter->second.type().tensor_type().shape().dim_size() == - 1) { - use_fc = true; + const auto input_c_shape = + input_c_vi_iter->second.type().tensor_type().shape(); + + if (input_c_shape.dim_size() != 1) { + return false; + } + + // c is a scalar. 
+ if (input_c_shape.dim(0).dim_value() == 1) { + const auto input_b_vi_iter = ctx.value_infos().find(node.input(1)); + + // If the b's shape is not available, skip FC. + if (input_b_vi_iter == ctx.value_infos().end()) { + return false; + } + const auto input_b_shape = + input_b_vi_iter->second.type().tensor_type().shape(); + int input_b_last_dim_index = (trans_b) ? 0 : 1; + // If b's last dim is not 1, skip FC. + if (input_b_shape.dim(input_b_last_dim_index).dim_value() != 1) { + return false; + } } - } - if (use_fc) { + return true; + }; + + if (!trans_a && broadcast && check_fc()) { auto* c2_op = ret.ops.Add(); - BuildOperator(c2_op, "FC", {input_a, input_b, input_c}, {output}); + if (trans_b) { + BuildOperator(c2_op, "FC", {input_a, input_b, input_c}, {output}); + } else { + BuildOperator(c2_op, "FCTransposed", {input_a, input_b, input_c}, {output}); + } } else { auto ab = dummy_->NewDummyName(); caffe2::Argument arg_trans_a;
diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index 681ab5b30d10b0..6aa5f271cc5c2a 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -11,7 +11,7 @@ #include #include -constexpr int kKnownOpsetVersion = 6; +constexpr int kKnownOpsetVersion = 7; namespace caffe2 { namespace onnx {
diff --git a/caffe2/onnx/onnx_exporter.h b/caffe2/onnx/onnx_exporter.h index 51f62df0eb2212..578edc0ee17e2d 100644 --- a/caffe2/onnx/onnx_exporter.h +++ b/caffe2/onnx/onnx_exporter.h @@ -25,11 +25,11 @@ using ConvertedResult = // Rewrite Caffe2 nets into SSA forms. Notice that we will preserve the external // output names for predict net. -std::unordered_map SsaRewrite( +CAFFE2_API std::unordered_map SsaRewrite( caffe2::NetDef* init_net, caffe2::NetDef* pred_net); -class OnnxExporter { +class CAFFE2_API OnnxExporter { using SpecialOpConverter = ConvertedResult (OnnxExporter::*)( const caffe2::OperatorDef&, const std::unordered_map&);
diff --git a/caffe2/operators/bisect_percentile_op.cc b/caffe2/operators/bisect_percentile_op.cc new file mode 100644 index 00000000000000..cec3cf7bc1fe34 --- /dev/null +++ b/caffe2/operators/bisect_percentile_op.cc @@ -0,0 +1,92 @@ +#include "caffe2/operators/bisect_percentile_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(BisectPercentile, BisectPercentileOp); +OPERATOR_SCHEMA(BisectPercentile) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( + This operator maps raw feature values to their percentile representations, + using bisection, for one or more features. + + The input is a batch of feature values, with size (batch_size, + num_feature), where num_feature = F (F >= 1). + + For each feature, we also need additional information regarding the feature + value distribution. + Several vectors hold the raw-value-to-percentile mapping information and + are passed as arguments (context): + 1. feature raw values (R) + 2. feature percentile mapping (P) + 3. feature percentile lower bound (L) + 4. feature percentile upper bound (U) + + A toy example: + Suppose the sampled data distribution is as follows: + 1, 1, 2, 2, 2, 2, 2, 2, 3, 4 + We have the mapping vectors as follows: + R = [1, 2, 3, 4] + P = [0.15, 0.55, 0.9, 1.0] + L = [0.1, 0.3, 0.9, 1.0] + U = [0.2, 0.8, 0.9, 1.0] + where P is computed as (L + U) / 2. + + For a given list of feature values, X = [x_0, x_1, ..., x_i, ...], for each + feature value (x_i) we first apply bisection to find the right index (t), + such that R[t] <= x_i < R[t+1]. + If x_i = R[t], P[t] is returned; + otherwise, the percentile is interpolated between U[t] and L[t+1] based on + where x_i falls within (R[t], R[t+1]). + In the toy example, x_i = 2.5 gives t = 1, w = (2.5 - 2) / (3 - 2) = 0.5, + and the result is (1 - w) * U[1] + w * L[2] = 0.5 * 0.8 + 0.5 * 0.9 = 0.85.
+ + As there are F features (F >= 1), we concatenate all the R_f, P_f, L_f, and + U_f for each feature f and use an additional lengths argument to keep track of + the number of points in each feature's raw-value-to-percentile mapping. + For example, suppose there are two features: + R_1 = [0.1, 0.4, 0.5]; + R_2 = [0.3, 1.2]; + We will build R = [0.1, 0.4, 0.5, 0.3, 1.2]; besides, we have + lengths = [3, 2] + to indicate the boundaries of the percentile information. + +)DOC") + .Arg( + "percentile_raw", + "1D tensor, which is the concatenation of all sorted raw feature " + "values for all features.") + .Arg( + "percentile_mapping", + "1D tensor. There is a one-to-one mapping between percentile_mapping " + "and percentile_raw such that each element in percentile_mapping " + "corresponds to the percentile value of the corresponding raw feature " + "value.") + .Arg( + "percentile_lower", + "1D tensor. There is a one-to-one mapping between percentile_lower " + "and percentile_raw such that each element in percentile_lower " + "corresponds to the percentile lower bound of the corresponding raw " + "feature value.") + .Arg( + "percentile_upper", + "1D tensor. There is a one-to-one mapping between percentile_upper " + "and percentile_raw such that each element in percentile_upper " + "corresponds to the percentile upper bound of the corresponding raw " + "feature value.") + .Arg( + "lengths", + "1D tensor of size F, where F is the number of features. Each element " + "gives the number of points in the raw-value-to-percentile mapping of " + "the corresponding feature, i.e., the segment boundaries within " + "percentile_raw, percentile_mapping, percentile_lower, and " + "percentile_upper.") + .Input( + 0, + "raw_values", + "Input 2D tensor of floats of size (N, D), where N is the batch size " + "and D is the feature dimension.") + .Output( + 0, + "percentile", + "2D tensor of output with the same dimensions as the input raw_values."); + +NO_GRADIENT(BisectPercentile); + +} // namespace caffe2
diff --git a/caffe2/operators/bisect_percentile_op.h b/caffe2/operators/bisect_percentile_op.h new file mode 100644 index 00000000000000..98d347cc73b10a --- /dev/null +++ b/caffe2/operators/bisect_percentile_op.h @@ -0,0 +1,167 @@ +#ifndef CAFFE2_OPERATORS_BISECT_PERCENTILE_OP_H_ +#define CAFFE2_OPERATORS_BISECT_PERCENTILE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class BisectPercentileOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + BisectPercentileOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + pct_raw_(OperatorBase::GetRepeatedArgument( + "percentile_raw", + vector{})), + pct_mapping_(OperatorBase::GetRepeatedArgument( + "percentile_mapping", + vector{})), + pct_lower_(OperatorBase::GetRepeatedArgument( + "percentile_lower", + vector{})), + pct_upper_(OperatorBase::GetRepeatedArgument( + "percentile_upper", + vector{})), + pct_lens_( + OperatorBase::GetRepeatedArgument("lengths", vector{})) { + CAFFE_ENFORCE_EQ( + pct_raw_.size(), + pct_mapping_.size(), + "Feature (raw) data and percentile value dimension should match."); + CAFFE_ENFORCE_EQ( + pct_raw_.size(), + pct_lower_.size(), + "Feature (raw) data and lower bound dimension should match."); + CAFFE_ENFORCE_EQ( + pct_raw_.size(), + pct_upper_.size(), + "Feature (raw) data and upper bound dimension should match."); + n_features = pct_lens_.size(); + index.resize(n_features + 1); + index[0] = 0; + for (int i = 1; i <=
n_features; ++i) { + index[i] = index[i - 1] + pct_lens_[i - 1]; + } + CAFFE_ENFORCE_EQ( + index[n_features], // The sum of lengths_data + pct_raw_.size(), + "Sum of lengths should be equal to the total number of percentile " + "mapping data samples"); + } + + bool RunOnDevice() override { + // Input + const auto& raw = Input(RAW); + CAFFE_ENFORCE_EQ(raw.ndim(), 2); + const auto batch_size = raw.dim(0); + const auto num_features = raw.dim(1); + CAFFE_ENFORCE_EQ(num_features, pct_lens_.size()); + const float* raw_data = raw.template data(); + + // Output + auto* pct = Output(PCT); + pct->ResizeLike(raw); + float* pct_output = pct->template mutable_data(); + + // Compute percentile for each raw feature value + int feature_start_index = 0; + int feature_length = 0; + int cur_index = 0; + + for (int i = 0; i < num_features; ++i) { + cur_index = i; + feature_start_index = index[i]; + feature_length = pct_lens_[i]; + for (int j = 0; j < batch_size; ++j) { + pct_output[cur_index] = compute_percentile( + pct_raw_.begin() + feature_start_index, + pct_mapping_.begin() + feature_start_index, + pct_lower_.begin() + feature_start_index, + pct_upper_.begin() + feature_start_index, + feature_length, + raw_data[cur_index]); + cur_index += num_features; + } + } + return true; + } + + protected: + INPUT_TAGS(RAW); + OUTPUT_TAGS(PCT); + + private: + int n_features; + vector pct_raw_; + vector pct_mapping_; + vector pct_lower_; + vector pct_upper_; + vector pct_lens_; + vector index; + vector> fast_pct; + + const float kEPSILON = 1e-10; + + int binary_search( + const std::vector::iterator& data, + int lo, + int hi, + float val) { + int mid; + bool low_cond, high_cond; + + while (lo < hi) { + mid = (lo + hi) >> 1; + low_cond = (data[mid] <= val); + high_cond = (val < data[mid + 1]); + if (low_cond && high_cond) { + return mid; + } else if (!low_cond) { + hi = mid - 1; + } else { + lo = mid + 1; + } + } + return lo; + } + + float compute_percentile( + const std::vector::iterator& pct_raw_it, + const std::vector::iterator& pct_mapping_it, + const std::vector::iterator& pct_lower_it, + const std::vector::iterator& pct_upper_it, + const int size, + const float val) { + // Corner cases where no interpolation is needed. 
+ if (val < pct_raw_it[0]) { + return 0.; + } + if (val > pct_raw_it[size - 1]) { + return 1.; + } + + float result; + // Interpolation by binary search + const auto k = binary_search(pct_raw_it, 0, size - 1, val); + + if (pct_raw_it[k] == val) { + // Exact match + result = pct_mapping_it[k]; + } else { + // interpolation + float w = (val - pct_raw_it[k]) / + (pct_raw_it[k + 1] - pct_raw_it[k] + kEPSILON); + result = (1 - w) * pct_upper_it[k] + w * pct_lower_it[k + 1]; + } + return result; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_BISECT_PERCENTILE_OP_H_ diff --git a/caffe2/operators/ensure_cpu_output_op.h b/caffe2/operators/ensure_cpu_output_op.h index 041a5be002421d..08207644f7f094 100644 --- a/caffe2/operators/ensure_cpu_output_op.h +++ b/caffe2/operators/ensure_cpu_output_op.h @@ -40,7 +40,7 @@ class EnsureCPUOutputOp : public Operator { input.size(), input.raw_data(), output->raw_mutable_data(input.meta())); - + context_.FinishDeviceComputation(); return true; } }; diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index d66bf9e5635f56..1d6e28c9b3abe3 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -45,7 +45,7 @@ class ConstTensorView { // anchors: predefined anchors, size(A, 4) // Return: all_anchors_vec: (H * W, A * 4) // Need to reshape to (H * W * A, 4) to match the format in python -ERMatXf ComputeAllAnchors( +CAFFE2_API ERMatXf ComputeAllAnchors( const TensorCPU& anchors, int height, int width, diff --git a/caffe2/opt/backend_cutting.h b/caffe2/opt/backend_cutting.h index 0e2bf7c7f7de60..cc3ed14c3dc833 100644 --- a/caffe2/opt/backend_cutting.h +++ b/caffe2/opt/backend_cutting.h @@ -9,7 +9,7 @@ namespace caffe2 { namespace opt { -caffe2::NetDef OptimizeForBackend( +CAFFE2_API caffe2::NetDef OptimizeForBackend( caffe2::NetDef& net, std::function supports, std::function transform_func); diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index 5a1b7d01d84cfc..ec90507664fa95 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -12,7 +12,7 @@ namespace caffe2 { -class Caffe2Annotation : public nom::repr::Annotation { +class CAFFE2_API Caffe2Annotation : public nom::repr::Annotation { public: Caffe2Annotation() : Annotation(AnnotationKind::Caffe2) {} Caffe2Annotation(std::string device) @@ -57,23 +57,23 @@ class Caffe2Annotation : public nom::repr::Annotation { int DeviceType = caffe2::DeviceType::CPU; }; -nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); +CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); -caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); +CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); // Pass in an oldNet to copy all the attributes of that network. // Be warned that transformations that modify the graph's inputs or outputs // are not reflected in changes to external_input or external_output. -caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&, const caffe2::NetDef& oldNet); +CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&, const caffe2::NetDef& oldNet); // Use these functions instead of the registry directly. 
-std::unique_ptr convertToNeuralNetOperator( +CAFFE2_API std::unique_ptr convertToNeuralNetOperator( const caffe2::OperatorDef& op); -caffe2::OperatorDef convertToOperatorDef( +CAFFE2_API caffe2::OperatorDef convertToOperatorDef( const nom::repr::NNGraph::NodeRef& instrNode); -class Converter { +class CAFFE2_API Converter { public: explicit Converter() {} virtual std::unique_ptr diff --git a/caffe2/opt/device.h b/caffe2/opt/device.h index eeb16469a87ebe..daa634de0563fa 100644 --- a/caffe2/opt/device.h +++ b/caffe2/opt/device.h @@ -1,9 +1,10 @@ +#include "caffe2/core/common.h" #include "nomnigraph/Representations/NeuralNet.h" namespace caffe2 { namespace opt { -void insertCopies( +CAFFE2_API void insertCopies( nom::repr::NNModule* nn, std::function supported, std::function copyToFn, diff --git a/caffe2/opt/fusion.h b/caffe2/opt/fusion.h index 67b2cb7bcaf795..f8cd4b469be5b1 100644 --- a/caffe2/opt/fusion.h +++ b/caffe2/opt/fusion.h @@ -25,7 +25,7 @@ namespace opt { using namespace nom; -void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws); +CAFFE2_API void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws); // Generic activation fusion helper. // @@ -33,11 +33,11 @@ void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws); // \tparam ActivationT The activation to be fused. // \param nn Neural network module to be modified in place // \param should_fuse Given a conv op, check whether we want to fuse it with -// subsequent relu or not +// subsequent relu or not // \param postprocess Functor to postprocess the conv node, // attaching additional attributes if necessary template -void fuseActivation( +CAFFE2_API void fuseActivation( repr::NNModule* nn, std::function should_fuse, std::function postprocess) { diff --git a/caffe2/opt/mobile.h b/caffe2/opt/mobile.h index 1bc9a32e2e51ea..78e98763a32ea5 100644 --- a/caffe2/opt/mobile.h +++ b/caffe2/opt/mobile.h @@ -1,13 +1,14 @@ #ifndef CAFFE2_OPT_MOBILE_H_ #define CAFFE2_OPT_MOBILE_H_ +#include "caffe2/core/common.h" #include "nomnigraph/Representations/NeuralNet.h" namespace caffe2 { namespace opt { -void addNNPACK(nom::repr::NNModule* nn, bool low_memory = false); -void fuseNNPACKConvRelu(nom::repr::NNModule* nn); +CAFFE2_API void addNNPACK(nom::repr::NNModule* nn, bool low_memory = false); +CAFFE2_API void fuseNNPACKConvRelu(nom::repr::NNModule* nn); } // namespace opt } // namespace caffe2 diff --git a/caffe2/opt/onnx_convert.h b/caffe2/opt/onnx_convert.h index 42a9c95aba471c..b21e0da9920a0b 100644 --- a/caffe2/opt/onnx_convert.h +++ b/caffe2/opt/onnx_convert.h @@ -1,4 +1,4 @@ -class OnnxAnnotation : public nom::repr::Annotation { +class CAFFE2_API OnnxAnnotation : public nom::repr::Annotation { public: OnnxAnnotation() : Annotation(AnnotationKind::Onnx) {} OnnxAnnotation(std::string device) @@ -10,11 +10,11 @@ class OnnxAnnotation : public nom::repr::Annotation { void setOperatorDef(caffe2::OperatorDef* opDef) { OpDef = opDef; } - const caffe2::OperatorDef* getOperatorDef() const { + const caffe2::OperatorDef* getOperatorDef() const { assert(OpDef && "OperatorDef was never set. Use OnnxAnnotation::setOperatorDef."); return OpDef; } - caffe2::OperatorDef* getMutableOperatorDef() { + caffe2::OperatorDef* getMutableOperatorDef() { assert(OpDef && "OperatorDef was never set. 
Use OnnxAnnotation::setOperatorDef."); return OpDef; } @@ -28,9 +28,8 @@ class OnnxAnnotation : public nom::repr::Annotation { caffe2::OperatorDef* OpDef = nullptr; }; -nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); +CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); -caffe2::NetDef convertToOnnxProto(nom::repr::NNModule&); - -std::unique_ptr convertToOperatorDef(caffe2::OperatorDef op); +CAFFE2_API caffe2::NetDef convertToOnnxProto(nom::repr::NNModule&); +CAFFE2_API std::unique_ptr convertToOperatorDef(caffe2::OperatorDef op); diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h index 197a026e3a8b9a..9ac80c799f8f4d 100644 --- a/caffe2/opt/onnxifi_transformer.h +++ b/caffe2/opt/onnxifi_transformer.h @@ -18,7 +18,7 @@ namespace onnx { class OnnxExporter; } -class OnnxifiTransformer { +class CAFFE2_API OnnxifiTransformer { public: explicit OnnxifiTransformer(bool debug); diff --git a/caffe2/opt/optimize_ideep.h b/caffe2/opt/optimize_ideep.h index 24635785336e57..edfd1fac027643 100644 --- a/caffe2/opt/optimize_ideep.h +++ b/caffe2/opt/optimize_ideep.h @@ -8,7 +8,7 @@ namespace caffe2 { namespace opt { -void OptimizeForIdeep( +CAFFE2_API void OptimizeForIdeep( nom::repr::NNModule* nn, caffe2::Workspace* ws, bool training_mode = false); diff --git a/caffe2/opt/optimizer.h b/caffe2/opt/optimizer.h index e0756d16874649..a83232e0843d82 100644 --- a/caffe2/opt/optimizer.h +++ b/caffe2/opt/optimizer.h @@ -8,8 +8,8 @@ namespace caffe2 { namespace opt { -NetDef optimize(NetDef net, Workspace* ws, int level = 1); -NetDef optimize(NetDef net, int level = 1); +CAFFE2_API NetDef optimize(NetDef net, Workspace* ws, int level = 1); +CAFFE2_API NetDef optimize(NetDef net, int level = 1); } // namespace opt } // namespace caffe2 diff --git a/caffe2/opt/passes.h b/caffe2/opt/passes.h index 585741664ca6fe..a0cda390ca83fd 100644 --- a/caffe2/opt/passes.h +++ b/caffe2/opt/passes.h @@ -21,7 +21,7 @@ namespace caffe2 { * use a different registry and inherit from WorkspaceOptimizationPass. 
*/ -class OptimizationPass { +class CAFFE2_API OptimizationPass { public: OptimizationPass(NNModule* nn) : nn_(nn) {} virtual void run() = 0; @@ -31,7 +31,7 @@ class OptimizationPass { NNModule* nn_; }; -class WorkspaceOptimizationPass : public OptimizationPass { +class CAFFE2_API WorkspaceOptimizationPass : public OptimizationPass { public: WorkspaceOptimizationPass(NNModule* nn, Workspace* ws) : OptimizationPass(nn), ws_(ws) {} virtual ~WorkspaceOptimizationPass(){} diff --git a/caffe2/opt/sink.h b/caffe2/opt/sink.h index 37ad523e5f0b65..53f082c47598f6 100644 --- a/caffe2/opt/sink.h +++ b/caffe2/opt/sink.h @@ -8,7 +8,7 @@ namespace caffe2 { namespace opt { -void sinkMaxPool(nom::repr::NNModule* nn); +CAFFE2_API void sinkMaxPool(nom::repr::NNModule* nn); } // namespace opt } // namespace caffe2 diff --git a/caffe2/python/_import_c_extension.py b/caffe2/python/_import_c_extension.py index ba2cbe1677c8b1..aca9e52af29293 100644 --- a/caffe2/python/_import_c_extension.py +++ b/caffe2/python/_import_c_extension.py @@ -19,7 +19,9 @@ except ImportError as gpu_e: logging.info('Failed to import cuda module: {}'.format(gpu_e)) try: - from caffe2.python.caffe2_pybind11_state_hip import * # noqa + RTLD_LAZY = 1 + with extension_loader.DlopenGuard(RTLD_LAZY): + from caffe2.python.caffe2_pybind11_state_hip import * # noqa if num_hip_devices(): has_hip_support = True logging.info('This caffe2 python run has AMD GPU support!') diff --git a/caffe2/python/extension_loader.py b/caffe2/python/extension_loader.py index fe85d53680eaa5..10ac74ba9fd2a2 100644 --- a/caffe2/python/extension_loader.py +++ b/caffe2/python/extension_loader.py @@ -14,10 +14,10 @@ @contextlib.contextmanager -def DlopenGuard(): +def DlopenGuard(extra_flags=ctypes.RTLD_GLOBAL): if _set_global_flags: old_flags = sys.getdlopenflags() - sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL) + sys.setdlopenflags(old_flags | extra_flags) yield if _set_global_flags: sys.setdlopenflags(old_flags) diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index e526d74f73921a..8ff58a68ce1083 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -150,7 +150,7 @@ def test_gemm(self): 'Gemm', ['A', 'B', 'C'], ["Y"], - transA=True) + transA=1) output = c2.run_node(node_def, [A, B, C]) np.testing.assert_almost_equal( output["Y"], @@ -164,12 +164,12 @@ def test_gemm(self): 'Gemm', ['A', 'B', 'C'], ["Y"], - transB=True) + transB=1) output = c2.run_node(node_def, [A, B, C]) np.testing.assert_almost_equal( output["Y"], np.dot(A, np.transpose(B)) + C) - # revert A + # revert B B = np.transpose(B) # scale @@ -186,27 +186,121 @@ def test_gemm(self): output["Y"], alpha * np.dot(A, B) + beta * C) - # broadcast + # setup broadcastable C C = np.random.randn(4).astype(np.float32) + + # broadcast for opset7 node_def = make_node( 'Gemm', ['A', 'B', 'C'], ["Y"], alpha=alpha, beta=beta) - output = c2.run_node(node_def, [A, B, C]) + output = c2.run_node(node_def, [A, B, C], opset_version=7) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, B) + beta * C) + # broadcast for opset3 and 6 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=alpha, + beta=beta, + broadcast=1) + output = c2.run_node(node_def, [A, B, C], opset_version=6) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, B) + beta * C) + + # transB + B = np.transpose(B) + + # transB and broadcast for opset7 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + 
alpha=alpha, + beta=beta, + transB=1) + output = c2.run_node(node_def, [A, B, C], opset_version=7) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, np.transpose(B)) + beta * C) + # transB and broadcast for opset3 and 6 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=alpha, + beta=beta, + broadcast=1, + transB=1) + output = c2.run_node(node_def, [A, B, C], opset_version=6) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, np.transpose(B)) + beta * C) + + # revert B + B = np.transpose(B) + # set a scalar to C + C = np.random.randn(1).astype(np.float32) + + # scalar broadcast for opset7 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=alpha, + beta=beta) + output = c2.run_node(node_def, [A, B, C], opset_version=7) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, B) + beta * C) + # scalar broadcast for opset3 and 6 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=alpha, + beta=beta, + broadcast=1) + output = c2.run_node(node_def, [A, B, C], opset_version=6) np.testing.assert_almost_equal( output["Y"], alpha * np.dot(A, B) + beta * C) def test_gemm_conversion(self): node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=2., + beta=3.) + node_def_broadcast = make_node( 'Gemm', ['A', 'B', 'C'], ["Y"], alpha=2., beta=3., - transB=True) + broadcast=1) + node_def_transpose_b = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=2., + beta=3., + transB=1) + + node_def_transpose_b_broadcast = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=2., + beta=3., + transB=1, + broadcast=1) backend = C.Caffe2Backend() @@ -220,10 +314,48 @@ def test_gemm_conversion(self): op_names.append(op.type) self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) - # with shape info (that indicates C is 1D), gemm will be - # converted to FC + # opset7 + # If C is a 1d tensor, gemm will be converted to FC/FCTransposed + _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) + _, op_strs = backend.convert_node(node_def.SerializeToString( - ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()]) + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FCTransposed']) + + # opset6 without broadcast(C should match A*B's dim) + # The gemm will be converted to matmul + add, since the FC requires c + # to be 1d tensor. 
+ _, op_strs = backend.convert_node(node_def.SerializeToString( + ), [make_tensor_value_info("A", onnx.TensorProto.FLOAT, (3,2)).SerializeToString(), + make_tensor_value_info("B", onnx.TensorProto.FLOAT, (2,3)).SerializeToString(), + make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,3)).SerializeToString()], + 6) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) + + # opset6 with broadcast + # If C is a 1d tensor, gemm will be converted to FC/FCTransposed + _, op_strs = backend.convert_node(node_def_transpose_b_broadcast.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], + 6) op_names = [] for s in op_strs: op = caffe2_pb2.OperatorDef() @@ -231,21 +363,62 @@ def test_gemm_conversion(self): op_names.append(op.type) self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) - # or with broadcast, gemm will be converted to fc - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - transB=True, - broadcast=1) + _, op_strs = backend.convert_node(node_def_broadcast.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], + 6) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FCTransposed']) + + # opset7 + # If C is a scalar and B's last dim is 1, gemm will be converted to FC/FCTransposed + _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( + ), [make_tensor_value_info("B", onnx.TensorProto.FLOAT, (1,2)).SerializeToString(), + make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) - _, op_strs = backend.convert_node(node_def.SerializeToString()) + _, op_strs = backend.convert_node(node_def.SerializeToString( + ), [make_tensor_value_info("B", onnx.TensorProto.FLOAT, (2,1)).SerializeToString(), + make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FCTransposed']) + # If C is a scalar and B's last dim is not 1, gemm will be converted + # to matmul + add. + _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( + ), [make_tensor_value_info("B", onnx.TensorProto.FLOAT, (2,2)).SerializeToString(), + make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], + 7) op_names = [] for s in op_strs: op = caffe2_pb2.OperatorDef() op.ParseFromString(s) op_names.append(op.type) - self.assertEqual(op_names, ['FC']) + self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) + # If C is a scalar and B's shape info is not available, + # gemm will be converted to matmul + add. 
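For context on the Gemm conversion cases in this test: ONNX Gemm computes Y = alpha * A' * B' + beta * C (with A and B optionally transposed), while Caffe2's FC/FCTransposed expect a 1-D bias, so the converter only emits FC (transB=1) or FCTransposed (untransposed B) when C is known to be 1-D, or a scalar that is effectively 1-D because the matching dimension of B is 1; otherwise it falls back to MatMul plus Add. A small NumPy sketch of the algebra the FC path relies on (illustrative; exactly where the converter places its Scale ops is its own business):

    import numpy as np

    A = np.random.randn(3, 2).astype(np.float32)
    B = np.random.randn(4, 2).astype(np.float32)  # (N, K) weight layout, i.e. Gemm with transB=1
    C = np.random.randn(4).astype(np.float32)     # 1-D bias: the case that maps to FC
    alpha, beta = 2.0, 3.0

    gemm_y = alpha * np.dot(A, B.T) + beta * C    # ONNX Gemm semantics for this attribute set
    fc_y = np.dot(alpha * A, B.T) + beta * C      # FC after folding alpha/beta into its inputs
    np.testing.assert_allclose(gemm_y, fc_y, rtol=1e-5)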
+ _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) def test_tensor_filling_ops(self): for dtype in [ diff --git a/caffe2/python/operator_test/bisect_percentile_op_test.py b/caffe2/python/operator_test/bisect_percentile_op_test.py new file mode 100644 index 00000000000000..77faeaeeb608c4 --- /dev/null +++ b/caffe2/python/operator_test/bisect_percentile_op_test.py @@ -0,0 +1,182 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import hypothesis.strategies as st + +from caffe2.python import core, workspace +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu + +import bisect +import numpy as np + + +class TestBisectPercentileOp(hu.HypothesisTestCase): + def compare_reference( + self, + raw_data, + pct_raw_data, + pct_mapping, + pct_upper, + pct_lower, + lengths, + ): + def bisect_percentile_op_ref( + raw_data, + pct_raw_data, + pct_mapping, + pct_lower, + pct_upper, + lengths + ): + results = np.zeros_like(raw_data) + indices = [0] + for j in range(len(lengths)): + indices.append(indices[j] + lengths[j]) + for i in range(len(raw_data)): + for j in range(len(raw_data[0])): + start = indices[j] + end = indices[j + 1] + val = raw_data[i][j] + pct_raw_data_i = pct_raw_data[start:end] + pct_lower_i = pct_lower[start:end] + pct_upper_i = pct_upper[start:end] + pct_mapping_i = pct_mapping[start:end] + + # Corner cases + if val < pct_raw_data_i[0]: + results[i][j] = 0 + continue + if val > pct_raw_data_i[-1]: + results[i][j] = 1. 
+ continue + + # interpolation + k = bisect.bisect_left(pct_raw_data_i, val) + if pct_raw_data_i[k] == val: + results[i][j] = pct_mapping_i[k] + else: + k = k - 1 + slope = ((pct_lower_i[k + 1] - pct_upper_i[k]) + / (pct_raw_data_i[k + 1] - pct_raw_data_i[k])) + results[i][j] = pct_upper_i[k] + \ + slope * (val - pct_raw_data_i[k]) + + return results + + workspace.ResetWorkspace() + workspace.FeedBlob("raw_data", raw_data) + + op = core.CreateOperator( + "BisectPercentile", + ["raw_data"], + ["pct_output"], + percentile_raw=pct_raw_data, + percentile_mapping=pct_mapping, + percentile_lower=pct_lower, + percentile_upper=pct_upper, + lengths=lengths + ) + workspace.RunOperatorOnce(op) + + expected_output = bisect_percentile_op_ref( + raw_data, + pct_raw_data, + pct_mapping, + pct_lower, + pct_upper, + lengths + ) + output = workspace.blobs['pct_output'] + np.testing.assert_array_almost_equal(output, expected_output) + + def test_bisect_percentil_op_simple(self): + raw_data = np.array([ + [1, 1], + [2, 2], + [3, 3], + [3, 1], + [9, 10], + [1.5, 5], + [1.32, 2.4], + [2.9, 5.7], + [-1, -1], + [3, 7] + ], dtype=np.float32) + pct_raw_data = np.array([1, 2, 3, 2, 7], dtype=np.float32) + pct_lower = np.array([0.1, 0.2, 0.9, 0.1, 0.5], dtype=np.float32) + pct_upper = np.array([0.1, 0.8, 1.0, 0.4, 1.0], dtype=np.float32) + pct_mapping = np.array([0.1, 0.5, 0.95, 0.25, 0.75], dtype=np.float32) + lengths = np.array([3, 2], dtype=np.int32) + self.compare_reference( + raw_data, pct_raw_data, pct_mapping, pct_lower, pct_upper, lengths) + + @given( + N=st.integers(min_value=20, max_value=100), + lengths=st.lists( + elements=st.integers(min_value=2, max_value=10), + min_size=2, + max_size=5, + ), + max_value=st.integers(min_value=100, max_value=1000), + discrete=st.booleans(), + p=st.floats(min_value=0, max_value=0.9), + **hu.gcs_cpu_only + ) + def test_bisect_percentil_op_large( + self, N, lengths, max_value, discrete, p, gc, dc + ): + lengths = np.array(lengths, dtype=np.int32) + D = len(lengths) + + if discrete: + raw_data = np.random.randint(0, max_value, size=(N, D)) + else: + raw_data = np.random.randn(N, D) + + # To generate valid pct_lower and pct_upper + pct_lower = [] + pct_upper = [] + pct_raw_data = [] + for i in range(D): + pct_lower_val = 0. + pct_upper_val = 0. + pct_lower_cur = [] + pct_upper_cur = [] + # There is no duplicated values in pct_raw_data + if discrete: + pct_raw_data_cur = np.random.choice( + np.arange(max_value), size=lengths[i], replace=False) + else: + pct_raw_data_cur = np.random.randn(lengths[i]) + while len(set(pct_raw_data_cur)) < lengths[i]: + pct_raw_data_cur = np.random.randn(lengths[i]) + pct_raw_data_cur = np.sort(pct_raw_data_cur) + for _ in range(lengths[i]): + pct_lower_val = pct_upper_val + 0.01 + pct_lower_cur.append(pct_lower_val) + pct_upper_val = pct_lower_val + \ + 0.01 * np.random.randint(1, 20) * (np.random.uniform() < p) + pct_upper_cur.append(pct_upper_val) + # normalization + pct_lower_cur = np.array(pct_lower_cur, np.float32) / pct_upper_val + pct_upper_cur = np.array(pct_upper_cur, np.float32) / pct_upper_val + pct_lower.extend(pct_lower_cur) + pct_upper.extend(pct_upper_cur) + pct_raw_data.extend(pct_raw_data_cur) + + pct_lower = np.array(pct_lower, dtype=np.float32) + pct_upper = np.array(pct_upper, dtype=np.float32) + pct_mapping = (pct_lower + pct_upper) / 2. 
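As a concrete instance of the reference above, take the first feature of the simple test (pct_raw [1, 2, 3], pct_lower [0.1, 0.2, 0.9], pct_upper [0.1, 0.8, 1.0]) and the raw value 1.5 from row [1.5, 5]; this is a worked example, not part of the test:

    pct_raw, pct_lower, pct_upper = [1.0, 2.0, 3.0], [0.1, 0.2, 0.9], [0.1, 0.8, 1.0]
    val = 1.5                                            # between pct_raw[0] and pct_raw[1], so k = 0
    w = (val - pct_raw[0]) / (pct_raw[1] - pct_raw[0])   # 0.5
    result = (1 - w) * pct_upper[0] + w * pct_lower[1]   # 0.5 * 0.1 + 0.5 * 0.2 = 0.15

The slope form used in bisect_percentile_op_ref gives the same value: 0.1 + (0.2 - 0.1) / (2 - 1) * 0.5 = 0.15.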
+ raw_data = np.array(raw_data, dtype=np.float32) + pct_raw_data = np.array(pct_raw_data, dtype=np.float32) + + self.compare_reference( + raw_data, pct_raw_data, pct_mapping, pct_lower, pct_upper, lengths) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/caffe2/transforms/common_subexpression_elimination.h b/caffe2/transforms/common_subexpression_elimination.h index e66ccf153abb85..2a2f6b882c74c1 100644 --- a/caffe2/transforms/common_subexpression_elimination.h +++ b/caffe2/transforms/common_subexpression_elimination.h @@ -25,7 +25,7 @@ namespace caffe2 { * * TODO(benz): Fix the error to not match nodes that write to external output. */ -class CommonSubexpressionEliminationTransform : public Transform { +class CAFFE2_API CommonSubexpressionEliminationTransform : public Transform { public: CommonSubexpressionEliminationTransform() { SetPatternMatchType(SORTED_WRT_EXECUTION_ORDER); diff --git a/caffe2/transforms/conv_to_nnpack_transform.h b/caffe2/transforms/conv_to_nnpack_transform.h index 6438b147b5f3a9..83f91c364c5b02 100644 --- a/caffe2/transforms/conv_to_nnpack_transform.h +++ b/caffe2/transforms/conv_to_nnpack_transform.h @@ -7,7 +7,7 @@ namespace caffe2 { -class ConvToNNPackTransform : public SingleOpTransform { +class CAFFE2_API ConvToNNPackTransform : public SingleOpTransform { protected: // Specify what the op needs to be to match the pattern. bool MatchOperator(const OperatorDef& op) override { diff --git a/caffe2/transforms/pattern_net_transform.h b/caffe2/transforms/pattern_net_transform.h index 1f54ccc1eb2d08..c22b42d9deb143 100644 --- a/caffe2/transforms/pattern_net_transform.h +++ b/caffe2/transforms/pattern_net_transform.h @@ -15,7 +15,7 @@ namespace caffe2 { * and this Transform will find subgraphs which fit the pattern net, * and replace it with the replace net. */ -class PatternNetTransform : public Transform { +class CAFFE2_API PatternNetTransform : public Transform { public: PatternNetTransform(const NetDef& pattern_net, const NetDef& replace_net) : p_(transform::Graph(pattern_net)), r_(transform::Graph(replace_net)) { diff --git a/caffe2/transforms/single_op_transform.h b/caffe2/transforms/single_op_transform.h index dbc53e33831251..ae21f9aaa8d35e 100644 --- a/caffe2/transforms/single_op_transform.h +++ b/caffe2/transforms/single_op_transform.h @@ -15,7 +15,7 @@ namespace caffe2 { * Transforms which derive from SingleOpTransform need to override: * ReplaceOperator and MatchOperator. 
*/ -class SingleOpTransform : public Transform { +class CAFFE2_API SingleOpTransform : public Transform { protected: bool PatternRule( const transform::Graph& g, diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index 4e8d2268258416..bb42109b770f6e 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -4,6 +4,24 @@ # - Creates an ATen target for its generated C++ files and adds it # as a dependency +################################################################################ +# Helper functions +################################################################################ + +function(filter_list output input) + unset(result) + foreach(filename ${${input}}) + foreach(pattern ${ARGN}) + if("${filename}" MATCHES "${pattern}") + list(APPEND result "${filename}") + endif() + endforeach() + endforeach() + set(${output} ${result} PARENT_SCOPE) +endfunction() + +################################################################################ + if (DEFINED ENV{PYTORCH_PYTHON}) message(STATUS "Using python found in $ENV{PYTORCH_PYTHON}") set(PYCMD "$ENV{PYTORCH_PYTHON}") @@ -20,7 +38,7 @@ configure_file( install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2 DESTINATION include FILES_MATCHING PATTERN "*.h") -if (NOT BUILD_ATEN) +if (BUILD_ATEN_MOBILE) install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core DESTINATION include/ATen FILES_MATCHING PATTERN "*.h") @@ -29,7 +47,7 @@ install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h DESTINATION include/caffe2/core) # ---[ ATen specific -if (BUILD_ATEN) +if (NOT BUILD_ATEN_MOBILE) # SET_SOURCE_FILES_PROPERTIES must be in the same CMakeLists.txt file as the target that includes the file # so we need to set these commands here rather than in src/TH IF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 887e357fd0c0f8..6f8609fc8007bf 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -61,7 +61,7 @@ if(BUILD_CAFFE2) endif() # ---[ BLAS -if(BUILD_ATEN) +if(NOT BUILD_ATEN_MOBILE) set(BLAS "MKL" CACHE STRING "Selected BLAS library") else() set(BLAS "Eigen" CACHE STRING "Selected BLAS library") @@ -512,7 +512,7 @@ if(USE_CUDA) endif() # ---[ HIP -if(BUILD_CAFFE2 OR BUILD_ATEN) +if(BUILD_CAFFE2 OR NOT BUILD_ATEN_MOBILE) include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake) if(PYTORCH_FOUND_HIP) message(INFO "Compiling with HIP for AMD.") @@ -539,19 +539,27 @@ if(BUILD_CAFFE2 OR BUILD_ATEN) set(Caffe2_HIP_DEPENDENCY_LIBS ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES}) # Additional libraries required by PyTorch AMD that aren't used by Caffe2 (not in Caffe2's docker image) - if(BUILD_ATEN) + if(NOT BUILD_ATEN_MOBILE) set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} ${hipsparse_LIBRARIES}) endif() # TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES} list(APPEND Caffe2_HIP_DEPENDENCY_LIBS roc::rocblas) + + # TODO: Currently pytorch hipify script uses a feature called + # "disabled_modules" that effectively ifdef out a file, but + # without doing extra processing in the callers, which results in + # some unresolved symbols in the shared lib + # (libcaffe2_hip.so). Remove this when all disabled_modules are + # eliminated. 
+ set(CMAKE_EXE_LINKER_FLAGS "-Wl,--unresolved-symbols=ignore-in-shared-libs ${CMAKE_EXE_LINKER_FLAGS}") else() caffe2_update_option(USE_ROCM OFF) endif() endif() # ---[ ROCm -if(USE_ROCM AND NOT BUILD_CAFFE2) +if(USE_ROCM) include_directories(SYSTEM ${HIP_PATH}/include) include_directories(SYSTEM ${ROCBLAS_PATH}/include) include_directories(SYSTEM ${HIPSPARSE_PATH}/include) @@ -745,7 +753,7 @@ if (USE_NNAPI AND NOT ANDROID) caffe2_update_option(USE_NNAPI OFF) endif() -if (BUILD_ATEN) +if (NOT BUILD_ATEN_MOBILE) if (BUILD_CAFFE2) list(APPEND Caffe2_DEPENDENCY_LIBS aten_op_header_gen) if (USE_CUDA) @@ -809,7 +817,7 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) endif() # --[ ATen checks -if (BUILD_ATEN) +if (NOT BUILD_ATEN_MOBILE) set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST}) set(TORCH_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS}) @@ -846,28 +854,26 @@ if (BUILD_ATEN) #Check if certain std functions are supported. Sometimes #_GLIBCXX_USE_C99 macro is not defined and some functions are missing. - if (NOT ANDROID) - CHECK_CXX_SOURCE_COMPILES(" - #include - #include - - int main() { - int a = std::isinf(3.0); - int b = std::isnan(0.0); - std::string s = std::to_string(1); - - return 0; - }" SUPPORT_GLIBCXX_USE_C99) - - if (NOT SUPPORT_GLIBCXX_USE_C99) - message(FATAL_ERROR - "The C++ compiler does not support required functions. " - "This is very likely due to a known bug in GCC 5 " - "(and maybe other versions) on Ubuntu 17.10 and newer. " - "For more information, see: " - "https://github.com/pytorch/pytorch/issues/5229" - ) - endif() + CHECK_CXX_SOURCE_COMPILES(" + #include + #include + + int main() { + int a = std::isinf(3.0); + int b = std::isnan(0.0); + std::string s = std::to_string(1); + + return 0; + }" SUPPORT_GLIBCXX_USE_C99) + + if (NOT SUPPORT_GLIBCXX_USE_C99) + message(FATAL_ERROR + "The C++ compiler does not support required functions. " + "This is very likely due to a known bug in GCC 5 " + "(and maybe other versions) on Ubuntu 17.10 and newer. 
" + "For more information, see: " + "https://github.com/pytorch/pytorch/issues/5229" + ) endif() # Top-level build config diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index e1debe8be669f2..091d1f3c28a06c 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -19,7 +19,7 @@ function (caffe2_print_configuration_summary) message(STATUS "") message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}") - message(STATUS " BUILD_ATEN : ${BUILD_ATEN}") + message(STATUS " BUILD_ATEN_MOBILE : ${BUILD_ATEN_MOBILE}") message(STATUS " BUILD_BINARY : ${BUILD_BINARY}") message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}") if (${CAFFE2_LINK_LOCAL_PROTOBUF}) @@ -45,7 +45,6 @@ function (caffe2_print_configuration_summary) message(STATUS " BUILD_TEST : ${BUILD_TEST}") message(STATUS " USE_ASAN : ${USE_ASAN}") - message(STATUS " USE_ATEN : ${USE_ATEN}") message(STATUS " USE_CUDA : ${USE_CUDA}") if(${USE_CUDA}) message(STATUS " CUDA static link : ${CAFFE2_STATIC_LINK_CUDA}") @@ -127,11 +126,9 @@ function (caffe2_print_configuration_summary) message(STATUS " USE_REDIS : ${USE_REDIS}") message(STATUS " USE_ROCKSDB : ${USE_ROCKSDB}") message(STATUS " USE_ZMQ : ${USE_ZMQ}") - if(${BUILD_ATEN}) - message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}") - if(${USE_DISTRIBUTED}) - message(STATUS " USE_DISTRIBUTED_MW : ${USE_DISTRIBUTED_MW}") - endif() + message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}") + if(${USE_DISTRIBUTED}) + message(STATUS " USE_DISTRIBUTED_MW : ${USE_DISTRIBUTED_MW}") endif() message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 62185d1e9dc821..d31a732ed21c8d 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -318,7 +318,6 @@ if [[ -n $integrated ]]; then #add_package $cuda_feature_name conda_channel+=('-c pytorch') - caffe2_cmake_args+=("-DUSE_ATEN=ON") fi fi diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 1f039f648dfcbf..bf2cf4ebb719c3 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -559,6 +559,16 @@ def forward(self, input): input = Variable(torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9)) self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) + def test_log(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, input): + return input.log() + input = Variable(torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9)) + self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) + def test_trigonometry(self): def test_func(name): class MyModel(torch.nn.Module): diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index 1f33081beed272..3db7a42ffd2361 100755 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -1,3 +1,4 @@ +import os import unittest import sys @@ -15,7 +16,10 @@ from torch.utils.cpp_extension import CUDA_HOME TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None -TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_available() +TEST_CUDNN = False +if TEST_CUDA: + CUDNN_HEADER_EXISTS = os.path.isfile(os.path.join(CUDA_HOME, 'include/cudnn.h')) + TEST_CUDNN = TEST_CUDA and CUDNN_HEADER_EXISTS and torch.backends.cudnn.is_available() class TestCppExtension(common.TestCase): diff --git a/test/test_jit.py b/test/test_jit.py index f7945cd9cb25c3..a9766739feaf46 100644 --- a/test/test_jit.py +++ b/test/test_jit.py 
@@ -2336,7 +2336,7 @@ def reassign_from_empty_literal(): if True: x = [1, 2, 3] return - with self.assertRaisesRegex(RuntimeError, "Empty list literals not allowed"): + with self.assertRaisesRegex(RuntimeError, "previously has type Tensor\[\]"): self.checkScript(reassign_from_empty_literal, (), optimize=False) def reassign_from_empty_builtin(): @@ -5754,6 +5754,22 @@ def foo(self, x : torch.Tensor, y : Tuple[torch.Tensor, Tensor]) -> Tuple[Tensor fn = self._get_py3_code(code, 'instance') self.assertExpected(fn.__getattr__('foo').pretty_print_schema()) + def test_method_casts_script(self): + cast_types = [ + 'byte', 'char', 'double', 'float', 'int', 'long', 'short' + ] + + for cast_type in cast_types: + cu = torch.jit.CompilationUnit(''' + def cast_to(x): + return x.{cast_type}() + '''.format(cast_type=cast_type)) + + x = torch.rand(3, 4, 5) * 128 + cu_result = cu.cast_to(x) + reference = getattr(x, cast_type)() + self.assertEqual(cu_result, reference) + class TestEndToEndHybridFrontendModels(JitTestCase): diff --git a/test/test_sparse.py b/test/test_sparse.py index 1e47ec1b202c29..6bed41a64f7494 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -911,6 +911,7 @@ def test_storage_not_null(self): @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") + @skipIfRocm def test_same_gpu(self): i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(1) @@ -959,6 +960,7 @@ def test_new(self): self.assertEqual(x.new(indices, values, x.size()), x) @cpu_only # not really, but we only really want to run this once + @skipIfRocm def test_factory(self): default_size = torch.Size([1, 3]) size = torch.Size([3, 3]) @@ -987,6 +989,7 @@ def test_factory(self): self.assertEqual(device, sparse_tensor._values().device) self.assertEqual(True, sparse_tensor.requires_grad) + @skipIfRocm def test_factory_size_check(self): indices = self.IndexTensor([[1, 2], [0, 2]]) values = self.ValueTensor([.5, .5]) @@ -1016,6 +1019,7 @@ def test_factory_type_inference(self): self.assertEqual(torch.int64, t.dtype) @cuda_only + @skipIfRocm def test_factory_device_type_inference(self): # both indices/values are CUDA shape = (1, 3) @@ -1127,6 +1131,7 @@ def setUp(self): class TestSparseOneOff(TestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') + @skipIfRocm def test_cuda_from_cpu(self): self.assertExpectedRaises( RuntimeError, @@ -1135,6 +1140,7 @@ def test_cuda_from_cpu(self): [3, 4, 4])) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') + @skipIfRocm def test_cuda_sparse_cpu_dense_add(self): x = torch.zeros(3, 4, 4) sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), diff --git a/test/test_utils.py b/test/test_utils.py index b28b4f83171aaf..af93e3652e63be 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -414,6 +414,7 @@ def test_cpu(self): @unittest.skipIf(IS_WINDOWS, "ffi doesn't currently work on Windows") @skipIfRocm def test_gpu(self): + from torch.utils.cpp_extension import CUDA_HOME create_extension( name='gpulib', headers=[test_dir + '/ffi/src/cuda/cudalib.h'], @@ -422,6 +423,7 @@ def test_gpu(self): ], with_cuda=True, verbose=False, + include_dirs=[os.path.join(CUDA_HOME, 'include')], ).build() import gpulib tensor = torch.ones(2, 2).float() diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml index c9e9daae89f5ef..e0384c9fec9c44 100644 --- a/tools/amd_build/disabled_features.yaml +++ b/tools/amd_build/disabled_features.yaml @@ -145,7 +145,6 @@ "aten/src/ATen/native/cuda/CuFFTUtils.h", 
"aten/src/ATen/native/cuda/CuFFTPlanCache.h", "aten/src/ATen/native/cuda/SpectralOps.cu", - "aten/src/ATen/native/cuda/Distributions.cu", ], "disabled_functions": [ { @@ -171,7 +170,8 @@ "functions": [ "_s_poisson_cuda", "poisson_cuda_kernel", - "gamma_cuda_kernel" + "gamma_cuda_kernel", + "gamma_grad_cuda_kernel", ] }, { diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index ec18705fe86151..2f8b3ae1c5ebce 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -183,7 +183,6 @@ goto:eof -DNVTOOLEXT_HOME="%NVTOOLEXT_HOME%" ^ -DNO_API=ON ^ -DBUILD_SHARED_LIBS="%BUILD_SHARED_LIBS%" ^ - -DBUILD_ATEN=ON ^ -DBUILD_PYTHON=OFF ^ -DBUILD_BINARY=OFF ^ -DONNX_NAMESPACE=%ONNX_NAMESPACE% ^ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index c1e0e1975167f2..f53de42c90a60b 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -265,7 +265,6 @@ function build_caffe2() { -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DBUILD_CAFFE2=$FULL_CAFFE2 \ -DBUILD_TORCH=$BUILD_TORCH \ - -DBUILD_ATEN=ON \ -DBUILD_PYTHON=$FULL_CAFFE2 \ -DBUILD_BINARY=OFF \ -DBUILD_SHARED_LIBS=$BUILD_SHARED_LIBS \ diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 1fe8d69fbbf91b..583d0155023e19 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -254,9 +254,22 @@ struct Environment { throw ErrorReport(loc) << "Cannot re-assign '" << name << "' because it has type " << value->kind() << " and " << name << " is not a first-class value. Only reassignments to first-class values are allowed"; } - if(!as_simple_value->type()->isSubtypeOf(unshapedType(simple_parent->type()))) { - throw ErrorReport(loc) << "variable '" << name << "' previously has type " << simple_parent->type()->str() - << " but is now being assigned to a value of type " << as_simple_value->type()->str(); + if (!as_simple_value->type()->isSubtypeOf( + unshapedType(simple_parent->type()))) { + std::stringstream errMsg; + errMsg << "variable '" << name << "' previously has type " + << simple_parent->type()->str() + << " but is now being assigned to a value of type " + << as_simple_value->type()->str(); + // Special-cased error msg if we're trying to assign to a tensor list. + if (simple_parent->type()->kind() == TypeKind::ListType && + as_simple_value->type()->kind() == TypeKind::ListType) { + errMsg << "\n. (Note: empty lists are constructed as Tensor[]; " + << "if you want an empty list of a different type, " + << "use `_construct_empty_foo_list`, " + << "where `foo` is `int` or `float`)"; + } + throw ErrorReport(loc) << errMsg.str(); } } if (as_simple_value) @@ -1374,12 +1387,10 @@ struct to_ir { case TK_LIST_LITERAL: { auto ll = ListLiteral(tree); auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); - if (values.size() == 0) { - throw ErrorReport(tree) << "Empty list literals not allowed. " - << "Use _construct_empty_foo_list() instead. " - << "`foo` can be `int`, `float` or `tensor`"; - } - const auto elem_type = values.at(0)->type(); + + // If this is an empty list literal `[]`, construct an empty Tensor[] + const auto elem_type = + values.empty() ? 
DynamicType::get() : values.at(0)->type(); for (auto v : values) { if (v->type() != elem_type) { throw ErrorReport(tree) @@ -1481,9 +1492,27 @@ struct to_ir { } }; +static const std::unordered_map<std::string, std::string> &builtin_cast_methods() { + static std::unordered_map<std::string, std::string> builtin_cast_methods = { + {"byte", "_cast_Byte"}, + {"char", "_cast_Char"}, + {"double", "_cast_Double"}, + {"float", "_cast_Float"}, + {"int", "_cast_Int"}, + {"long", "_cast_Long"}, + {"short", "_cast_Short"}, + {"half", "_cast_Half"} + }; + return builtin_cast_methods; +} + // support syntax sugar for x.foo(y, z) by allowing x.foo to return a // callable value that will resolve to foo(x, y, z) when called. std::shared_ptr SimpleValue::attr(SourceRange loc, Method & m, const std::string& field) { + // Allow method-style casts on Tensor types. e.g. x.int() + if (value->type()->isSubtypeOf(DynamicType::get()) && builtin_cast_methods().count(field)) { + return std::make_shared(builtin_cast_methods().at(field), NamedValue(loc, "self", value)); + } return std::make_shared(field, NamedValue(loc, "self", value)); } @@ -1583,7 +1612,6 @@ TypePtr parseTypeFromExpr(Expr expr) { && select.selector().name() == "Tensor") { return ident_to_type_lut().at("Tensor"); } - std::cout << select << std::endl; } throw ErrorReport(expr.range()) << "Expression of type " << kindToString(expr.kind()) << " cannot be used in a type expression"; diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index 8c1ee681122edd..db767610db8a9e 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -359,7 +359,7 @@ def reduce_multigpu(tensor_list, dst, op=reduce_op.SUM, group=group.WORLD): def reduce(tensor, dst, op=reduce_op.SUM, group=group.WORLD): r"""Reduces the tensor data across all machines. - Only the process with rank :attr`dst` is going to receive the final result. + Only the process with rank :attr:`dst` is going to receive the final result. Arguments: tensor (Tensor): Input and output of the collective.
The function diff --git a/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp b/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp index dd4bc47f52f981..78ce8b7951c1ee 100644 --- a/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp @@ -96,7 +96,7 @@ class NCCLTest : public NCCLTestBase { // Copy inputs to outputs for (auto i = 0; i < numDevices_; i++) { cudaStreamSynchronize(streams_[i].getStream()); - outputs[i] = inputs_[i].toBackend(at::kCPU); + outputs[i] = inputs_[i].cpu(); } return outputs; @@ -115,7 +115,7 @@ class NCCLTest : public NCCLTestBase { for (auto i = 0; i < numDevices_; ++i) { cudaStreamSynchronize(streams_[i].getStream()); for (auto j = 0; j < worldSize_; ++j) { - outputs[i][j] = outputs_[i][j].toBackend(at::kCPU); + outputs[i][j] = outputs_[i][j].cpu(); } } return outputs; diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index f5e7503f71f971..688eb0cdd9bbf0 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -749,6 +749,10 @@ def abs(g, self): return g.op("Abs", self) +def log(g, self): + return g.op("Log", self) + + def pow(g, self, exponent): exponent = _maybe_get_scalar(exponent) return g.op("Pow", self, _if_scalar_type_as(g, exponent, self)) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 0f05191f3c5104..2a1815e7d9708b 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -69,6 +69,10 @@ def _find_cuda_home(): BUILT_FROM_SOURCE_VERSION_PATTERN = re.compile(r'\d+\.\d+\.\d+\w+\+\w+') +def is_binary_build(): + return not BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__) + + def check_compiler_abi_compatibility(compiler): ''' Verifies that the given compiler is ABI-compatible with PyTorch. @@ -81,7 +85,7 @@ def check_compiler_abi_compatibility(compiler): False if the compiler is (likely) ABI-incompatible with PyTorch, else True. ''' - if BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__): + if not is_binary_build(): return True try: check_cmd = '{}' if sys.platform == 'win32' else '{} --version' @@ -138,6 +142,7 @@ def build_extensions(self): self._check_abi() for extension in self.extensions: self._define_torch_extension_name(extension) + self._add_gnu_abi_flag_if_binary(extension) # Register .cu and .cuh as valid source extensions. self.compiler.src_extensions += ['.cu', '.cuh'] @@ -270,6 +275,21 @@ def _define_torch_extension_name(self, extension): else: extension.extra_compile_args.append(define) + def _add_gnu_abi_flag_if_binary(self, extension): + # If the version string looks like a binary build, + # we know that PyTorch was compiled with gcc 4.9.2. + # if the extension is compiled with gcc >= 5.1, + # then we have to define _GLIBCXX_USE_CXX11_ABI=0 + # so that the std::string in the API is resolved to + # non-C++11 symbols + define = '-D_GLIBCXX_USE_CXX11_ABI=0' + if is_binary_build(): + if isinstance(extension.extra_compile_args, dict): + for args in extension.extra_compile_args.values(): + args.append(define) + else: + extension.extra_compile_args.append(define) + def CppExtension(name, sources, *args, **kwargs): ''' @@ -792,6 +812,9 @@ def _write_ninja_file(path, common_cflags = ['-DTORCH_EXTENSION_NAME={}'.format(name)] common_cflags += ['-I{}'.format(include) for include in includes] + if is_binary_build(): + common_cflags += ['-D_GLIBCXX_USE_CXX11_ABI=0'] + cflags = common_cflags + ['-fPIC', '-std=c++11'] + extra_cflags if sys.platform == 'win32': from distutils.spawn import _nt_quote_args
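The builtin_cast_methods table and the SimpleValue::attr hook added in torch/csrc/jit/script/compiler.cpp earlier in this diff are what test_method_casts_script exercises: method-style casts such as x.int() on a Tensor resolve to the corresponding _cast_* builtin. A minimal usage sketch mirroring that test:

    import torch

    cu = torch.jit.CompilationUnit('''
    def cast_to_int(x):
        return x.int()
    ''')

    x = torch.rand(3, 4, 5) * 128
    assert torch.equal(cu.cast_to_int(x), x.int())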
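test_log and the new log symbolic in torch/onnx/symbolic.py map aten log calls onto the ONNX Log op. A minimal export sketch, assuming an ONNX-enabled build (the module and buffer names are illustrative):

    import io
    import torch

    class LogModel(torch.nn.Module):
        def forward(self, x):
            return x.log()

    x = torch.empty(4, 10, 10).uniform_(4, 9)   # positive inputs keep log finite
    f = io.BytesIO()
    # With the symbolic registered, the exported graph should contain a Log node.
    torch.onnx.export(LogModel(), x, f)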
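Finally, the torch.utils.cpp_extension changes append -D_GLIBCXX_USE_CXX11_ABI=0 whenever torch.version.__version__ looks like a binary (non-source) build, so extensions compiled with newer gcc still link against wheels built with gcc 4.9.2. A minimal setup.py sketch that picks this up through BuildExtension (project and file names are illustrative):

    from setuptools import setup
    from torch.utils.cpp_extension import BuildExtension, CppExtension

    setup(
        name='my_extension',  # illustrative
        ext_modules=[CppExtension('my_extension', ['my_extension.cpp'])],
        # BuildExtension.build_extensions() runs _add_gnu_abi_flag_if_binary on each
        # extension, so the ABI define above is added automatically for binary builds.
        cmdclass={'build_ext': BuildExtension},
    )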