diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index bc5e6c973e5e36..928c5fa0e47387 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -157,6 +157,7 @@ if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then export HCC_AMDGPU_TARGET=gfx900 ########## HIPIFY Caffe2 operators + ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_pytorch_amd.py" ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py" fi @@ -190,7 +191,6 @@ else fi - ############################################################################### # Configure and make ############################################################################### diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index bfbd40b259fbc3..b0a08a922d9623 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -30,7 +30,6 @@ cmake --version pip install -r requirements.txt || true if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then - export MAX_JOBS=4 # This is necessary in order to cross compile (or else we'll have missing GPU device). export HCC_AMDGPU_TARGET=gfx900 @@ -48,6 +47,7 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then sudo apt-get install libc++abi1 python tools/amd_build/build_pytorch_amd.py + python tools/amd_build/build_caffe2_amd.py USE_ROCM=1 python setup.py install --user exit 0 fi diff --git a/CMakeLists.txt b/CMakeLists.txt index 23a5080a88d0a9..edbd4381c70bab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,7 +55,8 @@ endif() include(CMakeDependentOption) option(BUILD_TORCH "Build Torch" OFF) option(BUILD_CAFFE2 "Build Caffe2" ON) -option(BUILD_ATEN "Build ATen" OFF) +option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) +option(BUILD_ATEN_MOBILE "Build ATen for Android and iOS" OFF) option(BUILD_BINARY "Build C++ binaries" ON) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) @@ -75,7 +76,6 @@ cmake_dependent_option( "BUILD_TEST" OFF) option(USE_ACL "Use ARM Compute Library" OFF) option(USE_ASAN "Use Address Sanitizer" OFF) -option(USE_ATEN "Use ATen" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_ROCM "Use ROCm" OFF) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) @@ -145,8 +145,8 @@ option(USE_DISTRIBUTED_MW "Use THD (distributed) master worker" OFF) # Used when building Caffe2 through setup.py option(BUILDING_WITH_TORCH_LIBS "Tell cmake if Caffe2 is being built alongside torch libs" OFF) -if (USE_ATEN) - set(BUILD_ATEN ${USE_ATEN}) +if (ANDROID OR IOS) + set(BUILD_ATEN_MOBILE ON) endif() # ---[ CMake scripts + modules diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 2f2ffdce186d39..ee025265a982e7 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -1,8 +1,4 @@ -if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - if (NOT BUILD_ATEN) - return() - endif() -else() +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) cmake_minimum_required(VERSION 3.0 FATAL_ERROR) project(ATen CXX C) include(CMakeDependentOption) @@ -14,9 +10,10 @@ else() USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) - - # Flag for shared dependencies - set(BUILD_ATEN ON) +else() + if (BUILD_ATEN_MOBILE) + return() + endif() endif() # Find modules diff --git a/aten/src/ATen/ATenGeneral.h b/aten/src/ATen/ATenGeneral.h index fd6eeb3937e0ee..6fd55e20ff7b36 100644 --- a/aten/src/ATen/ATenGeneral.h +++ b/aten/src/ATen/ATenGeneral.h @@ -3,4 +3,4 @@ #include "ATen/core/Macros.h" // TODO: Merge the *_API macros. 
-#define AT_API AT_CORE_API \ No newline at end of file +#define AT_API AT_CORE_API diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 9005fa189f9e66..f6d296dfe79e45 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -13,23 +13,6 @@ IF(NOT MSVC) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-absolute-value") ENDIF(NOT MSVC) -################################################################################ -# Helper functions -################################################################################ - -function(filter_list output input) - unset(result) - foreach(filename ${${input}}) - foreach(pattern ${ARGN}) - if("${filename}" MATCHES "${pattern}") - list(APPEND result "${filename}") - endif() - endforeach() - endforeach() - set(${output} ${result} PARENT_SCOPE) -endfunction() - - # Can be compiled standalone IF(NOT AT_INSTALL_BIN_DIR OR NOT AT_INSTALL_LIB_DIR OR NOT AT_INSTALL_INCLUDE_DIR OR NOT AT_INSTALL_SHARE_DIR) SET(AT_INSTALL_BIN_DIR "bin" CACHE PATH "AT install binary subdirectory") diff --git a/aten/src/ATen/cuda/ATenCUDAGeneral.h b/aten/src/ATen/cuda/ATenCUDAGeneral.h index 3beda18f31e764..366adf0f2396fe 100644 --- a/aten/src/ATen/cuda/ATenCUDAGeneral.h +++ b/aten/src/ATen/cuda/ATenCUDAGeneral.h @@ -1,7 +1,7 @@ #pragma once #ifdef _WIN32 -# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) +# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) || defined(CAFFE2_BUILD_MAIN_LIB) # define AT_CUDA_API __declspec(dllexport) # else # define AT_CUDA_API __declspec(dllimport) diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 257c8caf3e6fb3..755272915a826b 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -127,7 +127,7 @@ auto ConvParams::use_miopen(const at::Tensor& input) const -> bool { auto ConvParams::use_mkldnn(const at::Tensor& input) const -> bool { #if AT_MKLDNN_ENABLED() - return input.type().backend() == kCPU && + return input.type().backend() == at::Backend::CPU && input.type().scalarType() == kFloat && // only on CPU Float Tensors !is_dilated() && // doesn't support dilation !transposed && // or transposed tensors diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index c2ad676c56329d..c41992832ebcb2 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -29,13 +29,16 @@ using namespace at::native::detail; // counting_iterator => index to fill struct cnt_to_dst_idx_functor : public thrust::unary_function { - const int64_t last_dim_size; - const int64_t last_dim_start_slice; - const int64_t last_dim_to_fill_size; + int64_t last_dim_size; + int64_t last_dim_start_slice; + int64_t last_dim_to_fill_size; cnt_to_dst_idx_functor(int64_t last_dim_size, int64_t last_dim_start_slice) : last_dim_size(last_dim_size), last_dim_start_slice(last_dim_start_slice), last_dim_to_fill_size(last_dim_size - last_dim_start_slice) {} + + __host__ __device__ + cnt_to_dst_idx_functor & operator=(const cnt_to_dst_idx_functor&) = default; __host__ __device__ __forceinline__ int64_t operator()(const int64_t& i) const diff --git a/aten/src/ATen/native/miopen/BatchNorm.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp similarity index 100% rename from aten/src/ATen/native/miopen/BatchNorm.cpp rename to aten/src/ATen/native/miopen/BatchNorm_miopen.cpp diff --git a/aten/src/ATen/native/miopen/Conv.cpp 
b/aten/src/ATen/native/miopen/Conv_miopen.cpp similarity index 100% rename from aten/src/ATen/native/miopen/Conv.cpp rename to aten/src/ATen/native/miopen/Conv_miopen.cpp diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 44c536e7e5d701..bb9f7b92bd691a 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -514,7 +514,7 @@ void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); } -THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize) +void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize) { if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) { @@ -531,7 +531,7 @@ THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, co } -THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize) +void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize) { if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) { diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in index 3ba4ed9719b5ee..af0227abdd0b1c 100644 --- a/aten/src/THC/THCGeneral.h.in +++ b/aten/src/THC/THCGeneral.h.in @@ -23,7 +23,7 @@ #endif #ifdef _WIN32 -# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) +# if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) || defined(CAFFE2_BUILD_MAIN_LIB) # define THC_API THC_EXTERNC __declspec(dllexport) # define THC_CLASS __declspec(dllexport) # else diff --git a/aten/src/THC/THCSleep.cu b/aten/src/THC/THCSleep.cu index d30576212e9b7b..a6ebbdb1f42742 100644 --- a/aten/src/THC/THCSleep.cu +++ b/aten/src/THC/THCSleep.cu @@ -12,7 +12,7 @@ __global__ void spin_kernel(int64_t cycles) } } -THC_API void THC_sleep(THCState* state, int64_t cycles) +void THC_sleep(THCState* state, int64_t cycles) { dim3 grid(1); dim3 block(1); diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 6ab3362d6ab20a..7c1a84cc19c687 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -7,7 +7,7 @@ include(../cmake/Codegen.cmake) add_subdirectory(utils) # ---[ ATen build -if(BUILD_ATEN) +if (NOT BUILD_ATEN_MOBILE) set(__caffe2_CMAKE_POSITION_INDEPENDENT_CODE ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(AT_LINK_STYLE INTERFACE) @@ -49,7 +49,7 @@ if(BUILD_ATEN) IF(USE_ROCM) # Set the HIP Variables - set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) + set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${ATen_CUDA_SRCS}) set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) ENDIF(USE_ROCM) else() @@ -340,6 +340,12 @@ if(USE_CUDA) target_compile_options(caffe2_gpu PUBLIC "-DAT_CORE_STATIC_WINDOWS=1") endif() + # NB: This must be target_compile_definitions, not target_compile_options, + # as the latter is not respected by nvcc + if (MSVC) + target_compile_definitions(caffe2_gpu PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") + endif() + # Set standard properties on the target aten_set_target_props(caffe2_gpu) @@ -351,21 +357,19 @@ endif() # ---[ Caffe2 HIP sources. 
if(USE_ROCM) # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs. - if(BUILD_ATEN) - # Get Compile Definitions from the directory (FindHIP.cmake bug) - get_directory_property(MY_DEFINITIONS COMPILE_DEFINITIONS) - if(MY_DEFINITIONS) - foreach(_item ${MY_DEFINITIONS}) - LIST(APPEND HIP_HCC_FLAGS "-D${_item}") - endforeach() - endif() - - # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs. - hip_include_directories(${Caffe2_HIP_INCLUDES}) + # Get Compile Definitions from the directory (FindHIP.CMake bug) + get_directory_property(MY_DEFINITIONS COMPILE_DEFINITIONS) + if(MY_DEFINITIONS) + foreach(_item ${MY_DEFINITIONS}) + LIST(APPEND HIP_HCC_FLAGS "-D${_item}") + endforeach() endif() - IF(BUILD_CAFFE2) - set_source_files_properties(${Caffe2_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - ENDIF() + + # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs. + hip_include_directories(${Caffe2_HIP_INCLUDES}) + + filter_list(__caffe2_hip_srcs_cpp Caffe2_HIP_SRCS "\\.(cc|cpp|cu)$") + set_source_files_properties(${__caffe2_hip_srcs_cpp} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) # FindHIP.CMake checks if the SHARED flag is set and adds extra logic accordingly. hip_add_library(caffe2_hip ${Caffe2_HIP_SRCS}) @@ -444,7 +448,7 @@ if(BUILD_CAFFE2) set(__aten_test_dir "test/aten") endif() # Todo - Set up ATen tests for ROCm in an upcoming PR -if(BUILD_ATEN AND NOT USE_ROCM) +if(NOT USE_ROCM) foreach(test_src ${ATen_CPU_TEST_SRCS}) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") diff --git a/caffe2/README.md b/caffe2/README.md index a1166b8e4f9451..afd8fab339c310 100644 --- a/caffe2/README.md +++ b/caffe2/README.md @@ -6,7 +6,7 @@ Caffe2 is a lightweight, modular, and scalable deep learning framework. Building ## Questions and Feedback -Please use Github issues (https://github.com/caffe2/caffe2/issues) to ask questions, report bugs, and request new features. +Please use Github issues (https://github.com/pytorch/pytorch/issues) to ask questions, report bugs, and request new features. ### Further Resources on [Caffe2.ai](http://caffe2.ai) diff --git a/caffe2/contrib/aten/CMakeLists.txt b/caffe2/contrib/aten/CMakeLists.txt index 5bc2341e3d2884..92eb671e019cb7 100644 --- a/caffe2/contrib/aten/CMakeLists.txt +++ b/caffe2/contrib/aten/CMakeLists.txt @@ -1,4 +1,4 @@ -if(BUILD_ATEN) +if(NOT BUILD_ATEN_MOBILE) # Add source generated by Codegen.cmake and pass to parent list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc) list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc) diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 8f5c79a74c6a9e..048d634df80dfa 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -94,19 +94,6 @@ using std::vector; #define CAFFE2_NORETURN __attribute__((noreturn)) #endif -/** - * Macro for marking functions as having public visibility. - * Ported from folly/CPortability.h - */ -#ifndef __GNUC_PREREQ -#if defined __GNUC__ && defined __GNUC_MINOR__ -#define __GNUC_PREREQ(maj, min) \ - ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) -#else -#define __GNUC_PREREQ(maj, min) 0 -#endif -#endif - // Defines CAFFE2_EXPORT and CAFFE2_IMPORT. On Windows, this corresponds to // different declarations (dllexport and dllimport). On Linux/Mac, it just // resolves to the same "default visibility" setting. 
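The comment above, the CAFFE2_BUILD_MAIN_LIB checks added to ATenCUDAGeneral.h and THCGeneral.h.in earlier in this diff, and the many CAFFE2_API annotations added further down all follow the same shared-library export idiom. A minimal sketch of that pattern is shown here for reference; the names MYLIB_API, MYLIB_BUILD_MAIN_LIB, Graph, and addOp are placeholders for illustration, not the exact definitions from caffe2/core/common.h.

```cpp
// Sketch of the export/import macro pattern (placeholder names, not the
// real caffe2 definitions).
#if defined(_WIN32)
  // While building the library itself, the build defines a marker such as
  // MYLIB_BUILD_MAIN_LIB (analogous to CAFFE2_BUILD_MAIN_LIB or the
  // per-target *_EXPORTS macros), so symbols are exported; code that merely
  // consumes the installed headers sees dllimport instead.
  #if defined(MYLIB_BUILD_MAIN_LIB)
    #define MYLIB_API __declspec(dllexport)
  #else
    #define MYLIB_API __declspec(dllimport)
  #endif
#elif defined(__GNUC__)
  // On GCC/Clang the same macro simply requests default visibility for both
  // the producer and the consumer of the library.
  #define MYLIB_API __attribute__((__visibility__("default")))
#else
  #define MYLIB_API
#endif

// Classes and free functions that must be reachable across the DLL/shared
// library boundary carry the annotation, which is what the CAFFE2_API edits
// in caffe2/core, caffe2/opt, and caffe2/onnx below accomplish.
class MYLIB_API Graph {
 public:
  int nodeCount() const;
};

MYLIB_API int addOp(int x);
```

Which branch a consumer gets depends only on whether the export marker is defined at compile time; that is also why the MSVC-only hunk above uses target_compile_definitions rather than target_compile_options for -DCAFFE2_BUILD_MAIN_LIB, since nvcc only forwards the former.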
@@ -120,11 +107,7 @@ using std::vector; #endif #else #if defined(__GNUC__) -#if __GNUC_PREREQ(4, 9) -#define CAFFE2_EXPORT [[gnu::visibility("default")]] -#else #define CAFFE2_EXPORT __attribute__((__visibility__("default"))) -#endif #else #define CAFFE2_EXPORT #endif diff --git a/caffe2/core/graph.h b/caffe2/core/graph.h index 1bd0d4fa9616da..ac037d5f0867ae 100644 --- a/caffe2/core/graph.h +++ b/caffe2/core/graph.h @@ -16,7 +16,7 @@ namespace transform { /** * Graph representation of an operator. */ -struct Node { +struct CAFFE2_API Node { public: // Empty constructor for resize Node() {} @@ -45,7 +45,7 @@ struct Node { /** * Graph representation of a Netdef. */ -struct Graph { +struct CAFFE2_API Graph { public: /** * Given a subgraph, gets all of the parents of the subgraph, as well as @@ -155,7 +155,7 @@ struct Graph { // Adds an operator def to a netdef. // Returns the ptr, if you want to add anything extra (such as device_option) -OperatorDef* AddOp( +CAFFE2_API OperatorDef* AddOp( NetDef* netdef_ptr, string op_type, std::vector inputs, @@ -168,12 +168,12 @@ OperatorDef* AddOp( * For example, if we wanted to match an operator to Conv or FC, we can give: * "Conv|FC" as the type() of that op. */ -bool MatchStrings(string p, string s); +CAFFE2_API bool MatchStrings(string p, string s); /** * This ensures that each named arg that exists in the pattern exists in g_op, * is equal in value. */ -bool MatchArguments(const OperatorDef& p_op, const OperatorDef& g_op); +CAFFE2_API bool MatchArguments(const OperatorDef& p_op, const OperatorDef& g_op); } // namespace caffe2 diff --git a/caffe2/core/net_simple.h b/caffe2/core/net_simple.h index 99060ddb0bcaf9..550326d700f0dd 100644 --- a/caffe2/core/net_simple.h +++ b/caffe2/core/net_simple.h @@ -16,7 +16,7 @@ namespace caffe2 { // This is the very basic structure you need to run a network - all it // does is simply to run everything in sequence. If you want more fancy control // such as a DAG-like execution, check out other better net implementations. 
-class SimpleNet : public NetBase { +class CAFFE2_API SimpleNet : public NetBase { public: SimpleNet(const std::shared_ptr& net_def, Workspace* ws); bool SupportsAsync() override { diff --git a/caffe2/core/nomnigraph/Representations/NeuralNet.cc b/caffe2/core/nomnigraph/Representations/NeuralNet.cc index a60ddb127d545b..c31de031f853c6 100644 --- a/caffe2/core/nomnigraph/Representations/NeuralNet.cc +++ b/caffe2/core/nomnigraph/Representations/NeuralNet.cc @@ -199,12 +199,13 @@ NNNodeMatchCriteria matchAnyNode() { [](NNGraph::NodeRef /* unused */) { return true; }, "matchAnyNode"); } -NNMatchGraph::NodeRef operatorTree( +NNMatchGraph::NodeRef operatorSubgraph( NNMatchGraph& g, const NNNodeMatchCriteria& root, const std::vector& childrenCriteria, int count) { - return tree(g, matchAnyNode(), {tree(g, root, childrenCriteria)}, count); + return subgraph( + g, matchAnyNode(), {subgraph(g, root, childrenCriteria)}, count); } } // namespace nn diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index 425c6ffe3fec46..4f072545d4e6ed 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -412,6 +412,10 @@ class Graph { return result; } + size_t getEdgesCount() const { + return (size_t)edges_.size(); + } + private: std::list> nodes_; std::list> edges_; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index ac4e1fa61328e1..98e1bcba123642 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -487,9 +487,9 @@ using NNSubgraphMatcher = nom::matcher::SubgraphMatcher; // This helper method makes it easy to create matching criteria in NNGraph. -// For example, operatorTree(opMatch, ...) will refer to a tree like this: +// For example, operatorSubgraph(opMatch, ...) will refer to a tree like this: // ... 
-> opMatch -> opMatch_Output -NNMatchGraph::NodeRef operatorTree( +NNMatchGraph::NodeRef operatorSubgraph( NNMatchGraph& g, const NNNodeMatchCriteria& root, const std::vector& childrenCriteria = {}, diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h index 020454bf72d33c..9e0f44c896ac1e 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -5,6 +5,7 @@ #include #include +#include #include namespace nom { @@ -56,7 +57,7 @@ template using MatchNodeRef = typename MatchGraph::NodeRef; template -MatchNodeRef tree( +MatchNodeRef subgraph( MatchGraph& graph, const NodeMatchCriteria& root, const std::vector>& children, @@ -96,19 +97,20 @@ std::string debugString(MatchNodeRef rootCriteriaRef) { } template -class SubtreeMatchResult { +class SubgraphMatchResult { public: - static SubtreeMatchResult notMatched( + static SubgraphMatchResult notMatched( const std::string& debugMessage) { - return SubtreeMatchResult(false, debugMessage); + return SubgraphMatchResult(false, debugMessage); } - static SubtreeMatchResult notMatched() { - return SubtreeMatchResult(false, "Debug message is not enabled"); + static SubgraphMatchResult notMatched() { + return SubgraphMatchResult( + false, "Debug message is not enabled"); } - static SubtreeMatchResult matched() { - return SubtreeMatchResult(true, ""); + static SubgraphMatchResult matched() { + return SubgraphMatchResult(true, "Matched"); } bool isMatch() const { @@ -120,7 +122,7 @@ class SubtreeMatchResult { } private: - SubtreeMatchResult(bool isMatch, const std::string& debugMessage) + SubgraphMatchResult(bool isMatch, const std::string& debugMessage) : isMatch_(isMatch), debugMessage_(debugMessage) {} const bool isMatch_; @@ -141,32 +143,103 @@ struct SubgraphMatcher { return NodeMatcherClass::isMatch(node, criteria); } - // Check if there can be a sub-tree that matches the given criteria that + // Check if there can be a subgraph that matches the given criteria that // is rooted at the given rootNode. // The flag invertGraphTraversal specify if we should follow out edges or // in edges. The default is true which is useful for a functional // intepretation of a dataflow graph. - static SubtreeMatchResult isSubtreeMatch( + static SubgraphMatchResult isSubgraphMatch( + typename GraphType::NodeRef root, + const MatchNodeRef& rootCriteriaRef, + bool invertGraphTraversal = true, + bool debug = false) { + std::unordered_map< + MatchNodeRef, + typename GraphType::NodeRef> + matchedNodes; + return isSubgraphMatchInternal( + matchedNodes, root, rootCriteriaRef, invertGraphTraversal, debug); + } + + // Utility to transform a graph by looking for subgraphs that match + // a given pattern and then allow callers to mutate the graph based on + // subgraphs that are found. + // The current implementation doesn't handle any graph transformation + // itself. Callers should be responsible for all intended mutation, including + // deleting nodes in the subgraphs found by this algorithm. + // Note: if the replaceFunction lambda returns false, the entire procedure + // is aborted. This maybe useful in certain cases when we want to terminate + // the subgraph search early. 
+ // invertGraphTraversal flag: see documentation in isSubgraphMatch + static void replaceSubgraph( + GraphType& graph, + const MatchNodeRef& criteria, + const std::function& + replaceFunction, + bool invertGraphTraversal = true) { + for (auto nodeRef : graph.getMutableNodes()) { + // Make sure the node is still in the graph. + if (!graph.hasNode(nodeRef)) { + continue; + } + if (isSubgraphMatch(nodeRef, criteria, invertGraphTraversal).isMatch()) { + if (!replaceFunction(graph, nodeRef)) { + // If replaceFunction returns false, it means that we should abort + // the entire procedure. + break; + } + } + } + } + + private: + static SubgraphMatchResult isSubgraphMatchInternal( + std::unordered_map< + MatchNodeRef, + typename GraphType::NodeRef>& matchedNodes, typename GraphType::NodeRef root, const MatchNodeRef& rootCriteriaRef, bool invertGraphTraversal = true, bool debug = false) { auto rootCriteriaNode = rootCriteriaRef->data(); + + if (rootCriteriaNode.getCount() == 1) { + auto matchedNodeEntry = matchedNodes.find(rootCriteriaRef); + if (matchedNodeEntry != matchedNodes.end()) { + // If rootCriteriaRef has been matched before (without multiplicity), + // we should look up the corresponding matched node in the graph + // and verify if it is the same. + auto matchedNode = matchedNodeEntry->second; + if (matchedNode == root) { + return SubgraphMatchResult::matched(); + } else if (debug) { + std::ostringstream debugMessage; + debugMessage << "Subgraph root at " << root << " is not the same as " + << matchedNode << " which previously matched criteria " + << debugString(rootCriteriaRef); + return SubgraphMatchResult::notMatched(debugMessage.str()); + } else { + return SubgraphMatchResult::notMatched(); + } + } + } + if (!isNodeMatch(root, rootCriteriaNode.getCriteria())) { if (debug) { std::ostringstream debugMessage; - debugMessage << "Subtree root at " << root + debugMessage << "Subgraph root at " << root << " does not match criteria " << debugString(rootCriteriaRef); - return SubtreeMatchResult::notMatched(debugMessage.str()); + return SubgraphMatchResult::notMatched(debugMessage.str()); } else { - return SubtreeMatchResult::notMatched(); + return SubgraphMatchResult::notMatched(); } } if (rootCriteriaNode.isNonTerminal()) { // This is sufficient to be a match if this criteria specifies a non // terminal node. - return SubtreeMatchResult::matched(); + matchedNodes[rootCriteriaRef] = root; + return SubgraphMatchResult::matched(); } auto& edges = invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); @@ -176,7 +249,7 @@ struct SubgraphMatcher { int numChildrenCriteria = outEdges.size(); // The current algorithm implies that the ordering of the children is - // important. The children nodes will be matched with the children subtree + // important. The children nodes will be matched with the children subgraph // criteria in the given order. int currentEdgeIdx = 0; @@ -200,7 +273,8 @@ struct SubgraphMatcher { auto edge = edges[currentEdgeIdx]; auto child = invertGraphTraversal ? edge->tail() : edge->head(); - if (!isSubtreeMatch(child, childrenCriteriaRef, invertGraphTraversal) + if (!isSubgraphMatchInternal( + matchedNodes, child, childrenCriteriaRef, invertGraphTraversal) .isMatch()) { if (!isStarCount) { // If the current criteria isn't a * pattern, this indicates a @@ -213,10 +287,10 @@ struct SubgraphMatcher { childrenCriteriaRef) << ". 
We expected " << expectedCount << " matches but only found " << countMatch << "."; - return SubtreeMatchResult::notMatched( + return SubgraphMatchResult::notMatched( debugMessage.str()); } else { - return SubtreeMatchResult::notMatched(); + return SubgraphMatchResult::notMatched(); } } else { // Otherwise, we should move on to the next children criteria. @@ -236,9 +310,9 @@ struct SubgraphMatcher { << " matches for child criteria " << debugString(childrenCriteriaRef) << " but only found " << countMatch; - return SubtreeMatchResult::notMatched(debugMessage.str()); + return SubgraphMatchResult::notMatched(debugMessage.str()); } else { - return SubtreeMatchResult::notMatched(); + return SubgraphMatchResult::notMatched(); } } } @@ -247,48 +321,17 @@ struct SubgraphMatcher { // Fails because there are unmatched edges. if (debug) { std::ostringstream debugMessage; - debugMessage << "Unmatched children for subtree root at " << root + debugMessage << "Unmatched children for subgraph root at " << root << ". There are " << numEdges << " children, but only found " << currentEdgeIdx << " matches for the children criteria."; - return SubtreeMatchResult::notMatched(debugMessage.str()); + return SubgraphMatchResult::notMatched(debugMessage.str()); } else { - return SubtreeMatchResult::notMatched(); - } - } - return SubtreeMatchResult::matched(); - } - - // Utility to transform a graph by looking for subtrees that match - // a given pattern and then allow callers to mutate the graph based on - // subtrees that are found. - // The current implementation doesn't handle any graph transformation - // itself. Callers should be responsible for all intended mutation, including - // deleting nodes in the subtrees found by this algorithm. - // Note: if the replaceFunction lambda returns false, the entire procedure - // is aborted. This maybe useful in certain cases when we want to terminate - // the subtree search early. - // invertGraphTraversal flag: see documentation in isSubtreeMatch - static void replaceSubtree( - GraphType& graph, - const MatchNodeRef& criteria, - const std::function< - bool(GraphType& g, typename GraphType::NodeRef subtreeRoot)>& - replaceFunction, - bool invertGraphTraversal = true) { - for (auto nodeRef : graph.getMutableNodes()) { - // Make sure the node is still in the graph. - if (!graph.hasNode(nodeRef)) { - continue; - } - if (isSubtreeMatch(nodeRef, criteria, invertGraphTraversal).isMatch()) { - if (!replaceFunction(graph, nodeRef)) { - // If replaceFunction returns false, it means that we should abort - // the entire procedure. 
- break; - } + return SubgraphMatchResult::notMatched(); } } + matchedNodes[rootCriteriaRef] = root; + return SubgraphMatchResult::matched(); } }; diff --git a/caffe2/core/nomnigraph/tests/neural_net_test.cc b/caffe2/core/nomnigraph/tests/neural_net_test.cc index bdafce3b364cc8..34dd9840309eac 100644 --- a/caffe2/core/nomnigraph/tests/neural_net_test.cc +++ b/caffe2/core/nomnigraph/tests/neural_net_test.cc @@ -44,23 +44,23 @@ TEST(NeuralNetGraph, ReplaceGraph) { auto mg = NNMatchGraph(); // clang-format off - auto pattern = tree(mg, + auto pattern = subgraph(mg, matchNodeType(), { - operatorTree(mg, + operatorSubgraph(mg, matchNodeType(), { - tree(mg, matchNodeType(), {}, 2, true) + subgraph(mg, matchNodeType(), {}, 2, true) }), }); // clang-format on - EXPECT_FALSE(NNSubgraphMatcher::isSubtreeMatch(sum, pattern).isMatch()); + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, pattern).isMatch()); EXPECT_FALSE( - NNSubgraphMatcher::isSubtreeMatch(reluOutput, pattern).isMatch()); - EXPECT_FALSE(NNSubgraphMatcher::isSubtreeMatch(input1, pattern).isMatch()); + NNSubgraphMatcher::isSubgraphMatch(reluOutput, pattern).isMatch()); + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, pattern).isMatch()); - EXPECT_TRUE(NNSubgraphMatcher::isSubtreeMatch(relu, pattern).isMatch()); + EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, pattern).isMatch()); - NNSubgraphMatcher::replaceSubtree( + NNSubgraphMatcher::replaceSubgraph( graph, pattern, [](NNGraph& g, NNGraph::NodeRef relu) { auto sumOutput = getInputs(relu)[0]; auto sum = getProducer(sumOutput); diff --git a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc index adcc56e6027141..ced26d69beb30b 100644 --- a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc +++ b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc @@ -41,11 +41,11 @@ TestMatchGraph::NodeRef Tree( const Criteria& root, const std::vector& children = {}, int count = 1) { - return tree(graph, root, children, count, false); + return subgraph(graph, root, children, count, false); } TestMatchGraph::NodeRef NonTerminal(const Criteria& root, int count = 1) { - return tree(graph, root, {}, count, true); + return subgraph(graph, root, {}, count, true); } Criteria any() { @@ -202,11 +202,11 @@ TestGraph::NodeRef getInNode(TestGraph::NodeRef node, int index) { return node->getInEdges()[index]->tail(); } -bool isSubtreeMatch( +bool isSubgraphMatch( TestGraph::NodeRef nodeRef, const TestMatchGraph::NodeRef& criteria, bool invertGraphTraversal = true) { - return TestMatcher::isSubtreeMatch(nodeRef, criteria, invertGraphTraversal) + return TestMatcher::isSubgraphMatch(nodeRef, criteria, invertGraphTraversal) .isMatch(); } } // namespace matcher @@ -254,32 +254,32 @@ TEST(SubgraphMatcher, IsSubtreeMatch) { reset(); auto subtree = Tree(any(), {Tree(any()), Tree(any())}); - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n4, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n4, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n5, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n2, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n5, subtree, false)); reset(); subtree = Tree(Criteria("5"), {Tree(any()), Tree(any())}); - EXPECT_FALSE(isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n5, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n2, subtree, false)); + 
EXPECT_TRUE(isSubgraphMatch(n5, subtree, false)); reset(); subtree = Tree(any(), {Tree(any()), Tree(Criteria("4"))}); - EXPECT_TRUE(isSubtreeMatch(n2, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n5, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n2, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n5, subtree, false)); reset(); // Accepts non terminal node subtree = Tree(any(), {NonTerminal(any()), NonTerminal(any())}); - EXPECT_TRUE(isSubtreeMatch(n1, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(isSubtreeMatch(n5, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n3, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n4, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n6, subtree, false)); - EXPECT_FALSE(isSubtreeMatch(n7, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n1, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n2, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n5, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n3, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n4, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n6, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n7, subtree, false)); } // Test subtree matching in which * (repeated) matching of children is allowed. @@ -304,11 +304,11 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { reset(); auto subtree = Tree(any(), {Tree(Criteria("2"))}); - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), {Tree(Criteria("2"), {}, TestMatchNode::kStarCount)}); - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); reset(); // clang-format off @@ -318,7 +318,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("4"), {}, 2), Tree(Criteria("5"), {}, 3) }); - EXPECT_TRUE(isSubtreeMatch(n1, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -328,7 +328,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("5"), {}, 4) }); // Failes because exepected 4 matches of n5 but found 3. - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -337,7 +337,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("4"), {}, 2), Tree(Criteria("5"), {}, TestMatchNode::kStarCount) }); - EXPECT_TRUE(isSubtreeMatch(n1, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -346,7 +346,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("4"), {}, 2), Tree(Criteria("5"), {}, TestMatchNode::kStarCount) }); - EXPECT_TRUE(isSubtreeMatch(n1, subtree, false)); + EXPECT_TRUE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -354,7 +354,7 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { Tree(Criteria("3"), {}, TestMatchNode::kStarCount), }); // Fails because there are unmatched edges. - EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); reset(); subtree = Tree(any(), { @@ -365,21 +365,192 @@ TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { }); // Fails because the count is wrong; we have 2 edges to node N4 while // the pattern expects only 1. 
- EXPECT_FALSE(isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(isSubgraphMatch(n1, subtree, false)); // clang-format on } +TEST(SubgraphMatcher, DagMatching) { + reset(); + + // clang-format off + auto n4match = Tree(Criteria("4"), { + Tree(Criteria("5")) + }); + auto subgraph = Tree(Criteria("1"), { + Tree(Criteria("2"), { + n4match + }), + Tree(Criteria("3"), { + n4match + }), + }); + // clang-format on + + { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + auto n3 = graph.createNode("3"); + auto n4 = graph.createNode("4"); + auto n5 = graph.createNode("5"); + + graph.createEdge(n1, n2); + graph.createEdge(n1, n3); + graph.createEdge(n2, n4); + graph.createEdge(n3, n4); + graph.createEdge(n4, n5); + + /* N1 + / \ + N2 N3 + \ / + N4 + | + N5 + */ + + EXPECT_TRUE(isSubgraphMatch(n1, subgraph, false)); + } + + { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + auto n3 = graph.createNode("3"); + auto n4A = graph.createNode("4"); + auto n4B = graph.createNode("4"); + auto n5 = graph.createNode("5"); + + graph.createEdge(n1, n2); + graph.createEdge(n1, n3); + graph.createEdge(n2, n4A); + graph.createEdge(n3, n4B); + graph.createEdge(n4A, n5); + graph.createEdge(n4B, n5); + + /* N1 + / \ + N2 N3 + / \ + N4A N4B + \ / + N5 + */ + + // This should fail because n4A and n4B are not the same node. + EXPECT_FALSE(isSubgraphMatch(n1, subgraph, false)); + } +} + +TEST(SubgraphMatcher, DagMatchingMultiEdges) { + reset(); + + // clang-format off + auto n2match = Tree(Criteria("2")); + auto subgraph = Tree(Criteria("1"), { + n2match, + n2match + }); + // clang-format on + + { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + + graph.createEdge(n1, n2); + graph.createEdge(n1, n2); + + EXPECT_TRUE(isSubgraphMatch(n1, subgraph, false)); + } + + { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2A = graph.createNode("2"); + auto n2B = graph.createNode("2"); + + graph.createEdge(n1, n2A); + graph.createEdge(n1, n2B); + + EXPECT_FALSE(isSubgraphMatch(n1, subgraph, false)); + } +} + +TEST(SubgraphMatcher, DagMatchingRandomLargeGraph) { + reset(); + // clang-format off + auto n4match = Tree(any(), { + NonTerminal(any(), 1) + }); + auto subtree = Tree(any(), { + Tree(any(), { + n4match + }), + Tree(any(), { + n4match + }), + }); + // clang-format on + /* N1 + / \ + N2 N3 + \ / + N4 + | + N5 + */ + + // Look for the diamond pattern in a random large graph. + TestGraph graph; + std::vector::NodeRef> nodes; + + // Here we create a test graph and then randomly embed the above + // pattern into the graph repeatedly (numPatterns times). + // The actual number of match will be less than numPatterns because the + // embedded patterns can overlap which become unmatched subgraphs. 
+ const int numNodes = 50000; + const int numPatterns = 5000; + + for (int i = 0; i < numNodes; i++) { + auto node = graph.createNode("Node"); + nodes.emplace_back(node); + } + + TestRandom random(517); + for (int i = 0; i < numPatterns; i++) { + std::vector nodeIdx; + for (int k = 0; k < 5; k++) { + nodeIdx.emplace_back(random.nextInt() % numNodes); + } + graph.createEdge(nodes[nodeIdx[0]], nodes[nodeIdx[1]]); + graph.createEdge(nodes[nodeIdx[0]], nodes[nodeIdx[2]]); + graph.createEdge(nodes[nodeIdx[1]], nodes[nodeIdx[3]]); + graph.createEdge(nodes[nodeIdx[2]], nodes[nodeIdx[3]]); + graph.createEdge(nodes[nodeIdx[3]], nodes[nodeIdx[4]]); + } + EXPECT_EQ(graph.getEdgesCount(), 5 * numPatterns); + + int countMatch = 0; + for (auto node : graph.getMutableNodes()) { + if (isSubgraphMatch(node, subtree, false)) { + countMatch++; + } + } + EXPECT_EQ(countMatch, 1072); +} + TEST(SubgraphMatcher, IsSubtreeMatchRealistic) { reset(); auto graph = DataFlowTestGraph(); auto subtree = DataFlowTestGraphCriteria(); - EXPECT_FALSE(isSubtreeMatch(graph.opF, subtree)); - EXPECT_FALSE(isSubtreeMatch(graph.opC, subtree)); - EXPECT_FALSE(isSubtreeMatch(graph.opB, subtree)); - EXPECT_FALSE(isSubtreeMatch(graph.dataOut, subtree)); + EXPECT_FALSE(isSubgraphMatch(graph.opF, subtree)); + EXPECT_FALSE(isSubgraphMatch(graph.opC, subtree)); + EXPECT_FALSE(isSubgraphMatch(graph.opB, subtree)); + EXPECT_FALSE(isSubgraphMatch(graph.dataOut, subtree)); - EXPECT_TRUE(isSubtreeMatch(graph.opG, subtree)); + EXPECT_TRUE(isSubgraphMatch(graph.opG, subtree)); } TEST(SubgraphMatcher, ReplaceSubtreeRealistic) { @@ -387,7 +558,7 @@ TEST(SubgraphMatcher, ReplaceSubtreeRealistic) { auto graph = DataFlowTestGraph(); auto subtree = DataFlowTestGraphCriteria(); - TestMatcher::replaceSubtree( + TestMatcher::replaceSubgraph( graph.graph, subtree, [](TestGraph& g, TestGraph::NodeRef opG) { auto opFused = g.createNode("opFused"); diff --git a/caffe2/core/nomnigraph/tests/test_util.h b/caffe2/core/nomnigraph/tests/test_util.h index 2c447b556acb01..f5693c03d36a2e 100644 --- a/caffe2/core/nomnigraph/tests/test_util.h +++ b/caffe2/core/nomnigraph/tests/test_util.h @@ -34,6 +34,23 @@ struct NNEquality { } }; +// Very simple random number generator used to generate platform independent +// random test data. +class TestRandom { + public: + TestRandom(unsigned int seed) : seed_(seed){}; + + unsigned int nextInt() { + seed_ = A * seed_ + C; + return seed_; + } + + private: + static const unsigned int A = 1103515245; + static const unsigned int C = 12345; + unsigned int seed_; +}; + /** Our test graph looks like this: * +-------+ * | entry | diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index a10ce5d0b2f500..b1f31af6e33d8a 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -799,7 +799,7 @@ typedef Registry< Workspace*>* (*RegistryFunction)(); CAFFE2_API std::map* gDeviceTypeRegistry(); -struct DeviceTypeRegisterer { +struct CAFFE2_API DeviceTypeRegisterer { explicit DeviceTypeRegisterer(int32_t type, RegistryFunction func) { if (gDeviceTypeRegistry()->count(type)) { std::cerr << "Device type " << type @@ -923,7 +923,7 @@ struct StaticLinkingProtector { // specific engines that only implement a subset of the features required by // the original operator schema. // TODO(jiayq): make more feature-complete exception message. 
-class UnsupportedOperatorFeature : public std::exception { +class CAFFE2_API UnsupportedOperatorFeature : public std::exception { public: UnsupportedOperatorFeature(const string& msg) : msg_(msg) {} const char* what() const noexcept override { @@ -961,40 +961,40 @@ using PerOpEnginePrefType = CaffeMap>; // {device_type -> EnginePrefType} using GlobalEnginePrefType = CaffeMap; -void SetPerOpEnginePref(const PerOpEnginePrefType& per_op_engine_pref); -void SetGlobalEnginePref(const GlobalEnginePrefType& global_engine_pref); -void SetEnginePref( +CAFFE2_API void SetPerOpEnginePref(const PerOpEnginePrefType& per_op_engine_pref); +CAFFE2_API void SetGlobalEnginePref(const GlobalEnginePrefType& global_engine_pref); +CAFFE2_API void SetEnginePref( const PerOpEnginePrefType& per_op_engine_pref, const GlobalEnginePrefType& global_engine_pref); -void SetOpEnginePref( +CAFFE2_API void SetOpEnginePref( const std::string& op_type, const CaffeMap& op_pref); -TensorShape GetTensorShapeOfBlob(const Blob* b); +CAFFE2_API TensorShape GetTensorShapeOfBlob(const Blob* b); -TensorShapes InferBlobShapesAndTypes( +CAFFE2_API TensorShapes InferBlobShapesAndTypes( CaffeMap& blob_desc, const vector& nets); -TensorShapes InferBlobShapesAndTypesFromWorkspace( +CAFFE2_API TensorShapes InferBlobShapesAndTypesFromWorkspace( Workspace* ws, const vector& nets); -TensorShapes InferBlobShapesAndTypesFromMap( +CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap( const CaffeMap>& blob_dimensions, const vector& nets); -TensorShapes InferBlobShapesAndTypesFromMap( +CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap( const CaffeMap>& blob_dimensions, const CaffeMap& blob_types, const vector& nets); -std::map> ValidateTensorDevices( +CAFFE2_API std::map> ValidateTensorDevices( OperatorBase& op, const OperatorDef& op_def); // Get a set of registered operator names -std::set GetRegisteredOperators(); +CAFFE2_API std::set GetRegisteredOperators(); } // namespace caffe2 diff --git a/caffe2/core/operator_gradient.h b/caffe2/core/operator_gradient.h index 4072065f515dfd..f08778fd22eac2 100644 --- a/caffe2/core/operator_gradient.h +++ b/caffe2/core/operator_gradient.h @@ -14,7 +14,7 @@ namespace caffe2 { * a sparse blob, its gradient name should be written into indice_ for * the sparse indices and value_ for the values. */ -struct GradientWrapper { +struct CAFFE2_API GradientWrapper { string dense_; string indices_; string values_; @@ -33,7 +33,7 @@ struct GradientWrapper { /** * A struct that holds the gradient operators and related gradient maps. */ -struct GradientOpsMeta { +struct CAFFE2_API GradientOpsMeta { vector ops_; vector g_input_; @@ -44,7 +44,7 @@ struct GradientOpsMeta { : ops_(ops), g_input_(v) {} }; -class GradientMakerBase { +class CAFFE2_API GradientMakerBase { public: GradientMakerBase( const OperatorDef& def, @@ -256,7 +256,7 @@ class GradientMakerBase { * that the gradient computation should not flow through it at all, and throws * an error if it is called. */ -class NoGradient : public GradientMakerBase { +class CAFFE2_API NoGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; vector GetGradientDefs() override { return vector(); @@ -321,7 +321,7 @@ CAFFE_DECLARE_REGISTRY( /** * @brief Gets the GradientOpsMeta for the given operator def. 
*/ -GradientOpsMeta GetGradientForOp( +CAFFE2_API GradientOpsMeta GetGradientForOp( const OperatorDef& def, const vector& g_output); diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index f142c182dfe748..86c6827e3039a1 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -40,7 +40,7 @@ struct CAFFE2_API ExportedStatValue { using ExportedStatList = std::vector; using ExportedStatMap = std::unordered_map; -ExportedStatMap toMap(const ExportedStatList& stats); +CAFFE2_API ExportedStatMap toMap(const ExportedStatList& stats); /** * @brief Holds a map of atomic counters keyed by name. diff --git a/caffe2/core/transform.h b/caffe2/core/transform.h index 63f7e26467332a..9c10ca58c0d9b1 100644 --- a/caffe2/core/transform.h +++ b/caffe2/core/transform.h @@ -31,7 +31,7 @@ namespace caffe2 { * own transform, write your implementations for PatternRule, ValidatorRule, and * ReplaceRule. */ -class Transform { +class CAFFE2_API Transform { public: Transform() {} @@ -148,7 +148,7 @@ class Transform { }; // Creates a Transform based on a key, which should be defined in registry. -unique_ptr CreateTransform(string key); +CAFFE2_API unique_ptr CreateTransform(string key); CAFFE_DECLARE_REGISTRY(TransformRegistry, Transform); #define REGISTER_TRANSFORM(name, ...) \ @@ -156,14 +156,14 @@ CAFFE_DECLARE_REGISTRY(TransformRegistry, Transform); // Create a Transform object from registry, // and immediately apply it to a Netdef. -NetDef ApplyTransform(const string& key, const NetDef& netdef); +CAFFE2_API NetDef ApplyTransform(const string& key, const NetDef& netdef); // Create a Transform object from registry, apply it to a NetDef. // Will only return the transformed net if it is faster than the old net. // This will run the init net first, will run the two nets warmup_runs times. // Then, we will take the average time of main_runs runs, and only keep the // transformed net if it is faster by a factor of improvement_threshold. -NetDef ApplyTransformIfFaster( +CAFFE2_API NetDef ApplyTransformIfFaster( const string& key, const NetDef& netdef, const NetDef& init_netdef, diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 6e718a7d75a276..64642ca7ea41e5 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -674,25 +674,54 @@ Caffe2Ops Caffe2Backend::CreateGemm( auto trans_a = onnx_node->attributes.get("transA", 0L); auto trans_b = onnx_node->attributes.get("transB", 0L); - auto broadcast = onnx_node->attributes.get("broadcast", 0L); + // Support broadcast by default when opset_version > 6. + auto broadcast = + onnx_node->attributes.get("broadcast", + (ctx.opset_version() > 6) ? 1L : 0L); + + // If the c's shape information is available and c is a 1d tensor(except + // c is a scalar), use FC aggressively. + auto check_fc = [&]() -> bool { + const auto input_c_vi_iter = ctx.value_infos().find(node.input(2)); + + if (input_c_vi_iter == ctx.value_infos().end()) { + return false; + } - bool use_fc = false; - if ((!trans_a) && trans_b) { - if (broadcast) { - use_fc = true; - } else { - const auto input_c_vi_iter = ctx.value_infos().find(node.input(2)); - if (input_c_vi_iter != ctx.value_infos().end() && - input_c_vi_iter->second.type().tensor_type().shape().dim_size() == - 1) { - use_fc = true; + const auto input_c_shape = + input_c_vi_iter->second.type().tensor_type().shape(); + + if (input_c_shape.dim_size() != 1) { + return false; + } + + // c is a scalar. 
+ if (input_c_shape.dim(0).dim_value() == 1) { + const auto input_b_vi_iter = ctx.value_infos().find(node.input(1)); + + // If the b's shape is not available, skip FC. + if (input_b_vi_iter == ctx.value_infos().end()) { + return false; + } + const auto input_b_shape = + input_b_vi_iter->second.type().tensor_type().shape(); + int input_b_last_dim_index = (trans_b) ? 0 : 1; + // If b's last dim is not 1, skip FC. + if (input_b_shape.dim(input_b_last_dim_index).dim_value() != 1) { + return false; + } } - } - if (use_fc) { + return true; + }; + + if (!trans_a && broadcast && check_fc()) { auto* c2_op = ret.ops.Add(); - BuildOperator(c2_op, "FC", {input_a, input_b, input_c}, {output}); + if (trans_b) { + BuildOperator(c2_op, "FC", {input_a, input_b, input_c}, {output}); + } else { + BuildOperator(c2_op, "FCTransposed", {input_a, input_b, input_c}, {output}); + } } else { auto ab = dummy_->NewDummyName(); caffe2::Argument arg_trans_a;
diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index 681ab5b30d10b0..6aa5f271cc5c2a 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -11,7 +11,7 @@ #include #include -constexpr int kKnownOpsetVersion = 6; +constexpr int kKnownOpsetVersion = 7; namespace caffe2 { namespace onnx {
diff --git a/caffe2/onnx/onnx_exporter.h b/caffe2/onnx/onnx_exporter.h index 51f62df0eb2212..578edc0ee17e2d 100644 --- a/caffe2/onnx/onnx_exporter.h +++ b/caffe2/onnx/onnx_exporter.h @@ -25,11 +25,11 @@ using ConvertedResult = // Rewrite Caffe2 nets into SSA forms. Notice that we will preserve the external // output names for predict net. -std::unordered_map SsaRewrite( +CAFFE2_API std::unordered_map SsaRewrite( caffe2::NetDef* init_net, caffe2::NetDef* pred_net); -class OnnxExporter { +class CAFFE2_API OnnxExporter { using SpecialOpConverter = ConvertedResult (OnnxExporter::*)( const caffe2::OperatorDef&, const std::unordered_map&);
diff --git a/caffe2/operators/bisect_percentile_op.cc b/caffe2/operators/bisect_percentile_op.cc new file mode 100644 index 00000000000000..cec3cf7bc1fe34 --- /dev/null +++ b/caffe2/operators/bisect_percentile_op.cc @@ -0,0 +1,92 @@ +#include "caffe2/operators/bisect_percentile_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(BisectPercentile, BisectPercentileOp); +OPERATOR_SCHEMA(BisectPercentile) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( + This operator maps raw feature values to their percentile representations, + using bisection, for one or more features. + + The input is a batch of feature values, with size (batch_size, + num_feature), where num_feature = F (F >= 1). + + For each feature, we also need additional information regarding the feature + value distribution. + Several vectors hold the raw-value-to-percentile mapping information and + are passed as arguments (context): + 1. feature raw values (R) + 2. feature percentile mapping (P) + 3. feature percentile lower bound (L) + 4. feature percentile upper bound (U) + + A toy example: + Suppose the sampled data distribution is as follows: + 1, 1, 2, 2, 2, 2, 2, 2, 3, 4 + We have the mapping vectors as follows: + R = [1, 2, 3, 4] + P = [0.15, 0.55, 0.9, 1.0] + L = [0.1, 0.3, 0.9, 1.0] + U = [0.2, 0.8, 0.9, 1.0] + where P is computed as (L + U) / 2. + + For a given list of feature values, X = [x_0, x_1, ..., x_i, ...], for each + feature value (x_i) we first apply bisection to find the right index (t), + such that R[t] <= x_i < R[t+1]. + If x_i = R[t], P[t] is returned; + otherwise, the percentile is interpolated between U[t] and L[t+1] based on + where x_i falls within (R[t], R[t+1]). + In the toy example, x_i = 2.5 gives t = 1, w = (2.5 - 2) / (3 - 2) = 0.5, + and the result is (1 - w) * U[1] + w * L[2] = 0.5 * 0.8 + 0.5 * 0.9 = 0.85.
+ + As there are F features (F >= 1), we concatenate all the R_f, P_f, L_f, and + U_f for each feature f and use an additional lengths argument to keep track of + the number of points in each feature's raw-value-to-percentile mapping. + For example, suppose there are two features: + R_1 = [0.1, 0.4, 0.5]; + R_2 = [0.3, 1.2]; + We will build R = [0.1, 0.4, 0.5, 0.3, 1.2]; besides, we have + lengths = [3, 2] + to indicate the boundaries of the percentile information. + +)DOC") + .Arg( + "percentile_raw", + "1D tensor, which is the concatenation of all sorted raw feature " + "values for all features.") + .Arg( + "percentile_mapping", + "1D tensor. There is a one-to-one mapping between percentile_mapping " + "and percentile_raw such that each element in percentile_mapping " + "corresponds to the percentile value of the corresponding raw feature " + "value.") + .Arg( + "percentile_lower", + "1D tensor. There is a one-to-one mapping between percentile_lower " + "and percentile_raw such that each element in percentile_lower " + "corresponds to the percentile lower bound of the corresponding raw " + "feature value.") + .Arg( + "percentile_upper", + "1D tensor. There is a one-to-one mapping between percentile_upper " + "and percentile_raw such that each element in percentile_upper " + "corresponds to the percentile upper bound of the corresponding raw " + "feature value.") + .Arg( + "lengths", + "1D tensor of size F, where F is the number of features. Each element " + "gives the number of points in the raw-value-to-percentile mapping of " + "the corresponding feature, i.e., the segment boundaries within " + "percentile_raw, percentile_mapping, percentile_lower, and " + "percentile_upper.") + .Input( + 0, + "raw_values", + "Input 2D tensor of floats of size (N, D), where N is the batch size " + "and D is the feature dimension.") + .Output( + 0, + "percentile", + "2D tensor of output with the same dimensions as the input raw_values."); + +NO_GRADIENT(BisectPercentile); + +} // namespace caffe2
diff --git a/caffe2/operators/bisect_percentile_op.h b/caffe2/operators/bisect_percentile_op.h new file mode 100644 index 00000000000000..98d347cc73b10a --- /dev/null +++ b/caffe2/operators/bisect_percentile_op.h @@ -0,0 +1,167 @@ +#ifndef CAFFE2_OPERATORS_BISECT_PERCENTILE_OP_H_ +#define CAFFE2_OPERATORS_BISECT_PERCENTILE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class BisectPercentileOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + BisectPercentileOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + pct_raw_(OperatorBase::GetRepeatedArgument( + "percentile_raw", + vector{})), + pct_mapping_(OperatorBase::GetRepeatedArgument( + "percentile_mapping", + vector{})), + pct_lower_(OperatorBase::GetRepeatedArgument( + "percentile_lower", + vector{})), + pct_upper_(OperatorBase::GetRepeatedArgument( + "percentile_upper", + vector{})), + pct_lens_( + OperatorBase::GetRepeatedArgument("lengths", vector{})) { + CAFFE_ENFORCE_EQ( + pct_raw_.size(), + pct_mapping_.size(), + "Feature (raw) data and percentile value dimension should match."); + CAFFE_ENFORCE_EQ( + pct_raw_.size(), + pct_lower_.size(), + "Feature (raw) data and lower bound dimension should match."); + CAFFE_ENFORCE_EQ( + pct_raw_.size(), + pct_upper_.size(), + "Feature (raw) data and upper bound dimension should match."); + n_features = pct_lens_.size(); + index.resize(n_features + 1); + index[0] = 0; + for (int i = 1; i <=
n_features; ++i) { + index[i] = index[i - 1] + pct_lens_[i - 1]; + } + CAFFE_ENFORCE_EQ( + index[n_features], // The sum of lengths_data + pct_raw_.size(), + "Sum of lengths should be equal to the total number of percentile " + "mapping data samples"); + } + + bool RunOnDevice() override { + // Input + const auto& raw = Input(RAW); + CAFFE_ENFORCE_EQ(raw.ndim(), 2); + const auto batch_size = raw.dim(0); + const auto num_features = raw.dim(1); + CAFFE_ENFORCE_EQ(num_features, pct_lens_.size()); + const float* raw_data = raw.template data(); + + // Output + auto* pct = Output(PCT); + pct->ResizeLike(raw); + float* pct_output = pct->template mutable_data(); + + // Compute percentile for each raw feature value + int feature_start_index = 0; + int feature_length = 0; + int cur_index = 0; + + for (int i = 0; i < num_features; ++i) { + cur_index = i; + feature_start_index = index[i]; + feature_length = pct_lens_[i]; + for (int j = 0; j < batch_size; ++j) { + pct_output[cur_index] = compute_percentile( + pct_raw_.begin() + feature_start_index, + pct_mapping_.begin() + feature_start_index, + pct_lower_.begin() + feature_start_index, + pct_upper_.begin() + feature_start_index, + feature_length, + raw_data[cur_index]); + cur_index += num_features; + } + } + return true; + } + + protected: + INPUT_TAGS(RAW); + OUTPUT_TAGS(PCT); + + private: + int n_features; + vector pct_raw_; + vector pct_mapping_; + vector pct_lower_; + vector pct_upper_; + vector pct_lens_; + vector index; + vector> fast_pct; + + const float kEPSILON = 1e-10; + + int binary_search( + const std::vector::iterator& data, + int lo, + int hi, + float val) { + int mid; + bool low_cond, high_cond; + + while (lo < hi) { + mid = (lo + hi) >> 1; + low_cond = (data[mid] <= val); + high_cond = (val < data[mid + 1]); + if (low_cond && high_cond) { + return mid; + } else if (!low_cond) { + hi = mid - 1; + } else { + lo = mid + 1; + } + } + return lo; + } + + float compute_percentile( + const std::vector::iterator& pct_raw_it, + const std::vector::iterator& pct_mapping_it, + const std::vector::iterator& pct_lower_it, + const std::vector::iterator& pct_upper_it, + const int size, + const float val) { + // Corner cases where no interpolation is needed. 
+ if (val < pct_raw_it[0]) { + return 0.; + } + if (val > pct_raw_it[size - 1]) { + return 1.; + } + + float result; + // Interpolation by binary search + const auto k = binary_search(pct_raw_it, 0, size - 1, val); + + if (pct_raw_it[k] == val) { + // Exact match + result = pct_mapping_it[k]; + } else { + // interpolation + float w = (val - pct_raw_it[k]) / + (pct_raw_it[k + 1] - pct_raw_it[k] + kEPSILON); + result = (1 - w) * pct_upper_it[k] + w * pct_lower_it[k + 1]; + } + return result; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_BISECT_PERCENTILE_OP_H_ diff --git a/caffe2/operators/ensure_cpu_output_op.h b/caffe2/operators/ensure_cpu_output_op.h index 041a5be002421d..08207644f7f094 100644 --- a/caffe2/operators/ensure_cpu_output_op.h +++ b/caffe2/operators/ensure_cpu_output_op.h @@ -40,7 +40,7 @@ class EnsureCPUOutputOp : public Operator { input.size(), input.raw_data(), output->raw_mutable_data(input.meta())); - + context_.FinishDeviceComputation(); return true; } }; diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index d66bf9e5635f56..1d6e28c9b3abe3 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -45,7 +45,7 @@ class ConstTensorView { // anchors: predefined anchors, size(A, 4) // Return: all_anchors_vec: (H * W, A * 4) // Need to reshape to (H * W * A, 4) to match the format in python -ERMatXf ComputeAllAnchors( +CAFFE2_API ERMatXf ComputeAllAnchors( const TensorCPU& anchors, int height, int width, diff --git a/caffe2/opt/backend_cutting.h b/caffe2/opt/backend_cutting.h index 0e2bf7c7f7de60..cc3ed14c3dc833 100644 --- a/caffe2/opt/backend_cutting.h +++ b/caffe2/opt/backend_cutting.h @@ -9,7 +9,7 @@ namespace caffe2 { namespace opt { -caffe2::NetDef OptimizeForBackend( +CAFFE2_API caffe2::NetDef OptimizeForBackend( caffe2::NetDef& net, std::function supports, std::function transform_func); diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index 5a1b7d01d84cfc..ec90507664fa95 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -12,7 +12,7 @@ namespace caffe2 { -class Caffe2Annotation : public nom::repr::Annotation { +class CAFFE2_API Caffe2Annotation : public nom::repr::Annotation { public: Caffe2Annotation() : Annotation(AnnotationKind::Caffe2) {} Caffe2Annotation(std::string device) @@ -57,23 +57,23 @@ class Caffe2Annotation : public nom::repr::Annotation { int DeviceType = caffe2::DeviceType::CPU; }; -nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); +CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); -caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); +CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); // Pass in an oldNet to copy all the attributes of that network. // Be warned that transformations that modify the graph's inputs or outputs // are not reflected in changes to external_input or external_output. -caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&, const caffe2::NetDef& oldNet); +CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&, const caffe2::NetDef& oldNet); // Use these functions instead of the registry directly. 
-std::unique_ptr convertToNeuralNetOperator( +CAFFE2_API std::unique_ptr convertToNeuralNetOperator( const caffe2::OperatorDef& op); -caffe2::OperatorDef convertToOperatorDef( +CAFFE2_API caffe2::OperatorDef convertToOperatorDef( const nom::repr::NNGraph::NodeRef& instrNode); -class Converter { +class CAFFE2_API Converter { public: explicit Converter() {} virtual std::unique_ptr diff --git a/caffe2/opt/device.h b/caffe2/opt/device.h index eeb16469a87ebe..daa634de0563fa 100644 --- a/caffe2/opt/device.h +++ b/caffe2/opt/device.h @@ -1,9 +1,10 @@ +#include "caffe2/core/common.h" #include "nomnigraph/Representations/NeuralNet.h" namespace caffe2 { namespace opt { -void insertCopies( +CAFFE2_API void insertCopies( nom::repr::NNModule* nn, std::function supported, std::function copyToFn, diff --git a/caffe2/opt/fusion.h b/caffe2/opt/fusion.h index 67b2cb7bcaf795..f8cd4b469be5b1 100644 --- a/caffe2/opt/fusion.h +++ b/caffe2/opt/fusion.h @@ -25,7 +25,7 @@ namespace opt { using namespace nom; -void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws); +CAFFE2_API void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws); // Generic activation fusion helper. // @@ -33,11 +33,11 @@ void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws); // \tparam ActivationT The activation to be fused. // \param nn Neural network module to be modified in place // \param should_fuse Given a conv op, check whether we want to fuse it with -// subsequent relu or not +// subsequent relu or not // \param postprocess Functor to postprocess the conv node, // attaching additional attributes if necessary template -void fuseActivation( +CAFFE2_API void fuseActivation( repr::NNModule* nn, std::function should_fuse, std::function postprocess) { diff --git a/caffe2/opt/mobile.h b/caffe2/opt/mobile.h index 1bc9a32e2e51ea..78e98763a32ea5 100644 --- a/caffe2/opt/mobile.h +++ b/caffe2/opt/mobile.h @@ -1,13 +1,14 @@ #ifndef CAFFE2_OPT_MOBILE_H_ #define CAFFE2_OPT_MOBILE_H_ +#include "caffe2/core/common.h" #include "nomnigraph/Representations/NeuralNet.h" namespace caffe2 { namespace opt { -void addNNPACK(nom::repr::NNModule* nn, bool low_memory = false); -void fuseNNPACKConvRelu(nom::repr::NNModule* nn); +CAFFE2_API void addNNPACK(nom::repr::NNModule* nn, bool low_memory = false); +CAFFE2_API void fuseNNPACKConvRelu(nom::repr::NNModule* nn); } // namespace opt } // namespace caffe2 diff --git a/caffe2/opt/onnx_convert.h b/caffe2/opt/onnx_convert.h index 42a9c95aba471c..b21e0da9920a0b 100644 --- a/caffe2/opt/onnx_convert.h +++ b/caffe2/opt/onnx_convert.h @@ -1,4 +1,4 @@ -class OnnxAnnotation : public nom::repr::Annotation { +class CAFFE2_API OnnxAnnotation : public nom::repr::Annotation { public: OnnxAnnotation() : Annotation(AnnotationKind::Onnx) {} OnnxAnnotation(std::string device) @@ -10,11 +10,11 @@ class OnnxAnnotation : public nom::repr::Annotation { void setOperatorDef(caffe2::OperatorDef* opDef) { OpDef = opDef; } - const caffe2::OperatorDef* getOperatorDef() const { + const caffe2::OperatorDef* getOperatorDef() const { assert(OpDef && "OperatorDef was never set. Use OnnxAnnotation::setOperatorDef."); return OpDef; } - caffe2::OperatorDef* getMutableOperatorDef() { + caffe2::OperatorDef* getMutableOperatorDef() { assert(OpDef && "OperatorDef was never set. 
Use OnnxAnnotation::setOperatorDef."); return OpDef; } @@ -28,9 +28,8 @@ class OnnxAnnotation : public nom::repr::Annotation { caffe2::OperatorDef* OpDef = nullptr; }; -nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); +CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); -caffe2::NetDef convertToOnnxProto(nom::repr::NNModule&); - -std::unique_ptr convertToOperatorDef(caffe2::OperatorDef op); +CAFFE2_API caffe2::NetDef convertToOnnxProto(nom::repr::NNModule&); +CAFFE2_API std::unique_ptr convertToOperatorDef(caffe2::OperatorDef op); diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h index 197a026e3a8b9a..9ac80c799f8f4d 100644 --- a/caffe2/opt/onnxifi_transformer.h +++ b/caffe2/opt/onnxifi_transformer.h @@ -18,7 +18,7 @@ namespace onnx { class OnnxExporter; } -class OnnxifiTransformer { +class CAFFE2_API OnnxifiTransformer { public: explicit OnnxifiTransformer(bool debug); diff --git a/caffe2/opt/optimize_ideep.h b/caffe2/opt/optimize_ideep.h index 24635785336e57..edfd1fac027643 100644 --- a/caffe2/opt/optimize_ideep.h +++ b/caffe2/opt/optimize_ideep.h @@ -8,7 +8,7 @@ namespace caffe2 { namespace opt { -void OptimizeForIdeep( +CAFFE2_API void OptimizeForIdeep( nom::repr::NNModule* nn, caffe2::Workspace* ws, bool training_mode = false); diff --git a/caffe2/opt/optimizer.h b/caffe2/opt/optimizer.h index e0756d16874649..a83232e0843d82 100644 --- a/caffe2/opt/optimizer.h +++ b/caffe2/opt/optimizer.h @@ -8,8 +8,8 @@ namespace caffe2 { namespace opt { -NetDef optimize(NetDef net, Workspace* ws, int level = 1); -NetDef optimize(NetDef net, int level = 1); +CAFFE2_API NetDef optimize(NetDef net, Workspace* ws, int level = 1); +CAFFE2_API NetDef optimize(NetDef net, int level = 1); } // namespace opt } // namespace caffe2 diff --git a/caffe2/opt/passes.h b/caffe2/opt/passes.h index 585741664ca6fe..a0cda390ca83fd 100644 --- a/caffe2/opt/passes.h +++ b/caffe2/opt/passes.h @@ -21,7 +21,7 @@ namespace caffe2 { * use a different registry and inherit from WorkspaceOptimizationPass. 
*/ -class OptimizationPass { +class CAFFE2_API OptimizationPass { public: OptimizationPass(NNModule* nn) : nn_(nn) {} virtual void run() = 0; @@ -31,7 +31,7 @@ class OptimizationPass { NNModule* nn_; }; -class WorkspaceOptimizationPass : public OptimizationPass { +class CAFFE2_API WorkspaceOptimizationPass : public OptimizationPass { public: WorkspaceOptimizationPass(NNModule* nn, Workspace* ws) : OptimizationPass(nn), ws_(ws) {} virtual ~WorkspaceOptimizationPass(){} diff --git a/caffe2/opt/sink.h b/caffe2/opt/sink.h index 37ad523e5f0b65..53f082c47598f6 100644 --- a/caffe2/opt/sink.h +++ b/caffe2/opt/sink.h @@ -8,7 +8,7 @@ namespace caffe2 { namespace opt { -void sinkMaxPool(nom::repr::NNModule* nn); +CAFFE2_API void sinkMaxPool(nom::repr::NNModule* nn); } // namespace opt } // namespace caffe2 diff --git a/caffe2/python/_import_c_extension.py b/caffe2/python/_import_c_extension.py index ba2cbe1677c8b1..aca9e52af29293 100644 --- a/caffe2/python/_import_c_extension.py +++ b/caffe2/python/_import_c_extension.py @@ -19,7 +19,9 @@ except ImportError as gpu_e: logging.info('Failed to import cuda module: {}'.format(gpu_e)) try: - from caffe2.python.caffe2_pybind11_state_hip import * # noqa + RTLD_LAZY = 1 + with extension_loader.DlopenGuard(RTLD_LAZY): + from caffe2.python.caffe2_pybind11_state_hip import * # noqa if num_hip_devices(): has_hip_support = True logging.info('This caffe2 python run has AMD GPU support!') diff --git a/caffe2/python/extension_loader.py b/caffe2/python/extension_loader.py index fe85d53680eaa5..10ac74ba9fd2a2 100644 --- a/caffe2/python/extension_loader.py +++ b/caffe2/python/extension_loader.py @@ -14,10 +14,10 @@ @contextlib.contextmanager -def DlopenGuard(): +def DlopenGuard(extra_flags=ctypes.RTLD_GLOBAL): if _set_global_flags: old_flags = sys.getdlopenflags() - sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL) + sys.setdlopenflags(old_flags | extra_flags) yield if _set_global_flags: sys.setdlopenflags(old_flags) diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index e526d74f73921a..8ff58a68ce1083 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -150,7 +150,7 @@ def test_gemm(self): 'Gemm', ['A', 'B', 'C'], ["Y"], - transA=True) + transA=1) output = c2.run_node(node_def, [A, B, C]) np.testing.assert_almost_equal( output["Y"], @@ -164,12 +164,12 @@ def test_gemm(self): 'Gemm', ['A', 'B', 'C'], ["Y"], - transB=True) + transB=1) output = c2.run_node(node_def, [A, B, C]) np.testing.assert_almost_equal( output["Y"], np.dot(A, np.transpose(B)) + C) - # revert A + # revert B B = np.transpose(B) # scale @@ -186,27 +186,121 @@ def test_gemm(self): output["Y"], alpha * np.dot(A, B) + beta * C) - # broadcast + # setup broadcastable C C = np.random.randn(4).astype(np.float32) + + # broadcast for opset7 node_def = make_node( 'Gemm', ['A', 'B', 'C'], ["Y"], alpha=alpha, beta=beta) - output = c2.run_node(node_def, [A, B, C]) + output = c2.run_node(node_def, [A, B, C], opset_version=7) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, B) + beta * C) + # broadcast for opset3 and 6 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=alpha, + beta=beta, + broadcast=1) + output = c2.run_node(node_def, [A, B, C], opset_version=6) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, B) + beta * C) + + # transB + B = np.transpose(B) + + # transB and broadcast for opset7 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + 
alpha=alpha, + beta=beta, + transB=1) + output = c2.run_node(node_def, [A, B, C], opset_version=7) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, np.transpose(B)) + beta * C) + # transB and broadcast for opset3 and 6 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=alpha, + beta=beta, + broadcast=1, + transB=1) + output = c2.run_node(node_def, [A, B, C], opset_version=6) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, np.transpose(B)) + beta * C) + + # revert B + B = np.transpose(B) + # set a scalar to C + C = np.random.randn(1).astype(np.float32) + + # scalar broadcast for opset7 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=alpha, + beta=beta) + output = c2.run_node(node_def, [A, B, C], opset_version=7) + np.testing.assert_almost_equal( + output["Y"], + alpha * np.dot(A, B) + beta * C) + # scalar broadcast for opset3 and 6 + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=alpha, + beta=beta, + broadcast=1) + output = c2.run_node(node_def, [A, B, C], opset_version=6) np.testing.assert_almost_equal( output["Y"], alpha * np.dot(A, B) + beta * C) def test_gemm_conversion(self): node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=2., + beta=3.) + node_def_broadcast = make_node( 'Gemm', ['A', 'B', 'C'], ["Y"], alpha=2., beta=3., - transB=True) + broadcast=1) + node_def_transpose_b = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=2., + beta=3., + transB=1) + + node_def_transpose_b_broadcast = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=2., + beta=3., + transB=1, + broadcast=1) backend = C.Caffe2Backend() @@ -220,10 +314,48 @@ def test_gemm_conversion(self): op_names.append(op.type) self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) - # with shape info (that indicates C is 1D), gemm will be - # converted to FC + # opset7 + # If C is a 1d tensor, gemm will be converted to FC/FCTransposed + _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) + _, op_strs = backend.convert_node(node_def.SerializeToString( - ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()]) + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FCTransposed']) + + # opset6 without broadcast(C should match A*B's dim) + # The gemm will be converted to matmul + add, since the FC requires c + # to be 1d tensor. 
+ _, op_strs = backend.convert_node(node_def.SerializeToString( + ), [make_tensor_value_info("A", onnx.TensorProto.FLOAT, (3,2)).SerializeToString(), + make_tensor_value_info("B", onnx.TensorProto.FLOAT, (2,3)).SerializeToString(), + make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,3)).SerializeToString()], + 6) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) + + # opset6 with broadcast + # If C is a 1d tensor, gemm will be converted to FC/FCTransposed + _, op_strs = backend.convert_node(node_def_transpose_b_broadcast.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], + 6) op_names = [] for s in op_strs: op = caffe2_pb2.OperatorDef() @@ -231,21 +363,62 @@ def test_gemm_conversion(self): op_names.append(op.type) self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) - # or with broadcast, gemm will be converted to fc - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - transB=True, - broadcast=1) + _, op_strs = backend.convert_node(node_def_broadcast.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], + 6) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FCTransposed']) + + # opset7 + # If C is a scalar and B's last dim is 1, gemm will be converted to FC/FCTransposed + _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( + ), [make_tensor_value_info("B", onnx.TensorProto.FLOAT, (1,2)).SerializeToString(), + make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) - _, op_strs = backend.convert_node(node_def.SerializeToString()) + _, op_strs = backend.convert_node(node_def.SerializeToString( + ), [make_tensor_value_info("B", onnx.TensorProto.FLOAT, (2,1)).SerializeToString(), + make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FCTransposed']) + # If C is a scalar and B's last dim is not 1, gemm will be converted + # to matmul + add. + _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( + ), [make_tensor_value_info("B", onnx.TensorProto.FLOAT, (2,2)).SerializeToString(), + make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], + 7) op_names = [] for s in op_strs: op = caffe2_pb2.OperatorDef() op.ParseFromString(s) op_names.append(op.type) - self.assertEqual(op_names, ['FC']) + self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) + # If C is a scalar and B's shape info is not available, + # gemm will be converted to matmul + add. 
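For context on the Gemm conversion cases in this test: ONNX Gemm computes Y = alpha * A' * B' + beta * C (with A and B optionally transposed), while Caffe2's FC/FCTransposed expect a 1-D bias, so the converter only emits FC (transB=1) or FCTransposed (untransposed B) when C is known to be 1-D, or a scalar that is effectively 1-D because the matching dimension of B is 1; otherwise it falls back to MatMul plus Add. A small NumPy sketch of the algebra the FC path relies on (illustrative; exactly where the converter places its Scale ops is its own business):

    import numpy as np

    A = np.random.randn(3, 2).astype(np.float32)
    B = np.random.randn(4, 2).astype(np.float32)  # (N, K) weight layout, i.e. Gemm with transB=1
    C = np.random.randn(4).astype(np.float32)     # 1-D bias: the case that maps to FC
    alpha, beta = 2.0, 3.0

    gemm_y = alpha * np.dot(A, B.T) + beta * C    # ONNX Gemm semantics for this attribute set
    fc_y = np.dot(alpha * A, B.T) + beta * C      # FC after folding alpha/beta into its inputs
    np.testing.assert_allclose(gemm_y, fc_y, rtol=1e-5)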
+ _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], + 7) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) def test_tensor_filling_ops(self): for dtype in [ diff --git a/caffe2/python/operator_test/bisect_percentile_op_test.py b/caffe2/python/operator_test/bisect_percentile_op_test.py new file mode 100644 index 00000000000000..77faeaeeb608c4 --- /dev/null +++ b/caffe2/python/operator_test/bisect_percentile_op_test.py @@ -0,0 +1,182 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import hypothesis.strategies as st + +from caffe2.python import core, workspace +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu + +import bisect +import numpy as np + + +class TestBisectPercentileOp(hu.HypothesisTestCase): + def compare_reference( + self, + raw_data, + pct_raw_data, + pct_mapping, + pct_upper, + pct_lower, + lengths, + ): + def bisect_percentile_op_ref( + raw_data, + pct_raw_data, + pct_mapping, + pct_lower, + pct_upper, + lengths + ): + results = np.zeros_like(raw_data) + indices = [0] + for j in range(len(lengths)): + indices.append(indices[j] + lengths[j]) + for i in range(len(raw_data)): + for j in range(len(raw_data[0])): + start = indices[j] + end = indices[j + 1] + val = raw_data[i][j] + pct_raw_data_i = pct_raw_data[start:end] + pct_lower_i = pct_lower[start:end] + pct_upper_i = pct_upper[start:end] + pct_mapping_i = pct_mapping[start:end] + + # Corner cases + if val < pct_raw_data_i[0]: + results[i][j] = 0 + continue + if val > pct_raw_data_i[-1]: + results[i][j] = 1. 
+ continue + + # interpolation + k = bisect.bisect_left(pct_raw_data_i, val) + if pct_raw_data_i[k] == val: + results[i][j] = pct_mapping_i[k] + else: + k = k - 1 + slope = ((pct_lower_i[k + 1] - pct_upper_i[k]) + / (pct_raw_data_i[k + 1] - pct_raw_data_i[k])) + results[i][j] = pct_upper_i[k] + \ + slope * (val - pct_raw_data_i[k]) + + return results + + workspace.ResetWorkspace() + workspace.FeedBlob("raw_data", raw_data) + + op = core.CreateOperator( + "BisectPercentile", + ["raw_data"], + ["pct_output"], + percentile_raw=pct_raw_data, + percentile_mapping=pct_mapping, + percentile_lower=pct_lower, + percentile_upper=pct_upper, + lengths=lengths + ) + workspace.RunOperatorOnce(op) + + expected_output = bisect_percentile_op_ref( + raw_data, + pct_raw_data, + pct_mapping, + pct_lower, + pct_upper, + lengths + ) + output = workspace.blobs['pct_output'] + np.testing.assert_array_almost_equal(output, expected_output) + + def test_bisect_percentil_op_simple(self): + raw_data = np.array([ + [1, 1], + [2, 2], + [3, 3], + [3, 1], + [9, 10], + [1.5, 5], + [1.32, 2.4], + [2.9, 5.7], + [-1, -1], + [3, 7] + ], dtype=np.float32) + pct_raw_data = np.array([1, 2, 3, 2, 7], dtype=np.float32) + pct_lower = np.array([0.1, 0.2, 0.9, 0.1, 0.5], dtype=np.float32) + pct_upper = np.array([0.1, 0.8, 1.0, 0.4, 1.0], dtype=np.float32) + pct_mapping = np.array([0.1, 0.5, 0.95, 0.25, 0.75], dtype=np.float32) + lengths = np.array([3, 2], dtype=np.int32) + self.compare_reference( + raw_data, pct_raw_data, pct_mapping, pct_lower, pct_upper, lengths) + + @given( + N=st.integers(min_value=20, max_value=100), + lengths=st.lists( + elements=st.integers(min_value=2, max_value=10), + min_size=2, + max_size=5, + ), + max_value=st.integers(min_value=100, max_value=1000), + discrete=st.booleans(), + p=st.floats(min_value=0, max_value=0.9), + **hu.gcs_cpu_only + ) + def test_bisect_percentil_op_large( + self, N, lengths, max_value, discrete, p, gc, dc + ): + lengths = np.array(lengths, dtype=np.int32) + D = len(lengths) + + if discrete: + raw_data = np.random.randint(0, max_value, size=(N, D)) + else: + raw_data = np.random.randn(N, D) + + # To generate valid pct_lower and pct_upper + pct_lower = [] + pct_upper = [] + pct_raw_data = [] + for i in range(D): + pct_lower_val = 0. + pct_upper_val = 0. + pct_lower_cur = [] + pct_upper_cur = [] + # There is no duplicated values in pct_raw_data + if discrete: + pct_raw_data_cur = np.random.choice( + np.arange(max_value), size=lengths[i], replace=False) + else: + pct_raw_data_cur = np.random.randn(lengths[i]) + while len(set(pct_raw_data_cur)) < lengths[i]: + pct_raw_data_cur = np.random.randn(lengths[i]) + pct_raw_data_cur = np.sort(pct_raw_data_cur) + for _ in range(lengths[i]): + pct_lower_val = pct_upper_val + 0.01 + pct_lower_cur.append(pct_lower_val) + pct_upper_val = pct_lower_val + \ + 0.01 * np.random.randint(1, 20) * (np.random.uniform() < p) + pct_upper_cur.append(pct_upper_val) + # normalization + pct_lower_cur = np.array(pct_lower_cur, np.float32) / pct_upper_val + pct_upper_cur = np.array(pct_upper_cur, np.float32) / pct_upper_val + pct_lower.extend(pct_lower_cur) + pct_upper.extend(pct_upper_cur) + pct_raw_data.extend(pct_raw_data_cur) + + pct_lower = np.array(pct_lower, dtype=np.float32) + pct_upper = np.array(pct_upper, dtype=np.float32) + pct_mapping = (pct_lower + pct_upper) / 2. 
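As a concrete instance of the reference above, take the first feature of the simple test (pct_raw [1, 2, 3], pct_lower [0.1, 0.2, 0.9], pct_upper [0.1, 0.8, 1.0]) and the raw value 1.5 from row [1.5, 5]; this is a worked example, not part of the test:

    pct_raw, pct_lower, pct_upper = [1.0, 2.0, 3.0], [0.1, 0.2, 0.9], [0.1, 0.8, 1.0]
    val = 1.5                                            # between pct_raw[0] and pct_raw[1], so k = 0
    w = (val - pct_raw[0]) / (pct_raw[1] - pct_raw[0])   # 0.5
    result = (1 - w) * pct_upper[0] + w * pct_lower[1]   # 0.5 * 0.1 + 0.5 * 0.2 = 0.15

The slope form used in bisect_percentile_op_ref gives the same value: 0.1 + (0.2 - 0.1) / (2 - 1) * 0.5 = 0.15.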
+ raw_data = np.array(raw_data, dtype=np.float32) + pct_raw_data = np.array(pct_raw_data, dtype=np.float32) + + self.compare_reference( + raw_data, pct_raw_data, pct_mapping, pct_lower, pct_upper, lengths) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/caffe2/transforms/common_subexpression_elimination.h b/caffe2/transforms/common_subexpression_elimination.h index e66ccf153abb85..2a2f6b882c74c1 100644 --- a/caffe2/transforms/common_subexpression_elimination.h +++ b/caffe2/transforms/common_subexpression_elimination.h @@ -25,7 +25,7 @@ namespace caffe2 { * * TODO(benz): Fix the error to not match nodes that write to external output. */ -class CommonSubexpressionEliminationTransform : public Transform { +class CAFFE2_API CommonSubexpressionEliminationTransform : public Transform { public: CommonSubexpressionEliminationTransform() { SetPatternMatchType(SORTED_WRT_EXECUTION_ORDER); diff --git a/caffe2/transforms/conv_to_nnpack_transform.h b/caffe2/transforms/conv_to_nnpack_transform.h index 6438b147b5f3a9..83f91c364c5b02 100644 --- a/caffe2/transforms/conv_to_nnpack_transform.h +++ b/caffe2/transforms/conv_to_nnpack_transform.h @@ -7,7 +7,7 @@ namespace caffe2 { -class ConvToNNPackTransform : public SingleOpTransform { +class CAFFE2_API ConvToNNPackTransform : public SingleOpTransform { protected: // Specify what the op needs to be to match the pattern. bool MatchOperator(const OperatorDef& op) override { diff --git a/caffe2/transforms/pattern_net_transform.h b/caffe2/transforms/pattern_net_transform.h index 1f54ccc1eb2d08..c22b42d9deb143 100644 --- a/caffe2/transforms/pattern_net_transform.h +++ b/caffe2/transforms/pattern_net_transform.h @@ -15,7 +15,7 @@ namespace caffe2 { * and this Transform will find subgraphs which fit the pattern net, * and replace it with the replace net. */ -class PatternNetTransform : public Transform { +class CAFFE2_API PatternNetTransform : public Transform { public: PatternNetTransform(const NetDef& pattern_net, const NetDef& replace_net) : p_(transform::Graph(pattern_net)), r_(transform::Graph(replace_net)) { diff --git a/caffe2/transforms/single_op_transform.h b/caffe2/transforms/single_op_transform.h index dbc53e33831251..ae21f9aaa8d35e 100644 --- a/caffe2/transforms/single_op_transform.h +++ b/caffe2/transforms/single_op_transform.h @@ -15,7 +15,7 @@ namespace caffe2 { * Transforms which derive from SingleOpTransform need to override: * ReplaceOperator and MatchOperator. 
*/ -class SingleOpTransform : public Transform { +class CAFFE2_API SingleOpTransform : public Transform { protected: bool PatternRule( const transform::Graph& g, diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index 4e8d2268258416..bb42109b770f6e 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -4,6 +4,24 @@ # - Creates an ATen target for its generated C++ files and adds it # as a dependency +################################################################################ +# Helper functions +################################################################################ + +function(filter_list output input) + unset(result) + foreach(filename ${${input}}) + foreach(pattern ${ARGN}) + if("${filename}" MATCHES "${pattern}") + list(APPEND result "${filename}") + endif() + endforeach() + endforeach() + set(${output} ${result} PARENT_SCOPE) +endfunction() + +################################################################################ + if (DEFINED ENV{PYTORCH_PYTHON}) message(STATUS "Using python found in $ENV{PYTORCH_PYTHON}") set(PYCMD "$ENV{PYTORCH_PYTHON}") @@ -20,7 +38,7 @@ configure_file( install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2 DESTINATION include FILES_MATCHING PATTERN "*.h") -if (NOT BUILD_ATEN) +if (BUILD_ATEN_MOBILE) install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core DESTINATION include/ATen FILES_MATCHING PATTERN "*.h") @@ -29,7 +47,7 @@ install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h DESTINATION include/caffe2/core) # ---[ ATen specific -if (BUILD_ATEN) +if (NOT BUILD_ATEN_MOBILE) # SET_SOURCE_FILES_PROPERTIES must be in the same CMakeLists.txt file as the target that includes the file # so we need to set these commands here rather than in src/TH IF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 887e357fd0c0f8..6f8609fc8007bf 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -61,7 +61,7 @@ if(BUILD_CAFFE2) endif() # ---[ BLAS -if(BUILD_ATEN) +if(NOT BUILD_ATEN_MOBILE) set(BLAS "MKL" CACHE STRING "Selected BLAS library") else() set(BLAS "Eigen" CACHE STRING "Selected BLAS library") @@ -512,7 +512,7 @@ if(USE_CUDA) endif() # ---[ HIP -if(BUILD_CAFFE2 OR BUILD_ATEN) +if(BUILD_CAFFE2 OR NOT BUILD_ATEN_MOBILE) include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake) if(PYTORCH_FOUND_HIP) message(INFO "Compiling with HIP for AMD.") @@ -539,19 +539,27 @@ if(BUILD_CAFFE2 OR BUILD_ATEN) set(Caffe2_HIP_DEPENDENCY_LIBS ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES}) # Additional libraries required by PyTorch AMD that aren't used by Caffe2 (not in Caffe2's docker image) - if(BUILD_ATEN) + if(NOT BUILD_ATEN_MOBILE) set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} ${hipsparse_LIBRARIES}) endif() # TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES} list(APPEND Caffe2_HIP_DEPENDENCY_LIBS roc::rocblas) + + # TODO: Currently pytorch hipify script uses a feature called + # "disabled_modules" that effectively ifdef out a file, but + # without doing extra processing in the callers, which results in + # some unresolved symbols in the shared lib + # (libcaffe2_hip.so). Remove this when all disabled_modules are + # eliminated. 
+ set(CMAKE_EXE_LINKER_FLAGS "-Wl,--unresolved-symbols=ignore-in-shared-libs ${CMAKE_EXE_LINKER_FLAGS}") else() caffe2_update_option(USE_ROCM OFF) endif() endif() # ---[ ROCm -if(USE_ROCM AND NOT BUILD_CAFFE2) +if(USE_ROCM) include_directories(SYSTEM ${HIP_PATH}/include) include_directories(SYSTEM ${ROCBLAS_PATH}/include) include_directories(SYSTEM ${HIPSPARSE_PATH}/include) @@ -745,7 +753,7 @@ if (USE_NNAPI AND NOT ANDROID) caffe2_update_option(USE_NNAPI OFF) endif() -if (BUILD_ATEN) +if (NOT BUILD_ATEN_MOBILE) if (BUILD_CAFFE2) list(APPEND Caffe2_DEPENDENCY_LIBS aten_op_header_gen) if (USE_CUDA) @@ -809,7 +817,7 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) endif() # --[ ATen checks -if (BUILD_ATEN) +if (NOT BUILD_ATEN_MOBILE) set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST}) set(TORCH_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS}) @@ -846,28 +854,26 @@ if (BUILD_ATEN) #Check if certain std functions are supported. Sometimes #_GLIBCXX_USE_C99 macro is not defined and some functions are missing. - if (NOT ANDROID) - CHECK_CXX_SOURCE_COMPILES(" - #include - #include - - int main() { - int a = std::isinf(3.0); - int b = std::isnan(0.0); - std::string s = std::to_string(1); - - return 0; - }" SUPPORT_GLIBCXX_USE_C99) - - if (NOT SUPPORT_GLIBCXX_USE_C99) - message(FATAL_ERROR - "The C++ compiler does not support required functions. " - "This is very likely due to a known bug in GCC 5 " - "(and maybe other versions) on Ubuntu 17.10 and newer. " - "For more information, see: " - "https://github.com/pytorch/pytorch/issues/5229" - ) - endif() + CHECK_CXX_SOURCE_COMPILES(" + #include + #include + + int main() { + int a = std::isinf(3.0); + int b = std::isnan(0.0); + std::string s = std::to_string(1); + + return 0; + }" SUPPORT_GLIBCXX_USE_C99) + + if (NOT SUPPORT_GLIBCXX_USE_C99) + message(FATAL_ERROR + "The C++ compiler does not support required functions. " + "This is very likely due to a known bug in GCC 5 " + "(and maybe other versions) on Ubuntu 17.10 and newer. 
" + "For more information, see: " + "https://github.com/pytorch/pytorch/issues/5229" + ) endif() # Top-level build config diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index e1debe8be669f2..091d1f3c28a06c 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -19,7 +19,7 @@ function (caffe2_print_configuration_summary) message(STATUS "") message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}") - message(STATUS " BUILD_ATEN : ${BUILD_ATEN}") + message(STATUS " BUILD_ATEN_MOBILE : ${BUILD_ATEN_MOBILE}") message(STATUS " BUILD_BINARY : ${BUILD_BINARY}") message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}") if (${CAFFE2_LINK_LOCAL_PROTOBUF}) @@ -45,7 +45,6 @@ function (caffe2_print_configuration_summary) message(STATUS " BUILD_TEST : ${BUILD_TEST}") message(STATUS " USE_ASAN : ${USE_ASAN}") - message(STATUS " USE_ATEN : ${USE_ATEN}") message(STATUS " USE_CUDA : ${USE_CUDA}") if(${USE_CUDA}) message(STATUS " CUDA static link : ${CAFFE2_STATIC_LINK_CUDA}") @@ -127,11 +126,9 @@ function (caffe2_print_configuration_summary) message(STATUS " USE_REDIS : ${USE_REDIS}") message(STATUS " USE_ROCKSDB : ${USE_ROCKSDB}") message(STATUS " USE_ZMQ : ${USE_ZMQ}") - if(${BUILD_ATEN}) - message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}") - if(${USE_DISTRIBUTED}) - message(STATUS " USE_DISTRIBUTED_MW : ${USE_DISTRIBUTED_MW}") - endif() + message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}") + if(${USE_DISTRIBUTED}) + message(STATUS " USE_DISTRIBUTED_MW : ${USE_DISTRIBUTED_MW}") endif() message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 62185d1e9dc821..d31a732ed21c8d 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -318,7 +318,6 @@ if [[ -n $integrated ]]; then #add_package $cuda_feature_name conda_channel+=('-c pytorch') - caffe2_cmake_args+=("-DUSE_ATEN=ON") fi fi diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 1f039f648dfcbf..bf2cf4ebb719c3 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -559,6 +559,16 @@ def forward(self, input): input = Variable(torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9)) self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) + def test_log(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, input): + return input.log() + input = Variable(torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9)) + self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) + def test_trigonometry(self): def test_func(name): class MyModel(torch.nn.Module): diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index 1f33081beed272..3db7a42ffd2361 100755 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -1,3 +1,4 @@ +import os import unittest import sys @@ -15,7 +16,10 @@ from torch.utils.cpp_extension import CUDA_HOME TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None -TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_available() +TEST_CUDNN = False +if TEST_CUDA: + CUDNN_HEADER_EXISTS = os.path.isfile(os.path.join(CUDA_HOME, 'include/cudnn.h')) + TEST_CUDNN = TEST_CUDA and CUDNN_HEADER_EXISTS and torch.backends.cudnn.is_available() class TestCppExtension(common.TestCase): diff --git a/test/test_jit.py b/test/test_jit.py index f7945cd9cb25c3..a9766739feaf46 100644 --- a/test/test_jit.py +++ b/test/test_jit.py 
@@ -2336,7 +2336,7 @@ def reassign_from_empty_literal(): if True: x = [1, 2, 3] return - with self.assertRaisesRegex(RuntimeError, "Empty list literals not allowed"): + with self.assertRaisesRegex(RuntimeError, "previously has type Tensor\[\]"): self.checkScript(reassign_from_empty_literal, (), optimize=False) def reassign_from_empty_builtin(): @@ -5754,6 +5754,22 @@ def foo(self, x : torch.Tensor, y : Tuple[torch.Tensor, Tensor]) -> Tuple[Tensor fn = self._get_py3_code(code, 'instance') self.assertExpected(fn.__getattr__('foo').pretty_print_schema()) + def test_method_casts_script(self): + cast_types = [ + 'byte', 'char', 'double', 'float', 'int', 'long', 'short' + ] + + for cast_type in cast_types: + cu = torch.jit.CompilationUnit(''' + def cast_to(x): + return x.{cast_type}() + '''.format(cast_type=cast_type)) + + x = torch.rand(3, 4, 5) * 128 + cu_result = cu.cast_to(x) + reference = getattr(x, cast_type)() + self.assertEqual(cu_result, reference) + class TestEndToEndHybridFrontendModels(JitTestCase): diff --git a/test/test_sparse.py b/test/test_sparse.py index 1e47ec1b202c29..6bed41a64f7494 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -911,6 +911,7 @@ def test_storage_not_null(self): @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") + @skipIfRocm def test_same_gpu(self): i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(1) @@ -959,6 +960,7 @@ def test_new(self): self.assertEqual(x.new(indices, values, x.size()), x) @cpu_only # not really, but we only really want to run this once + @skipIfRocm def test_factory(self): default_size = torch.Size([1, 3]) size = torch.Size([3, 3]) @@ -987,6 +989,7 @@ def test_factory(self): self.assertEqual(device, sparse_tensor._values().device) self.assertEqual(True, sparse_tensor.requires_grad) + @skipIfRocm def test_factory_size_check(self): indices = self.IndexTensor([[1, 2], [0, 2]]) values = self.ValueTensor([.5, .5]) @@ -1016,6 +1019,7 @@ def test_factory_type_inference(self): self.assertEqual(torch.int64, t.dtype) @cuda_only + @skipIfRocm def test_factory_device_type_inference(self): # both indices/values are CUDA shape = (1, 3) @@ -1127,6 +1131,7 @@ def setUp(self): class TestSparseOneOff(TestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') + @skipIfRocm def test_cuda_from_cpu(self): self.assertExpectedRaises( RuntimeError, @@ -1135,6 +1140,7 @@ def test_cuda_from_cpu(self): [3, 4, 4])) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') + @skipIfRocm def test_cuda_sparse_cpu_dense_add(self): x = torch.zeros(3, 4, 4) sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), diff --git a/test/test_utils.py b/test/test_utils.py index b28b4f83171aaf..af93e3652e63be 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -414,6 +414,7 @@ def test_cpu(self): @unittest.skipIf(IS_WINDOWS, "ffi doesn't currently work on Windows") @skipIfRocm def test_gpu(self): + from torch.utils.cpp_extension import CUDA_HOME create_extension( name='gpulib', headers=[test_dir + '/ffi/src/cuda/cudalib.h'], @@ -422,6 +423,7 @@ def test_gpu(self): ], with_cuda=True, verbose=False, + include_dirs=[os.path.join(CUDA_HOME, 'include')], ).build() import gpulib tensor = torch.ones(2, 2).float() diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml index c9e9daae89f5ef..e0384c9fec9c44 100644 --- a/tools/amd_build/disabled_features.yaml +++ b/tools/amd_build/disabled_features.yaml @@ -145,7 +145,6 @@ "aten/src/ATen/native/cuda/CuFFTUtils.h", 
"aten/src/ATen/native/cuda/CuFFTPlanCache.h", "aten/src/ATen/native/cuda/SpectralOps.cu", - "aten/src/ATen/native/cuda/Distributions.cu", ], "disabled_functions": [ { @@ -171,7 +170,8 @@ "functions": [ "_s_poisson_cuda", "poisson_cuda_kernel", - "gamma_cuda_kernel" + "gamma_cuda_kernel", + "gamma_grad_cuda_kernel", ] }, { diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index ec18705fe86151..2f8b3ae1c5ebce 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -183,7 +183,6 @@ goto:eof -DNVTOOLEXT_HOME="%NVTOOLEXT_HOME%" ^ -DNO_API=ON ^ -DBUILD_SHARED_LIBS="%BUILD_SHARED_LIBS%" ^ - -DBUILD_ATEN=ON ^ -DBUILD_PYTHON=OFF ^ -DBUILD_BINARY=OFF ^ -DONNX_NAMESPACE=%ONNX_NAMESPACE% ^ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index c1e0e1975167f2..f53de42c90a60b 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -265,7 +265,6 @@ function build_caffe2() { -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DBUILD_CAFFE2=$FULL_CAFFE2 \ -DBUILD_TORCH=$BUILD_TORCH \ - -DBUILD_ATEN=ON \ -DBUILD_PYTHON=$FULL_CAFFE2 \ -DBUILD_BINARY=OFF \ -DBUILD_SHARED_LIBS=$BUILD_SHARED_LIBS \ diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 1fe8d69fbbf91b..583d0155023e19 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -254,9 +254,22 @@ struct Environment { throw ErrorReport(loc) << "Cannot re-assign '" << name << "' because it has type " << value->kind() << " and " << name << " is not a first-class value. Only reassignments to first-class values are allowed"; } - if(!as_simple_value->type()->isSubtypeOf(unshapedType(simple_parent->type()))) { - throw ErrorReport(loc) << "variable '" << name << "' previously has type " << simple_parent->type()->str() - << " but is now being assigned to a value of type " << as_simple_value->type()->str(); + if (!as_simple_value->type()->isSubtypeOf( + unshapedType(simple_parent->type()))) { + std::stringstream errMsg; + errMsg << "variable '" << name << "' previously has type " + << simple_parent->type()->str() + << " but is now being assigned to a value of type " + << as_simple_value->type()->str(); + // Special-cased error msg if we're trying to assign to a tensor list. + if (simple_parent->type()->kind() == TypeKind::ListType && + as_simple_value->type()->kind() == TypeKind::ListType) { + errMsg << "\n. (Note: empty lists are constructed as Tensor[]; " + << "if you want an empty list of a different type, " + << "use `_construct_empty_foo_list`, " + << "where `foo` is `int` or `float`)"; + } + throw ErrorReport(loc) << errMsg.str(); } } if (as_simple_value) @@ -1374,12 +1387,10 @@ struct to_ir { case TK_LIST_LITERAL: { auto ll = ListLiteral(tree); auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); - if (values.size() == 0) { - throw ErrorReport(tree) << "Empty list literals not allowed. " - << "Use _construct_empty_foo_list() instead. " - << "`foo` can be `int`, `float` or `tensor`"; - } - const auto elem_type = values.at(0)->type(); + + // If this is an empty list literal `[]`, construct an empty Tensor[] + const auto elem_type = + values.empty() ? 
DynamicType::get() : values.at(0)->type(); for (auto v : values) { if (v->type() != elem_type) { throw ErrorReport(tree) @@ -1481,9 +1492,27 @@ struct to_ir { } }; +static const std::unordered_map<std::string, std::string> &builtin_cast_methods() { + static std::unordered_map<std::string, std::string> builtin_cast_methods = { + {"byte", "_cast_Byte"}, + {"char", "_cast_Char"}, + {"double", "_cast_Double"}, + {"float", "_cast_Float"}, + {"int", "_cast_Int"}, + {"long", "_cast_Long"}, + {"short", "_cast_Short"}, + {"half", "_cast_Half"} + }; + return builtin_cast_methods; +} + // support syntax sugar for x.foo(y, z) by allowing x.foo to return a // callable value that will resolve to foo(x, y, z) when called. std::shared_ptr SimpleValue::attr(SourceRange loc, Method & m, const std::string& field) { + // Allow method-style casts on Tensor types. e.g. x.int() + if (value->type()->isSubtypeOf(DynamicType::get()) && builtin_cast_methods().count(field)) { + return std::make_shared(builtin_cast_methods().at(field), NamedValue(loc, "self", value)); + } return std::make_shared(field, NamedValue(loc, "self", value)); } @@ -1583,7 +1612,6 @@ TypePtr parseTypeFromExpr(Expr expr) { && select.selector().name() == "Tensor") { return ident_to_type_lut().at("Tensor"); } - std::cout << select << std::endl; } throw ErrorReport(expr.range()) << "Expression of type " << kindToString(expr.kind()) << " cannot be used in a type expression"; diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index 8c1ee681122edd..db767610db8a9e 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -359,7 +359,7 @@ def reduce_multigpu(tensor_list, dst, op=reduce_op.SUM, group=group.WORLD): def reduce(tensor, dst, op=reduce_op.SUM, group=group.WORLD): r"""Reduces the tensor data across all machines. - Only the process with rank :attr`dst` is going to receive the final result. + Only the process with rank :attr:`dst` is going to receive the final result. Arguments: tensor (Tensor): Input and output of the collective.
The function diff --git a/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp b/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp index dd4bc47f52f981..78ce8b7951c1ee 100644 --- a/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp @@ -96,7 +96,7 @@ class NCCLTest : public NCCLTestBase { // Copy inputs to outputs for (auto i = 0; i < numDevices_; i++) { cudaStreamSynchronize(streams_[i].getStream()); - outputs[i] = inputs_[i].toBackend(at::kCPU); + outputs[i] = inputs_[i].cpu(); } return outputs; @@ -115,7 +115,7 @@ class NCCLTest : public NCCLTestBase { for (auto i = 0; i < numDevices_; ++i) { cudaStreamSynchronize(streams_[i].getStream()); for (auto j = 0; j < worldSize_; ++j) { - outputs[i][j] = outputs_[i][j].toBackend(at::kCPU); + outputs[i][j] = outputs_[i][j].cpu(); } } return outputs; diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index f5e7503f71f971..688eb0cdd9bbf0 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -749,6 +749,10 @@ def abs(g, self): return g.op("Abs", self) +def log(g, self): + return g.op("Log", self) + + def pow(g, self, exponent): exponent = _maybe_get_scalar(exponent) return g.op("Pow", self, _if_scalar_type_as(g, exponent, self)) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 0f05191f3c5104..2a1815e7d9708b 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -69,6 +69,10 @@ def _find_cuda_home(): BUILT_FROM_SOURCE_VERSION_PATTERN = re.compile(r'\d+\.\d+\.\d+\w+\+\w+') +def is_binary_build(): + return not BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__) + + def check_compiler_abi_compatibility(compiler): ''' Verifies that the given compiler is ABI-compatible with PyTorch. @@ -81,7 +85,7 @@ def check_compiler_abi_compatibility(compiler): False if the compiler is (likely) ABI-incompatible with PyTorch, else True. ''' - if BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__): + if not is_binary_build(): return True try: check_cmd = '{}' if sys.platform == 'win32' else '{} --version' @@ -138,6 +142,7 @@ def build_extensions(self): self._check_abi() for extension in self.extensions: self._define_torch_extension_name(extension) + self._add_gnu_abi_flag_if_binary(extension) # Register .cu and .cuh as valid source extensions. self.compiler.src_extensions += ['.cu', '.cuh'] @@ -270,6 +275,21 @@ def _define_torch_extension_name(self, extension): else: extension.extra_compile_args.append(define) + def _add_gnu_abi_flag_if_binary(self, extension): + # If the version string looks like a binary build, + # we know that PyTorch was compiled with gcc 4.9.2. + # if the extension is compiled with gcc >= 5.1, + # then we have to define _GLIBCXX_USE_CXX11_ABI=0 + # so that the std::string in the API is resolved to + # non-C++11 symbols + define = '-D_GLIBCXX_USE_CXX11_ABI=0' + if is_binary_build(): + if isinstance(extension.extra_compile_args, dict): + for args in extension.extra_compile_args.values(): + args.append(define) + else: + extension.extra_compile_args.append(define) + def CppExtension(name, sources, *args, **kwargs): ''' @@ -792,6 +812,9 @@ def _write_ninja_file(path, common_cflags = ['-DTORCH_EXTENSION_NAME={}'.format(name)] common_cflags += ['-I{}'.format(include) for include in includes] + if is_binary_build(): + common_cflags += ['-D_GLIBCXX_USE_CXX11_ABI=0'] + cflags = common_cflags + ['-fPIC', '-std=c++11'] + extra_cflags if sys.platform == 'win32': from distutils.spawn import _nt_quote_args
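The builtin_cast_methods table and the SimpleValue::attr hook added in torch/csrc/jit/script/compiler.cpp earlier in this diff are what test_method_casts_script exercises: method-style casts such as x.int() on a Tensor resolve to the corresponding _cast_* builtin. A minimal usage sketch mirroring that test:

    import torch

    cu = torch.jit.CompilationUnit('''
    def cast_to_int(x):
        return x.int()
    ''')

    x = torch.rand(3, 4, 5) * 128
    assert torch.equal(cu.cast_to_int(x), x.int())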
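test_log and the new log symbolic in torch/onnx/symbolic.py map aten log calls onto the ONNX Log op. A minimal export sketch, assuming an ONNX-enabled build (the module and buffer names are illustrative):

    import io
    import torch

    class LogModel(torch.nn.Module):
        def forward(self, x):
            return x.log()

    x = torch.empty(4, 10, 10).uniform_(4, 9)   # positive inputs keep log finite
    f = io.BytesIO()
    # With the symbolic registered, the exported graph should contain a Log node.
    torch.onnx.export(LogModel(), x, f)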
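Finally, the torch.utils.cpp_extension changes append -D_GLIBCXX_USE_CXX11_ABI=0 whenever torch.version.__version__ looks like a binary (non-source) build, so extensions compiled with newer gcc still link against wheels built with gcc 4.9.2. A minimal setup.py sketch that picks this up through BuildExtension (project and file names are illustrative):

    from setuptools import setup
    from torch.utils.cpp_extension import BuildExtension, CppExtension

    setup(
        name='my_extension',  # illustrative
        ext_modules=[CppExtension('my_extension', ['my_extension.cpp'])],
        # BuildExtension.build_extensions() runs _add_gnu_abi_flag_if_binary on each
        # extension, so the ABI define above is added automatically for binary builds.
        cmdclass={'build_ext': BuildExtension},
    )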