diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 9b6b28f2842b88..053a9be5e05487 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -100,6 +100,8 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Unknown reasons, need to debug rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/arg_ops_test.py") rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/piecewise_linear_transform_test.py") + rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py") + rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/unique_ops_test.py") # Need to go through roi ops to replace max(...) with fmaxf(...) rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/roi_align_rotated_op_test.py") @@ -107,12 +109,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") - - # These are fixed in rocm 1.8.2, re-enable them once our CI docker images are upgraded - rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/recurrent_net_executor_test.py") - rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py") - rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/conv_test.py") - rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/group_conv_test.py") fi # Python tests diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 6f3986b837d308..f6041932518db9 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -25,8 +25,6 @@ namespace onnx { namespace { -constexpr static int kKnownOpsetVersion = 6; - bool AlmostEqual(double a, double b) { constexpr static double kEps = 1e-15; return (fabs(a - b) < kEps); @@ -367,17 +365,19 @@ Caffe2Backend::get_special_operators() const { Caffe2Ops Caffe2Backend::CreateArgMaxMin( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { auto& attributes = onnx_node->attributes; if (!attributes.HasAttribute("axis")) { auto* attr = attributes.AddRewrittenAttribute("axis"); attr->set_i(0); } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreateCast(OnnxNode* onnx_node, int opset_version) { - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateCast( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); auto onnx_dtype = onnx_node->attributes.get("to", TensorProto::UNDEFINED); @@ -443,7 +443,7 @@ Caffe2Ops Caffe2Backend::CreateCast(OnnxNode* onnx_node, int opset_version) { Caffe2Ops Caffe2Backend::CreateConstant( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { CAFFE_ENFORCE_EQ(onnx_node->node.output_size(), 1); Caffe2Ops ret; @@ -486,7 +486,7 @@ Caffe2Ops Caffe2Backend::CreateConstant( // differently. 
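Taken together, the converter signature changes above replace the bare opset_version with a ConversionContext that bundles the opset version with a map of ONNX value infos, so type and shape information can influence conversion. On the Python side, a caller describes its inputs by serializing one ValueInfoProto per tensor, as the run_node changes later in this diff do. A minimal sketch of that pattern, assuming numpy inputs (the helper name is illustrative, not part of the change):

import numpy as np
from onnx import helper, mapping

def serialized_value_infos(named_arrays):
    # One serialized ValueInfoProto per input, carrying its dtype and shape.
    return [
        helper.make_tensor_value_info(
            name=name,
            elem_type=mapping.NP_TYPE_TO_TENSOR_TYPE[arr.dtype],
            shape=arr.shape,
        ).SerializeToString()
        for name, arr in named_arrays.items()
    ]

value_infos = serialized_value_infos({"C": np.zeros((1,), dtype=np.float32)})
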
Caffe2Ops Caffe2Backend::CreateConvPoolOpBase( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { const auto& node = onnx_node->node; auto& attributes = onnx_node->attributes; if (node.op_type().find("Global") == 0) { @@ -512,16 +512,18 @@ Caffe2Ops Caffe2Backend::CreateConvPoolOpBase( } } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreatePadPool( + OnnxNode* onnx_node, + const ConversionContext& ctx) { auto& node = onnx_node->node; auto& attributes = onnx_node->attributes; Caffe2Ops ret; // Pad bool padding = false; - const std::string pad_name = opset_version < 2 ? "paddings" : "pads"; + const std::string pad_name = ctx.opset_version() < 2 ? "paddings" : "pads"; const auto pad_input = dummy_->NewDummyName(); if (attributes.HasAttribute("count_include_pad") && attributes.HasAttribute(pad_name)) { @@ -561,7 +563,7 @@ Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) { } } // Pool - auto c2_ops = Caffe2Backend::CreateConvPoolOpBase(onnx_node, opset_version); + auto c2_ops = Caffe2Backend::CreateConvPoolOpBase(onnx_node, ctx); auto* pool_op = c2_ops.ops.Mutable(0); if (padding) { pool_op->set_input(0, pad_input); @@ -572,8 +574,10 @@ Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) { return ret; } -Caffe2Ops Caffe2Backend::CreateReshape(OnnxNode* onnx_node, int opset_version) { - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateReshape( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1); auto* op = c2_op.ops.Mutable(0); op->add_output(dummy_->NewDummyName()); @@ -583,7 +587,7 @@ Caffe2Ops Caffe2Backend::CreateReshape(OnnxNode* onnx_node, int opset_version) { Caffe2Ops Caffe2Backend::CreateReciprocal( OnnxNode* onnx_node, - int /*opset_version*/) { + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() != 1 || node.output_size() != 1) { CAFFE_THROW("Caffe2 Reciprocal should have 1 input and 1 output"); @@ -599,7 +603,9 @@ Caffe2Ops Caffe2Backend::CreateReciprocal( return ret; } -Caffe2Ops Caffe2Backend::CreateGather(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreateGather( + OnnxNode* onnx_node, + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() < 2 || node.output_size() < 1) { CAFFE_THROW("Caffe2 Gather should have 2 inputs and 1 output"); @@ -629,7 +635,9 @@ Caffe2Ops Caffe2Backend::CreateGather(OnnxNode* onnx_node, int opset_version) { return ret; } -Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreateGemm( + OnnxNode* onnx_node, + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() < 3 || node.output_size() < 1) { CAFFE_THROW("Caffe2 Gemm should have 3 inputs and 1 output"); @@ -667,7 +675,22 @@ Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) { auto trans_a = onnx_node->attributes.get("transA", 0L); auto trans_b = onnx_node->attributes.get("transB", 0L); auto broadcast = onnx_node->attributes.get("broadcast", 0L); - if ((!trans_a) && trans_b && broadcast) { + + bool use_fc = false; + if ((!trans_a) && trans_b) { + if (broadcast) { + use_fc = true; + } else { 
+ const auto input_c_vi_iter = ctx.value_infos().find(node.input(2)); + if (input_c_vi_iter != ctx.value_infos().end() && + input_c_vi_iter->second.type().tensor_type().shape().dim_size() == + 1) { + use_fc = true; + } + } + } + + if (use_fc) { auto* c2_op = ret.ops.Add(); BuildOperator(c2_op, "FC", {input_a, input_b, input_c}, {output}); } else { @@ -683,7 +706,7 @@ Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) { BuildOperator( c2_op, "MatMul", {input_a, input_b}, {ab}, {arg_trans_a, arg_trans_b}); c2_op = ret.ops.Add(); - if (opset_version >= 7) { + if (ctx.opset_version() >= 7) { BuildOperator(c2_op, "Add", {ab, input_c}, {output}); } else { caffe2::Argument arg_broadcast; @@ -696,10 +719,12 @@ Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) { return ret; } -Caffe2Ops Caffe2Backend::CreatePad(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreatePad( + OnnxNode* onnx_node, + const ConversionContext& ctx) { auto& attributes = onnx_node->attributes; ::google::protobuf::RepeatedField<::google::protobuf::int64> pads; - std::string pad_name = opset_version < 2 ? "paddings" : "pads"; + std::string pad_name = ctx.opset_version() < 2 ? "paddings" : "pads"; pads = attributes .get<::google::protobuf::RepeatedField<::google::protobuf::int64>>( pad_name); @@ -734,14 +759,16 @@ Caffe2Ops Caffe2Backend::CreatePad(OnnxNode* onnx_node, int opset_version) { attr->add_ints(pads.Get(6)); attr->add_ints(pads.Get(7)); - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } // TODO: Caffe2 Concat has an extra output. It should be only // used when doing training, so we should change Caffe2 to allow // 1 output. -Caffe2Ops Caffe2Backend::CreateConcat(OnnxNode* onnx_node, int opset_version) { - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateConcat( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1); auto* op = c2_op.ops.Mutable(0); op->add_output(dummy_->NewDummyName()); @@ -751,7 +778,7 @@ Caffe2Ops Caffe2Backend::CreateConcat(OnnxNode* onnx_node, int opset_version) { Caffe2Ops Caffe2Backend::CreateLogSoftmax( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() < 1 || node.output_size() < 1) { CAFFE_THROW("LogSoftmax should have 1 input and 1 output"); @@ -771,8 +798,10 @@ Caffe2Ops Caffe2Backend::CreateLogSoftmax( return ret; } -Caffe2Ops Caffe2Backend::CreateSlice(OnnxNode* onnx_node, int opset_version) { - auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateSlice( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); CAFFE_ENFORCE_EQ(op_tmp.ops.size(), 1); auto* op = op_tmp.ops.Mutable(0); std::unordered_map args; @@ -922,40 +951,42 @@ Caffe2Ops Caffe2Backend::CreateSlice(OnnxNode* onnx_node, int opset_version) { Caffe2Ops Caffe2Backend::CreateBatchNormalization( OnnxNode* onnx_node, - int opset_version) { - if (opset_version < 6) { + const ConversionContext& ctx) { + if (ctx.opset_version() < 6) { auto& attributes = onnx_node->attributes; attributes.remove("consumed_inputs"); } - if (opset_version >= 7) { + if (ctx.opset_version() >= 7) { auto& attributes = onnx_node->attributes; auto* attr = 
attributes.AddRewrittenAttribute("is_test"); attr->set_i(1); } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } Caffe2Ops Caffe2Backend::CreateSplit( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { auto& attributes = onnx_node->attributes; if (!attributes.HasAttribute("axis")) { auto* attr = attributes.AddRewrittenAttribute("axis"); attr->set_i(0); } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreateMatMul(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreateMatMul( + OnnxNode* onnx_node, + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() != 2) { CAFFE_THROW("MatMul should have 2 inputs"); } - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1); auto* op = c2_op.ops.Mutable(0); auto* broadcast_arg = op->add_arg(); @@ -965,10 +996,12 @@ Caffe2Ops Caffe2Backend::CreateMatMul(OnnxNode* onnx_node, int opset_version) { return c2_op; } -Caffe2Ops Caffe2Backend::CreateUpsample(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreateUpsample( + OnnxNode* onnx_node, + const ConversionContext& ctx) { auto& attributes = onnx_node->attributes; attributes.remove("mode"); - if (opset_version >= 7) { + if (ctx.opset_version() >= 7) { const auto& scales = attributes.get<::google::protobuf::RepeatedField>("scales"); if (scales.size() != 4) { CAFFE_THROW("The scales argument should have size 4"); @@ -976,7 +1009,7 @@ Caffe2Ops Caffe2Backend::CreateUpsample(OnnxNode* onnx_node, int opset_version) CAFFE_THROW("The first two elements in the scales argument must be 1"); } attributes.remove("scales"); - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); auto* op = c2_op.ops.Mutable(0); auto* c2_height = op->add_arg(); c2_height->set_name("height_scale"); @@ -986,21 +1019,25 @@ Caffe2Ops Caffe2Backend::CreateUpsample(OnnxNode* onnx_node, int opset_version) c2_width->set_f(scales.Get(3)); return c2_op; } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreateDropout(OnnxNode* onnx_node, int opset_version) { - if (opset_version >= 7) { +Caffe2Ops Caffe2Backend::CreateDropout( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + if (ctx.opset_version() >= 7) { auto& attributes = onnx_node->attributes; auto* attr = attributes.AddRewrittenAttribute("is_test"); attr->set_i(1); } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreateLRN(OnnxNode* onnx_node, int opset_version) { - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateLRN( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); const auto& attributes = onnx_node->attributes; if (!attributes.HasAttribute("alpha")) { auto* arg = c2_op.ops.Mutable(0)->add_arg(); @@ -1052,7 +1089,7 @@ Caffe2Backend::AllNamesInGraph(const GraphProto &graph) { // and then fixing things up further. 
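The Gemm converter above is the main consumer of the new context: with transA == 0 and transB == 1 it emits a single FC either when the legacy broadcast attribute is set or when the value infos say the bias C is one-dimensional, and otherwise falls back to Scale, Scale, MatMul and Add. A minimal sketch of exercising this through the Python binding, mirroring the test_gemm_conversion test added later in this diff (the C extension import path is an assumption based on caffe2/python/onnx/backend.py):

from onnx import TensorProto
from onnx.helper import make_node, make_tensor_value_info
from caffe2.proto import caffe2_pb2
import caffe2.python._import_c_extension as C  # assumed binding module

node = make_node('Gemm', ['A', 'B', 'C'], ['Y'], alpha=2., beta=3., transB=True)
backend = C.Caffe2Backend()

def op_types(op_strs):
    types = []
    for s in op_strs:
        op = caffe2_pb2.OperatorDef()
        op.ParseFromString(s)
        types.append(op.type)
    return types

# No shape information: Gemm lowers to MatMul followed by Add.
_, ops = backend.convert_node(node.SerializeToString())
print(op_types(ops))   # expected: ['Scale', 'Scale', 'MatMul', 'Add']

# Declaring C as 1-D lets the converter emit a single FC instead.
c_is_1d = make_tensor_value_info('C', TensorProto.FLOAT, (1,)).SerializeToString()
_, ops = backend.convert_node(node.SerializeToString(), [c_is_1d])
print(op_types(ops))   # expected: ['Scale', 'Scale', 'FC']
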
Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { Caffe2Ops ret; auto* c2_op = ret.ops.Add(); @@ -1064,12 +1101,12 @@ Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops( const auto onnx_op_type = node.op_type(); auto broken_version = caffe2::get_default( get_broken_operators(), onnx_op_type, std::numeric_limits::max()); - if (broken_version <= opset_version) { + if (broken_version <= ctx.opset_version()) { CAFFE_THROW( "Don't know how to translate op ", onnx_op_type, " in ONNX operator set v", - opset_version, + ctx.opset_version(), " (I only support prior to v", broken_version); } @@ -1102,14 +1139,14 @@ Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops( Caffe2Ops Caffe2Backend::ConvertNode( const std::string& node_str, - int opset_version) { + const ConversionContext& ctx) { ::google::protobuf::RepeatedPtrField nodes; auto* n = nodes.Add(); ParseProtoFromLargeString(node_str, n); ModelProto init_model; ModelProto pred_model; OnnxNode onnx_node = OnnxNode(nodes.Get(0)); - return OnnxNodeToCaffe2Ops(init_model, pred_model, &onnx_node, opset_version); + return OnnxNodeToCaffe2Ops(init_model, pred_model, ctx, &onnx_node); } void Caffe2Backend::CheckOpSchemaArguments( @@ -1142,14 +1179,14 @@ void Caffe2Backend::CheckOpSchemaArguments( Caffe2Ops Caffe2Backend::OnnxNodeToCaffe2Ops( const ModelProto& init_model, const ModelProto& pred_model, - OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx, + OnnxNode* onnx_node) { Caffe2Ops res; if (get_special_operators().count(onnx_node->node.op_type())) { res = (this->*get_special_operators().at(onnx_node->node.op_type()))( - onnx_node, opset_version); + onnx_node, ctx); } else { - res = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + res = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } for (const auto& result_op: res.ops){ @@ -1198,6 +1235,17 @@ void Caffe2Backend::OnnxToCaffe2( name_set.insert(name_set_pred.begin(), name_set_pred.end()); dummy_->Reset(name_set); + ValueInfoMap graph_value_infos{}; + for (const auto& vi : pred_model.graph().input()) { + graph_value_infos[vi.name()].CopyFrom(vi); + } + for (const auto& vi : pred_model.graph().output()) { + graph_value_infos[vi.name()].CopyFrom(vi); + } + for (const auto& vi : pred_model.graph().value_info()) { + graph_value_infos[vi.name()].CopyFrom(vi); + } + size_t idx_extra = 0; auto converter = [&](const ModelProto& model, caffe2::NetDef* net) mutable { net->mutable_device_option()->CopyFrom(device_option); @@ -1230,9 +1278,16 @@ void Caffe2Backend::OnnxToCaffe2( " without enough extra preconverted string"); } } else { + ValueInfoMap value_infos{}; + for (const auto& name : node.input()) { + auto iter = graph_value_infos.find(name); + if (iter != graph_value_infos.end()) { + value_infos[name].CopyFrom(iter->second); + } + } auto onnx_node = OnnxNode(node); auto c2ops = OnnxNodeToCaffe2Ops( - init_model, pred_model, &onnx_node, opset_version); + init_model, pred_model, {value_infos, opset_version}, &onnx_node); init_net_tmp->mutable_op()->MergeFrom(c2ops.init_ops); net->mutable_op()->MergeFrom(c2ops.ops); net->mutable_external_input()->MergeFrom(c2ops.interface_blobs); diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index 437e572b8528b7..681ab5b30d10b0 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -11,6 +11,8 @@ #include #include +constexpr int kKnownOpsetVersion = 6; + namespace caffe2 { namespace onnx { @@ -19,6 +21,25 @@ using ::ONNX_NAMESPACE::GraphProto; 
using ::ONNX_NAMESPACE::ModelProto; using ::ONNX_NAMESPACE::NodeProto; using ::ONNX_NAMESPACE::TensorProto; +using ::ONNX_NAMESPACE::ValueInfoProto; + +using ValueInfoMap = std::unordered_map; + +class ConversionContext { + public: + ConversionContext(const ValueInfoMap& value_infos, int opset_version) + : value_infos_(value_infos), opset_version_(opset_version) {} + const ValueInfoMap& value_infos() const { + return value_infos_; + } + int opset_version() const { + return opset_version_; + } + + private: + const ValueInfoMap& value_infos_; + const int opset_version_; +}; // \brief This struct holds the converted ops after the onnx->c2 conversion. // Notice that for RNN ops, it may create ops in init_net. Hence we have the @@ -129,7 +150,9 @@ class Caffe2Backend { bool SupportOp(const std::string tyep) const; - Caffe2Ops ConvertNode(const std::string& node_str, int opset_version); + Caffe2Ops ConvertNode( + const std::string& node_str, + const ConversionContext& ctx); void BuildTensorFillingOp( caffe2::OperatorDef* c2_op, @@ -137,7 +160,8 @@ class Caffe2Backend { const std::string& name = ""); private: - using SpecialOpConverter = Caffe2Ops (Caffe2Backend::*)(OnnxNode*, int); + using SpecialOpConverter = + Caffe2Ops (Caffe2Backend::*)(OnnxNode*, const ConversionContext&); void OnnxToCaffe2( caffe2::NetDef* init_net, @@ -153,51 +177,56 @@ class Caffe2Backend { Caffe2Ops OnnxNodeToCaffe2Ops( const ModelProto& init_model, const ModelProto& pred_model, - OnnxNode* onnx_node, - int opset_version); + const ConversionContext& ctx, + OnnxNode* onnx_node); std::unordered_set AllNamesInGraph(const GraphProto& graph); - Caffe2Ops CommonOnnxNodeToCaffe2Ops(OnnxNode* onnx_node, int opset_version); - - Caffe2Ops CreateArgMaxMin(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CommonOnnxNodeToCaffe2Ops( + OnnxNode* onnx_node, + const ConversionContext& ctx); - Caffe2Ops CreateCast(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateArgMaxMin(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateConstant(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateCast(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateConvPoolOpBase(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateConstant(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreatePadPool(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateConvPoolOpBase( + OnnxNode* onnx_node, + const ConversionContext& ctx); - Caffe2Ops CreateReshape(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreatePadPool(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateGather(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateReshape(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateGemm(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateGather(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreatePad(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateGemm(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateConcat(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreatePad(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateLogSoftmax(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateConcat(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateSlice(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateLogSoftmax(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateSplit(OnnxNode* onnx_node, int opset_version); + Caffe2Ops 
CreateSlice(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateSplit(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateBatchNormalization(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateMatMul(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateBatchNormalization( + OnnxNode* onnx_node, + const ConversionContext& ctx); - Caffe2Ops CreateUpsample(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateMatMul(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateDropout(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateUpsample(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateLRN(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateDropout(OnnxNode* onnx_node, const ConversionContext& ctx); + Caffe2Ops CreateLRN(OnnxNode* onnx_node, const ConversionContext& ctx); // LUT related getters const std::unordered_map& get_renamed_operators() diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 81bedce653612d..cb9932bc4542a2 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -763,12 +763,13 @@ def ftrl(w, nz, i, g, alpha): self.assertReferenceChecks(gc, op, [var, nz, indices, grad, alpha], ftrl) + # TODO: (bddppq) test_unique keeps running into segfault on rocm 1.8.2 @given(input=hu.tensor(max_value=20, max_dim=1, dtype=np.int32, elements=st.integers(min_value=0, max_value=10)), with_remapping=st.booleans(), - **hu.gcs) + **hu.gcs_no_hip) def test_unique(self, input, with_remapping, gc, dc): op = core.CreateOperator( "Unique", diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index e501a7d41d3ecc..f640f6db20eff8 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -252,7 +252,11 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): cpu_do = caffe2_pb2.DeviceOption() gpu_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA) hip_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.HIP) -device_options = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) + ([hip_do] if workspace.has_hip_support else []) +# (bddppq) Do not rely on this no_hip option! It's just used to +# temporarily skip some flaky tests on ROCM before it's getting more mature. 
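The gcs_no_hip bundle defined just below is used the same way as hu.gcs: a test opts out of HIP by swapping the keyword-argument bundle in its @given decorator, as the unique and spatial_bn tests in this diff do. A minimal sketch with an illustrative operator:

import numpy as np
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
from caffe2.python import core

class TestSomeOp(hu.HypothesisTestCase):
    @given(X=hu.tensor(dtype=np.float32),
           **hu.gcs_no_hip)  # CPU/CUDA device options only; HIP is skipped
    def test_copy(self, X, gc, dc):
        op = core.CreateOperator("Copy", ["X"], ["Y"])
        self.assertDeviceChecks(dc, op, [X], [0])
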
+_device_options_no_hip = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) +device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else []) + # Include device option for each GPU expanded_device_options = [cpu_do] + ( [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i) @@ -275,6 +279,7 @@ def gradient_checker_device_option(): gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do])) +gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip)) @contextlib.contextmanager diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 7a7d9440d1aa77..dab79b8b1fb0b4 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -212,34 +212,35 @@ def run_node(cls, node, inputs, device='CPU', opset_version=_known_opset_version super(Caffe2Backend, cls).run_node(node, inputs, device=device, outputs_info=outputs_info, opset_version=opset_version) + value_infos = [] device_option = get_device_option(Device(device)) ws = Workspace() with core.DeviceScope(device_option): # temporary! if isinstance(inputs, dict): for key, value in inputs.items(): ws.FeedBlob(key, value) + value_infos.append(onnx.helper.make_tensor_value_info( + name=key, + elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[value.dtype], + shape=value.shape).SerializeToString()) else: assert len(node.input) == len(inputs), "{}: expected {} but got {}".format( node.op_type, len(node.input), len(inputs)) for key, value in zip(node.input, inputs): ws.FeedBlob(key, value) + value_infos.append(onnx.helper.make_tensor_value_info( + name=key, + elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[value.dtype], + shape=value.shape).SerializeToString()) ops = [] cbackend = C.Caffe2Backend(cls._dummy_name) - ops_str = cbackend.convert_node(node.SerializeToString(), opset_version) + ops_str = cbackend.convert_node(node.SerializeToString(), value_infos, opset_version) for s in ops_str[0] + ops_str[1]: op = caffe2_pb2.OperatorDef() op.ParseFromString(s) op.device_option.CopyFrom(device_option) ops.append(op) - # For testing - if "ONNX_CAFFE2_DEBUG" in os.environ: - init_ops, ops2, _ = cls._onnx_node_to_caffe2_op( - None, None, node, opset_version or cls._known_opset_version) - ops2 = init_ops + ops2 - for op in ops2: - op.device_option.CopyFrom(device_option) - print("\nC++:\n{}\nPython:\n{}".format(ops, ops2)) ws.RunOperatorsOnce(ops) output_values = [ws.FetchBlob(name) for name in node.output] return namedtupledict('Outputs', node.output)(*output_values) @@ -708,82 +709,34 @@ def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs): model = onnx.shape_inference.infer_shapes(model) - # Check whether we have RNN related ops - pred_model = cls.optimize_onnx(model, predict=True) - rnn_nodes = [] - for node in pred_model.graph.node: - if node.op_type in {'LSTM', 'GRU', 'RNN'}: - rnn_nodes.append(node) - - # Build the C++ backend - # TODO: build a predictor that supports GPU - # And for RNN nets, we need to avoid adding init_net - use_cpp_backend = device == 'CPU' and not rnn_nodes - # use python backend for now - use_cpp_backend = False - if use_cpp_backend: - c2_rnn_ops = [] - if rnn_nodes: - init_model = cls.optimize_onnx(model, init=True) - for node in rnn_nodes: - c2ops = cls._onnx_node_to_caffe2_op( - init_model, pred_model, node, opset_version) - init_ops = [x.SerializeToString() for x in c2ops.init_ops] - ops = [x.SerializeToString() 
for x in c2ops.ops] - external_inputs = c2ops.interface_blobs - c2_rnn_ops.append(C.Caffe2Ops(init_ops, ops, external_inputs)) - del init_model - - cbackend = C.Caffe2Backend(cls._dummy_name) - if raw_values_dict: - cls._external_value_resolution_pass(model, raw_values_dict) - rep = cbackend.prepare(model.SerializeToString(), device, c2_rnn_ops) - # For testing - # Dump the net descriptions to file for comparison with the Python ones - if "ONNX_CAFFE2_DEBUG" in os.environ: - pred_net_str = rep.pred_net() - pn = caffe2_pb2.NetDef() - pn.ParseFromString(pred_net_str) - init_net_str = rep.init_net() - inn = caffe2_pb2.NetDef() - inn.ParseFromString(init_net_str) - with open("cpp.txt", "w") as f: - f.write("pred_net: \n{}".format(pn)) - - rep_wrapper = Caffe2CppRep(rep) - return rep_wrapper - else: - ws = Workspace() - device_option = get_device_option(Device(device)) + ws = Workspace() + device_option = get_device_option(Device(device)) - init_net, predict_net = cls._onnx_model_to_caffe2_net(model, device, opset_version, False) + init_net, predict_net = cls._onnx_model_to_caffe2_net(model, device, opset_version, False) - if raw_values_dict: - cls._external_value_resolution_pass(model, raw_values_dict) + if raw_values_dict: + cls._external_value_resolution_pass(model, raw_values_dict) - # Directly load initializer data into blobs in workspace - cls._direct_initialize_parameters( - model.graph.initializer, - ws, - device_option, - ) + # Directly load initializer data into blobs in workspace + cls._direct_initialize_parameters( + model.graph.initializer, + ws, + device_option, + ) - initialized = {init.name for init in model.graph.initializer} + initialized = {init.name for init in model.graph.initializer} - cls._direct_initialize_inputs( - model.graph.input, - initialized, - ws, - device_option, - ) + cls._direct_initialize_inputs( + model.graph.input, + initialized, + ws, + device_option, + ) - uninitialized = [value_info.name for value_info in model.graph.input if value_info.name not in initialized] + uninitialized = [value_info.name for value_info in model.graph.input if value_info.name not in initialized] - if "ONNX_CAFFE2_DEBUG" in os.environ: - with open("python.txt", "w") as f: - f.write("pred_net: \n{}".format(predict_net)) - retval = Caffe2Rep(init_net, predict_net, ws, uninitialized) - return retval + retval = Caffe2Rep(init_net, predict_net, ws, uninitialized) + return retval @classmethod @@ -791,7 +744,20 @@ def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs): def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version): cbackend = C.Caffe2Backend(cls._dummy_name) if cbackend.support_onnx_import(node_def.op_type): - op_strs = cbackend.convert_node(node_def.SerializeToString(), opset_version) + + # extract value infos from pred model (value infos of + # node's inputs that are in init model should be all + # available in pred model) + value_infos = [] + for name in node_def.input: + if pred_model is not None: + for vi in itertools.chain(pred_model.graph.input, + pred_model.graph.output, + pred_model.graph.value_info): + if vi.name == name: + value_infos.append(vi.SerializeToString()) + + op_strs = cbackend.convert_node(node_def.SerializeToString(), value_infos, opset_version) init_ops = [] for s in op_strs[0]: op = caffe2_pb2.OperatorDef() diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index 97d824e05897a5..e526d74f73921a 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ 
b/caffe2/python/onnx/tests/c2_ref_test.py @@ -1,4 +1,4 @@ -## @package onnx +# @package onnx # Module caffe2.python.onnx.tests.c2_ref_test from __future__ import absolute_import @@ -39,14 +39,14 @@ def test_dummy_name(self): def test_check_arguments(self): b2 = C.Caffe2Backend() - node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"]) - b2.convert_node(node_def.SerializeToString(), 6) + node_def = make_node("Add", inputs=["X", "Y"], outputs=["Z"]) + b2.convert_node(node_def.SerializeToString()) - bad_node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"], foo = 42, bar = 56) + bad_node_def = make_node("Add", inputs=["X", "Y"], outputs=["Z"], foo=42, bar=56) with self.assertRaisesRegexp( - RuntimeError, - ".*?Don't know how to map unexpected argument (foo|bar) \(from operator .*?\).*$"): - b2.convert_node(bad_node_def.SerializeToString(), 6) + RuntimeError, + ".*?Don't know how to map unexpected argument (foo|bar) \(from operator .*?\).*$"): + b2.convert_node(bad_node_def.SerializeToString()) def test_relu_graph(self): X = np.random.randn(3, 2).astype(np.float32) @@ -199,6 +199,54 @@ def test_gemm(self): output["Y"], alpha * np.dot(A, B) + beta * C) + def test_gemm_conversion(self): + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=2., + beta=3., + transB=True) + + backend = C.Caffe2Backend() + + # without broadcast and without shape info, gemm will be + # converted to matmul + add + _, op_strs = backend.convert_node(node_def.SerializeToString()) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) + + # with shape info (that indicates C is 1D), gemm will be + # converted to FC + _, op_strs = backend.convert_node(node_def.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()]) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) + + # or with broadcast, gemm will be converted to fc + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + transB=True, + broadcast=1) + + _, op_strs = backend.convert_node(node_def.SerializeToString()) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['FC']) + def test_tensor_filling_ops(self): for dtype in [ onnx.TensorProto.FLOAT, @@ -267,7 +315,6 @@ def test_tensor_filling_ops_c_backend(self): np.testing.assert_almost_equal(output[0], vals) np.testing.assert_almost_equal(ws.FetchBlob(op.output[0]), vals) - def test_slice(self): X = np.random.randn(1, 2, 3).astype(np.float32) starts = np.array([0, 1, 0], dtype=np.int32) diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index cbc83bed116c4a..6854be44164b49 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -28,14 +28,11 @@ class TestSpatialBN(hu.HypothesisTestCase): order=st.sampled_from(["NCHW", "NHWC"]), epsilon=st.floats(min_value=1e-5, max_value=1e-2), inplace=st.sampled_from([True, False]), - **hu.gcs) + # Currently HIP SpatialBN only supports 2D + **hu.gcs_no_hip) def test_spatialbn_test_mode_3d( self, size, input_channels, batch_size, seed, order, epsilon, inplace, gc, dc): - # Currently HIP SpatialBN only supports 2D - if 
_run_in_hip(gc, dc): - return - op = core.CreateOperator( "SpatialBN", ["X", "scale", "bias", "mean", "var"], @@ -77,14 +74,11 @@ def reference_spatialbn_test(X, scale, bias, mean, var): order=st.sampled_from(["NCHW", "NHWC"]), epsilon=st.floats(min_value=1e-5, max_value=1e-2), inplace=st.sampled_from([True, False]), - **hu.gcs) + # Currently HIP SpatialBN only supports 2D + **hu.gcs_no_hip) def test_spatialbn_test_mode_1d( self, size, input_channels, batch_size, seed, order, epsilon, inplace, gc, dc): - # Currently HIP SpatialBN only supports 2D - if _run_in_hip(gc, dc): - return - op = core.CreateOperator( "SpatialBN", ["X", "scale", "bias", "mean", "var"], @@ -249,14 +243,11 @@ def test_spatialbn_train_mode_gradient_check( seed=st.integers(0, 65535), order=st.sampled_from(["NCHW", "NHWC"]), epsilon=st.floats(min_value=1e-5, max_value=1e-2), - **hu.gcs) + # Currently HIP SpatialBN only supports 2D + **hu.gcs_no_hip) def test_spatialbn_train_mode_gradient_check_1d( self, size, input_channels, batch_size, seed, order, epsilon, gc, dc): - # Currently HIP SpatialBN only supports 2D - if _run_in_hip(gc, dc): - return - op = core.CreateOperator( "SpatialBN", ["X", "scale", "bias", "mean", "var"], diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 70bc635193f19c..04df247d821daf 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -793,13 +793,21 @@ void addObjectMethods(py::module& m) { "convert_node", [](caffe2::onnx::Caffe2Backend& instance, const py::bytes& node_str, + const std::vector& value_infos_bytes, int opset_version) -> std::vector> { // Note that we return two lists of serialized ops. The first set is // init_ops and the second set is ops for pred net. When converting // RNN related op, it is possible that we will create ops in the // init_net. 
Hence the return structure here + caffe2::onnx::ValueInfoMap value_infos{}; + for (const auto& vi_bytes : value_infos_bytes) { + ::ONNX_NAMESPACE::ValueInfoProto vi{}; + vi.ParseFromString(vi_bytes); + auto name = vi.name(); + value_infos.emplace(std::move(name), std::move(vi)); + } auto c2ops = instance.ConvertNode( - node_str.cast(), opset_version); + node_str.cast(), {value_infos, opset_version}); std::vector> vals; vals.emplace_back(); auto& init_vals = vals.back(); @@ -816,12 +824,15 @@ void addObjectMethods(py::module& m) { normal_vals.emplace_back(py::bytes(out)); } return vals; - }) + }, + py::arg("node_str"), + py::arg("value_infos_bytes") = std::vector{}, + py::arg("opset_version") = kKnownOpsetVersion) .def( - "_build_tensor_filling_op", - [](caffe2::onnx::Caffe2Backend& instance, - const py::bytes& tensor_proto_str, - const std::string& name="") -> py::bytes { + "_build_tensor_filling_op", + [](caffe2::onnx::Caffe2Backend& instance, + const py::bytes& tensor_proto_str, + const std::string& name = "") -> py::bytes { caffe2::OperatorDef op; ::ONNX_NAMESPACE::TensorProto tp; ParseProtoFromLargeString(tensor_proto_str, &tp); @@ -829,7 +840,7 @@ void addObjectMethods(py::module& m) { std::string out; op.SerializeToString(&out); return py::bytes(out); - }); + }); py::class_(m, "Predictor") .def( diff --git a/docs/Makefile b/docs/Makefile index 2a63943f00f0ab..4a56c12ca22d89 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -23,6 +23,13 @@ docset: html cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png convert $(SPHINXPROJ).docset/icon@2x.png -resize 16x16 $(SPHINXPROJ).docset/icon.png +html-stable: + # stable differs from `make html` in two ways: + # 1) The stable logo is used instead of the unstable logo + # 2) There will not be a link to the stable docs. + # See conf.py for more details. + RELEASE=1 make html + .PHONY: help Makefile docset # Catch-all target: route all unknown targets to Sphinx using the new diff --git a/docs/source/conf.py b/docs/source/conf.py index b48a5ad27362a9..1eaaa3b9086d96 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os +import os # import sys # sys.path.insert(0, os.path.abspath('.')) import torch @@ -28,6 +28,8 @@ warnings.warn('unable to load "torchvision" package') import sphinx_rtd_theme +RELEASE = os.environ.get('RELEASE', False) + # -- General configuration ------------------------------------------------ @@ -54,6 +56,8 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] +if RELEASE: + templates_path = ['_templates-stable'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: @@ -122,6 +126,9 @@ } html_logo = '_static/img/pytorch-logo-dark-unstable.png' +if RELEASE: + html_logo = '_static/img/pytorch-logo-dark.svg' + # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/tools/cwrap/plugins/Broadcast.py b/tools/cwrap/plugins/Broadcast.py deleted file mode 100644 index 5b0a74167839c4..00000000000000 --- a/tools/cwrap/plugins/Broadcast.py +++ /dev/null @@ -1,362 +0,0 @@ -from . 
import CWrapPlugin -from string import Template - -# Arguments to the Broadcast Plugin: -# broadcast: args_to_broadcast_against [inplace] [fallback] -# [args_to_broadcast_against]: either a single argument (e.g. "arg1") or a comma-separated -# list of two arguments (e.g. "tensor1,tensor2") indicating -# arguments to broadcast specified argument (usually "self") against -# [inplace] will generate code for in-place function, which doesn't allow the in-place -# argument to be broadcast -# [fallback] if tensors aren't broadcastable, preserves "element number" pointwise behavior, -# where only number of elements need to match, and tensors are viewed as 1-dimensional. -# [dims] specify if the tensors shouldn't be broadcast to a specific tensor or tensors, but a combination -# of individual dimension sizes of a set of tensors. For example: addbmm(C,A,B) a.k.a. [C + A @ B] -# broadcasts C to the first dimension of A and the second dimension of B. Each dimension is specified as -# [arg].dim[#] and dimensions are comma-separated. So, to specify that the tensor should be -# broadcast to 3-dimensions with sizes: -# tensor0->size[0] x tensor1->size[1] x tensor2->size[2] -# you would write: -# dims:tensor0.dim0,tensor1.dim1,tensor2.dim2 -# [types] if the tensors should be of different types than THTensor, specify as X where -# the actual type to use is THXTensor (i.e. Byte for THByteTensor). If the type -# should be THTensor, use 'Real' - -# For out of place: -# Two args: expand the two args together -# Three args (fused kernels): (e.g. addcmul) expand all three args together -# Sketch of proof that this is the same: -# consider addcmul, under expansion we want: a + (b * c) = (a + b * c) [all expanded together] -# Let e(i, j) be the expansion of i with j, e(i, j, k) be the expansion of i with j,k -# -# Then a + (b * c) = e(a, e(b,c) * e(c,b)) + e(e(b,c) * e(c,b), a) -# = e(a, e(b,c)) + e(e(b,c) * e(c,b), a) (only size matters for second param) -# = e(a,b,c) + e(e(b,c) * e(c,b), a) (by associativity of max in expand) -# = e(a,b,c) + e(b,c,a) * e(c,b,a) (see L1) -# which is a + b * c all expanded together -# -# L1: Show e(i * j, a) = e(i,a) * e(j,a) where i,j have same size -# Consider any index _{ s_0, ..., s_n} -# e(i * j, a) = (i*j)_{f(s_0), ...,f(s_n)} where f is the expansion of that dimension with a -# = i_{f(s_0), ..., f(s_n)} * j_{f(s_0), ..., f(s_n)} by definition of pointwise operator -# = e(i,a) * e(j,a) - - -class Broadcast(CWrapPlugin): - - # Save and restore passed in arguments in case later plugins use - POST_TEMPLATE = Template( - """${arg_op_other} = ${arg_op_other}_save;\n""") - - def getPreArgStringTemplate(self, type=None): - if type is None: - ret = """THTensor *${arg_op_other}_save = ${arg_op_other}; - THTensorPtr ${arg_op_other}_guard(nullptr);\n""" - else: - cpu_t = "TH" + type + "Tensor" - gpu_t = "THCuda" + type + "Tensor" - ret = ("#if !IS_CUDA\n" + - cpu_t + " *${arg_op_other}_save = ${arg_op_other};\n" + - cpu_t + "Ptr ${arg_op_other}_guard(nullptr);\n" + - "#else\n" + - gpu_t + " *${arg_op_other}_save = ${arg_op_other};\n" + - "THPPointer<" + gpu_t + "> ${arg_op_other}_guard(nullptr);\n" + - "#endif\n") - return Template(ret) - - def getNewForExpand(self, type): - if type is None: - ret = """THTensor_(new)(LIBRARY_STATE_NOARGS);\n""" - else: - cpu_t = "TH" + type + "Tensor" - gpu_t = "THCuda" + type + "Tensor" - ret = ("#if !IS_CUDA\n" + - cpu_t + "_new(LIBRARY_STATE_NOARGS);\n" + - "#else\n" + - gpu_t + "_new(LIBRARY_STATE_NOARGS);\n" + - "#endif\n") - return ret - - def 
getExpandTemplate(self, same_size_check, expand_call, success_code, raise_errors): - if not raise_errors: - return Template( - "bool try_expand = !" + same_size_check + "\n" + - "if (try_expand) {\n" + - "bool expand_success = false;\n" + - "try {\n" + - expand_call + - "\nexpand_success = true;\n" + - "}\n" - "catch (std::exception &e) {}\n" + - "if(expand_success) {\n" + - success_code + - "\n}" + - "\n}\n") - else: - return Template( - "bool try_expand = !" + same_size_check + "\n" + - "if (try_expand) {\n" + - expand_call + "\n" + - success_code + "\n" - "}\n") - - def getOutPlacePreExpand2Template(self, type_op_a, type_op_other, raise_errors): - size_check = """THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other}->size, ${arg_op_other}->nDimension);""" - expand_code = ("${arg_op_a}_guard = \n" + self.getNewForExpand(type_op_a) + "\n" + - "${arg_op_other}_guard = \n" + self.getNewForExpand(type_op_other) + "\n" + - """expand_outplace2(LIBRARY_STATE ${arg_op_a}_guard.get(), ${arg_op_other}_guard.get(), - ${arg_op_a}, ${arg_op_other}, - \"${op_a}\", \"${op_other}\", !${raise_errors});""") - success_code = """${arg_op_a} = ${arg_op_a}_guard.get(); - ${arg_op_other} = ${arg_op_other}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - def getOutPlacePreExpand3Template(self, type_op_a, type_op_other1, type_op_other2, raise_errors): - size_check = """(THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other1}->size, ${arg_op_other1}->nDimension) && - THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other2}->size, ${arg_op_other2}->nDimension));""" - expand_code = ("${arg_op_a}_guard = \n" + self.getNewForExpand(type_op_a) + "\n" + - "${arg_op_other1}_guard = \n" + self.getNewForExpand(type_op_other1) + "\n" + - "${arg_op_other2}_guard = \n" + self.getNewForExpand(type_op_other2) + "\n" + - """expand_outplace3(LIBRARY_STATE ${arg_op_a}_guard.get(), - ${arg_op_other1}_guard.get(), ${arg_op_other2}_guard.get(), - ${arg_op_a}, ${arg_op_other1}, ${arg_op_other2}, - \"${op_a}\", \"${op_other1}\", \"${op_other2}\", !${raise_errors});""") - success_code = """${arg_op_a} = ${arg_op_a}_guard.get(); - ${arg_op_other1} = ${arg_op_other1}_guard.get(); - ${arg_op_other2} = ${arg_op_other2}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - OUT_PLACE_PRE_EXPAND_PRE_DIM_TEMPLATE = Template( - """if(THTensor_(nDimension)(LIBRARY_STATE ${arg_op_dim}) <= ${arg_op_dim_value}) { - THError("Argument %s requires at least %d dimensions, but only has %d", - "${op_dim}", ${arg_op_dim_value} + 1, THTensor_(nDimension)(LIBRARY_STATE ${arg_op_dim})); - } - int64_t ${arg_op_a}_dim${idx}_size = THTensor_(size)(LIBRARY_STATE ${arg_op_dim}, ${arg_op_dim_value});\n""") - - OUT_PLACE_PRE_EXPAND1_DIM_TEMPLATE = Template( - """THLongStoragePtr ${arg_op_a}_storage(THLongStorage_newWithSize1(${arg_op_a}_dim0_size));\n""") - - OUT_PLACE_PRE_EXPAND2_DIM_TEMPLATE = Template( - """THLongStoragePtr ${arg_op_a}_storage( - THLongStorage_newWithSize2(${arg_op_a}_dim0_size, ${arg_op_a}_dim1_size));\n""") - - OUT_PLACE_PRE_EXPAND3_DIM_TEMPLATE = Template( - """THLongStoragePtr ${arg_op_a}_storage( - THLongStorage_newWithSize3(${arg_op_a}_dim0_size, ${arg_op_a}_dim1_size, ${arg_op_a}_dim2_size));\n""") - - def getOutPlacePreExpandPostDimTemplate(self, type_op_a, raise_errors): - size_check = """THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, 
- ${arg_op_a}_storage->data, ${arg_op_a}_storage->size);""" - expand_code = ("${arg_op_a}_guard = \n" + self.getNewForExpand(type_op_a) + "\n" + - """expand(LIBRARY_STATE ${arg_op_a}_guard.get(), ${arg_op_a}, ${arg_op_a}_storage);""") - success_code = """${arg_op_a} = ${arg_op_a}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - OUT_PLACE_PRE_TEMPLATE = Template( - """${code_arg_op_a}${code_arg_op_other1}${code_arg_op_other2} - ${expand_code}""") - - def getInPlacePreExpand1Template(self, type_op_other, raise_errors): - size_check = """THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other}->size, ${arg_op_other}->nDimension);""" - expand_code = ("${arg_op_other}_guard = \n" + self.getNewForExpand(type_op_other) + "\n" + - """expand_inplace1(LIBRARY_STATE ${arg_op_other}_guard.get(), ${arg_op_other}, ${arg_op_a}, - \"${op_other}\", \"${op_a}\", !${raise_errors});""") - success_code = """${arg_op_other} = ${arg_op_other}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - def getInPlacePreExpand2Template(self, type_op_other1, type_op_other2, raise_errors): - size_check = """(THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other1}->size, ${arg_op_other1}->nDimension) && - THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other2}->size, ${arg_op_other2}->nDimension));""" - expand_code = ("${arg_op_other1}_guard = \n" + self.getNewForExpand(type_op_other1) + "\n" + - "${arg_op_other2}_guard = \n" + self.getNewForExpand(type_op_other2) + "\n" + - """expand_inplace2(LIBRARY_STATE ${arg_op_other1}_guard.get(), ${arg_op_other2}_guard.get(), - ${arg_op_other1}, ${arg_op_other2}, ${arg_op_a}, - \"${op_other1}\", \"${op_other2}\", \"${op_a}\", !${raise_errors});""") - success_code = """${arg_op_other1} = ${arg_op_other1}_guard.get(); - ${arg_op_other2} = ${arg_op_other2}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - IN_PLACE_PRE_TEMPLATE = Template( - """${code_arg_op_other1}${code_arg_op_other2} - ${expand_code}""") - - def initialize(self, cwrap): - self.cwrap = cwrap - - # Arguments: - # [0]: name of tensor to broadcast with (possibly two comma separated) - # [1] inplace (optional). In place operations only broadcast on second tensor argument - # [2] fallback (optional). 
Will fallback to applying to tensor of equal nElem if broadcast fails - def process_option_code_template(self, template, option): - new_code_pre = [] - new_code_post = [] - for _, arg in enumerate(option['arguments']): - if 'broadcast' not in arg: - continue - - params = arg.get('broadcast').split(" ") - op_a = arg.get('assign_name', arg['name']) - in_place = "inplace" in params - raise_errors = "false" if "fallback" in params else "true" - - param_others = params[0].split(",") - if len(param_others) > 2: - raise ValueError('Broadcast only supports up to 2 secondary parameters') - op_b = param_others[0] - op_c = param_others[1] if len(param_others) == 2 else None - arg_op_b = "arg_" + op_b - arg_op_a = "arg_" + op_a - arg_op_c = ("arg_" + op_c) if op_c else None - - dims_kvs = [] - for p in params: - if p.startswith("dims:"): - assert(raise_errors == "true") - if len(dims_kvs) != 0: - raise ValueError("multiple specifications of dims") - dims = p[len("dims:"):].split(",") - for dim in dims: - batchdim = dim.split(".") - assert len(batchdim) == 2 - assert batchdim[1].startswith("dim") - dim_val = batchdim[1][len("dim"):] - dims_kvs.append({"op": batchdim[0], "arg_op": "arg_" + batchdim[0], "val": dim_val}) - - assert len(dims_kvs) <= 3 - for p in params[1:]: - if p != "inplace" and p != "fallback" and not p.startswith("dims:") and not p.startswith("types:"): - raise ValueError("invalid parameter {}".format(p)) - - type_op_b = None - type_op_c = None - for p in params: - if p.startswith("types:"): - if not in_place and len(dims_kvs) > 0: - raise ValueError("type specification not supported yet for out-of-place functions " - "that specify explicit dimensions") - types = p[len("types:"):].split(",") - assert(len(types) == (2 if op_c else 1)) - type_op_b = None if types[0] == "Real" else types[0] - if op_c: - type_op_c = None if types[1] == "Real" else types[1] - - op_b_mapping = { - "op_a": op_a, - "op_other": op_b, - "arg_op_a": arg_op_a, - "arg_op_other": arg_op_b, - "raise_errors": raise_errors - } - op_c_mapping = { - "op_a": op_a, - "op_other": op_c, - "arg_op_a": arg_op_a, - "arg_op_other": arg_op_c, - "raise_errors": raise_errors - } - raise_errors_s = raise_errors == "true" - - if in_place: - code_arg_op_other1 = self.getPreArgStringTemplate(type=type_op_b).substitute(op_b_mapping) - code_arg_op_other2 = ( - self.getPreArgStringTemplate(type=type_op_c).substitute(op_c_mapping) if op_c else "") - - if op_c: - expand_code = self.getInPlacePreExpand2Template(type_op_b, type_op_c, raise_errors_s).substitute( - op_b_mapping, - op_other1=op_b, - op_other2=op_c, - arg_op_other1=arg_op_b, - arg_op_other2=arg_op_c) - else: - expand_code = self.getInPlacePreExpand1Template(type_op_b, raise_errors_s).substitute(op_b_mapping) - - new_code_pre.append(self.IN_PLACE_PRE_TEMPLATE.substitute( - arg_op_a=arg_op_a, - code_arg_op_other1=code_arg_op_other1, - code_arg_op_other2=code_arg_op_other2, - expand_code=expand_code, - raise_errors=raise_errors)) - new_code_pre.append("") - - post_code = self.POST_TEMPLATE.substitute(op_b_mapping) - if op_c: - post_code += self.POST_TEMPLATE.substitute(op_c_mapping) - - new_code_post.append(post_code) - new_code_post.append("") - else: - if len(dims_kvs) != 0: - code_arg_op_a = self.getPreArgStringTemplate().substitute(arg_op_other=arg_op_a) - code_arg_op_other1 = "" - code_arg_op_other2 = "" - expand_code = "" - for idx, kv in enumerate(dims_kvs): - expand_code += self.OUT_PLACE_PRE_EXPAND_PRE_DIM_TEMPLATE.substitute( - arg_op_a=arg_op_a, - op_dim=kv["op"], - 
arg_op_dim=kv["arg_op"], - arg_op_dim_value=kv["val"], - idx=idx) - - if len(dims_kvs) == 1: - expand_code += self.OUT_PLACE_PRE_EXPAND1_DIM_TEMPLATE.substitute( - arg_op_a=arg_op_a, - arg_op_dim0=dims_kvs[0]["arg_op"]) - elif len(dims_kvs) == 2: - expand_code += self.OUT_PLACE_PRE_EXPAND2_DIM_TEMPLATE.substitute( - arg_op_a=arg_op_a, - arg_op_dim0=dims_kvs[0]["arg_op"], - arg_op_dim1=dims_kvs[1]["arg_op"]) - else: - expand_code += self.OUT_PLACE_PRE_EXPAND3_DIM_TEMPLATE.substitute( - arg_op_a=arg_op_a, - arg_op_dim0=dims_kvs[0]["arg_op"], - arg_op_dim1=dims_kvs[1]["arg_op"], - arg_op_dim2=dims_kvs[2]["arg_op"]) - expand_code += self.getOutPlacePreExpandPostDimTemplate(None, raise_errors_s).substitute( - arg_op_a=arg_op_a, - raise_errors=raise_errors) - post_code = self.POST_TEMPLATE.substitute(arg_op_other=arg_op_a) - - else: - code_arg_op_a = self.getPreArgStringTemplate().substitute(arg_op_other=arg_op_a) - code_arg_op_other1 = self.getPreArgStringTemplate(type=type_op_b).substitute(op_b_mapping) - code_arg_op_other2 = (self.getPreArgStringTemplate(type=type_op_c).substitute(op_c_mapping) - if op_c else "") - - if op_c: - expand_template = self.getOutPlacePreExpand3Template(None, type_op_b, type_op_c, raise_errors_s) - expand_code = expand_template.substitute( - op_b_mapping, - op_other1=op_b, - op_other2=op_c, - arg_op_other1=arg_op_b, - arg_op_other2=arg_op_c) - - else: - expand_code = self.getOutPlacePreExpand2Template(None, type_op_b, raise_errors_s).substitute( - op_b_mapping) - - post_code = self.POST_TEMPLATE.substitute(arg_op_other=arg_op_a) - post_code += self.POST_TEMPLATE.substitute(op_b_mapping) - post_code += self.POST_TEMPLATE.substitute(op_c_mapping) if op_c else "" - - new_code_pre.append(self.OUT_PLACE_PRE_TEMPLATE.substitute( - code_arg_op_a=code_arg_op_a, - code_arg_op_other1=code_arg_op_other1, - code_arg_op_other2=code_arg_op_other2, - expand_code=expand_code)) - new_code_pre.append("") - - new_code_post.append(post_code) - new_code_post.append("") - - template = new_code_pre + template + new_code_post - return template diff --git a/tools/cwrap/plugins/__init__.py b/tools/cwrap/plugins/__init__.py index 7efb4a51bf1ce6..53789a0bed989a 100644 --- a/tools/cwrap/plugins/__init__.py +++ b/tools/cwrap/plugins/__init__.py @@ -432,4 +432,3 @@ def process_pre_arg_assign(self, template, option): from .AutoGPU import AutoGPU from .CuDNNPlugin import CuDNNPlugin from .WrapDim import WrapDim -from .Broadcast import Broadcast diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index a3e3c0f40f48c0..69b5036766e998 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -4,6 +4,7 @@ #include #include "torch/csrc/autograd/variable.h" #include "torch/csrc/utils/hash.h" +#include "torch/csrc/jit/stack.h" #include "torch/csrc/jit/variable_tensor_list.h" namespace torch { namespace jit { @@ -16,14 +17,15 @@ namespace torch { namespace jit { // since it is used along the hot-path of the JIT to check if the code // we have created is valid for the given inputs. 
-// TensorInfoPOD is only used internally in ArgumentSpec -// API users should use TensorInfo -struct TensorInfoPOD { +// ArgumentInfoPOD is only used internally in ArgumentSpec +// API users should use ArgumentInfo +struct ArgumentInfoPOD { // total size is 64-bit - unsigned type : 8; + unsigned is_tensor : 8; // all other fields are invalid if this is false + unsigned type : 8; // scalar type unsigned defined : 1; unsigned requires_grad : 1; - signed device : 22; + signed device : 14; uint32_t total_dims; // all TensorInfoPODs are in ArgumentSpec's tensor_info() array. // total_dims is the total number of dimensions seen so far // in all previous members of tensor_info(), including this tensor @@ -32,34 +34,38 @@ struct TensorInfoPOD { // for tensor 0, the offset is always 0 }; -static_assert(sizeof(TensorInfoPOD) == sizeof(int64_t), - "TensorInfoPOD must be 64-bit struct for ArgumentSpec encoding to work"); +static_assert(sizeof(ArgumentInfoPOD) == sizeof(int64_t), + "ArgumentInfoPOD must be 64-bit struct for ArgumentSpec encoding to work"); -struct TensorInfo; +struct ArgumentInfo; struct ArgumentSpec { - // note: tensors must always be variables - ArgumentSpec(bool with_grad, const variable_tensor_list & tensors) - : hash_code(0), ntensors(tensors.size()) { - int all_dims = 0; - for(size_t i = 0; i < ntensors; i++) { - all_dims += tensors[i].defined() ? tensors[i].ndimension() : 0; + ArgumentSpec(bool with_grad, at::ArrayRef inputs) + : hash_code(0), ninputs(inputs.size()) { + int32_t all_dims = 0; + const int32_t num_inputs = inputs.size(); + for (int32_t i = 0; i < num_inputs; i++) { + if (!inputs[i].isTensor()) continue; + auto tensor = inputs[i].toTensor(); + all_dims += tensor.defined() ? tensor.ndimension() : 0; } // allocate enough room for all TensorPODs and dimensions - data.resize(ntensors + all_dims*2); + data.resize(ninputs + all_dims*2); // and reinterpret our data array as these structs - TensorInfoPOD * pods = reinterpret_cast(data.data()); + ArgumentInfoPOD * pods = reinterpret_cast(data.data()); int64_t * next_dim = sizes_strides(); - int total_dims = 0; - for(size_t i = 0; i < ntensors; i++) { - const auto & t = tensors[i]; + int32_t total_dims = 0; + for(int32_t i = 0; i < num_inputs; i++) { auto & pod = pods[i]; + pod.is_tensor = static_cast(inputs[i].isTensor()); + if (!pod.is_tensor) continue; + at::Tensor t = inputs[i].toTensor(); pod.defined = t.defined(); - if(t.defined()) { - pod.type = static_cast(t.type().scalarType()); + if (pod.defined) { + pod.type = static_cast(t.type().scalarType()); pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); - pod.requires_grad = with_grad && static_cast(t).requires_grad(); + pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); total_dims += t.ndimension(); auto sizes = t.sizes(); std::copy(sizes.begin(),sizes.end(), next_dim); @@ -73,51 +79,54 @@ struct ArgumentSpec { } // we precompute the hash_code to minimize the time inside of hash // table operations where we may need to hold a compiler cache lock. 
- hash_code = hash_combine(0, ntensors); + hash_code = hash_combine(0, ninputs); for(auto d : data) { hash_code = hash_combine(hash_code, d); } } - // equality is fast: check ntensors, and then check the raw array data, + // equality is fast: check ninputs, and then check the raw array data, // there are no size/stride indirections bool operator==(const ArgumentSpec & spec) const { - return ntensors == spec.ntensors && data == spec.data; + return ninputs == spec.ninputs && data == spec.data; } bool operator!=(const ArgumentSpec & spec) const { return !(*this == spec); } - friend struct TensorInfo; - TensorInfo tensorInfo(size_t i) const; + friend struct ArgumentInfo; + ArgumentInfo at(size_t i) const; size_t size() const { - return ntensors; + return ninputs; } size_t hashCode() const { return hash_code; } private: - ArrayRef tensor_info() const { - return ArrayRef(reinterpret_cast(data.data()), ntensors); + ArrayRef tensor_info() const { + return ArrayRef(reinterpret_cast(data.data()), ninputs); } - // the start of the sizes_strides information, which comes after the TensorInfoPOD list. + // the start of the sizes_strides information, which comes after the ArgumentInfoPOD list. const int64_t* sizes_strides() const { - return data.data() + ntensors; + return data.data() + ninputs; } int64_t* sizes_strides() { - return data.data() + ntensors; + return data.data() + ninputs; } size_t hash_code; // precomputed on construction - uint32_t ntensors; - // layout is ntensors of TensorPOD (each 64-bit) followed by their size and stride info + int32_t ninputs; + // layout is ninputs of TensorPOD (each 64-bit) followed by their size and stride info // for 3 tensors: [t0POD][t1POD][t2POD][t0 sizes][t0 strides][t1 sizes][t1 strides][t2 sizes][t2 strides] std::vector data; }; -// public view of compressed TensorInfo -struct TensorInfo { - TensorInfo(const ArgumentSpec & spec, const int i) +// public view of compressed ArgumentInfo +struct ArgumentInfo { + ArgumentInfo(const ArgumentSpec & spec, const int i) : spec(spec), i(i) {} + bool isTensor() const { + return pod(i).is_tensor; + } at::ScalarType type() const { return at::ScalarType(pod(i).type); } @@ -148,20 +157,20 @@ struct TensorInfo { } private: // offsetinto sizes_strides() array where the sizes start for tensor j - // [valid range] valid range is [0, ntensors] - // (i.e. you can ask for the offset at ntensors, which would be the offset of the next tensor if it existed) + // [valid range] valid range is [0, ninputs] + // (i.e. 
you can ask for the offset at ninputs, which would be the offset of the next tensor if it existed) int sizes_strides_offset(int j) const { if(j == 0) return 0; return 2*pod(j - 1).total_dims; } - const TensorInfoPOD & pod(int j) const { + const ArgumentInfoPOD & pod(int j) const { return spec.tensor_info().at(j); } const ArgumentSpec & spec; const int i; }; -inline std::ostream & operator<<(std::ostream & out, const TensorInfo & info) { +inline std::ostream & operator<<(std::ostream & out, const ArgumentInfo & info) { if(!info.defined()) { return out << ""; } @@ -178,14 +187,14 @@ inline std::ostream& operator<<(std::ostream & out, const ArgumentSpec & spec) { for(size_t i = 0; i < spec.size(); ++i) { if (i > 0) out << ", "; - out << spec.tensorInfo(i); + out << spec.at(i); } out << "}"; return out; } -inline TensorInfo ArgumentSpec::tensorInfo(size_t i) const { - return TensorInfo(*this, i); +inline ArgumentInfo ArgumentSpec::at(size_t i) const { + return ArgumentInfo(*this, i); } }} diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 0324d1f3e44b80..2c595ffd679c27 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -34,6 +34,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -51,38 +52,51 @@ using autograd::variable_list; struct ExecutionPlanAutogradFunction : public autograd::Function { ExecutionPlanAutogradFunction(GraphExecutor graph, size_t capture_size) : graph(std::move(graph)) { - captures.reserve(capture_size); + is_var_capture.reserve(capture_size); + var_captures.reserve(capture_size); + ivalue_captures.reserve(capture_size); } + virtual variable_list apply(variable_list&& inputs) override { - // TODO: expensive copies here to convert to/from tensor_list - // TODO: because inputs is passed by const reference there is no - // way to release tensors incrementally as this runs - variable_tensor_list all_inputs; - all_inputs.reserve(captures.size() + inputs.size()); - all_inputs.insert(all_inputs.end(), inputs.begin(), inputs.end()); - for(auto & sv : captures) { - all_inputs.push_back(sv.unpack(this->shared_from_this())); + Stack stack; + stack.reserve(is_var_capture.size() + inputs.size()); + stack.insert(stack.end(), std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); + auto var_capture_it = var_captures.begin(); + auto ivalue_capture_it = ivalue_captures.begin(); + for (bool is_var : is_var_capture) { + if (is_var) { + stack.push_back(var_capture_it->unpack(this->shared_from_this())); + ++var_capture_it; + } else { + stack.push_back(*ivalue_capture_it); + ++ivalue_capture_it; + } + } + graph.run(stack); + return fmap(stack, [](IValue & val) { + return autograd::Variable(std::move(val).toTensor()); + }); + } + + void capture(const IValue & val) { + const bool is_tensor = val.isTensor(); + is_var_capture.push_back(is_tensor); + if (is_tensor) { + var_captures.emplace_back(Variable(val.toTensor()), false); + } else { + ivalue_captures.push_back(val); } - auto tensors = graph.run(std::move(all_inputs)); - // TODO: another copy that needs to be removed - return autograd::variable_list(tensors.begin(), tensors.end()); } private: friend struct ExecutionPlan; GraphExecutor graph; - std::vector captures; -}; - -// helper to run interpreter on variables until we switch -// everything to IValue -inline variable_tensor_list runOneStage(const Code & code, variable_tensor_list inputs) { - std::vector stack(inputs.begin(), inputs.end()); - 
InterpreterState(code).runOneStage(stack); - return variable_tensor_list(fmap(stack, [](IValue& v) { - return std::move(v).toTensor(); - })); -} + // INVARIANT: is_var_capture.size() == var_captures.size() + ivalue_captures.size() + std::vector is_var_capture; + std::vector var_captures; + std::vector ivalue_captures; +}; // an optimized way of executing the subgraph computed directly on // tensors rather than Variables. @@ -91,19 +105,25 @@ inline variable_tensor_list runOneStage(const Code & code, variable_tensor_list // to the output Variables if present. struct ExecutionPlan { ExecutionPlan(std::shared_ptr& graph) - : f(graph), graph(graph) {} + : f(graph), + graph(graph), + num_inputs(graph->inputs().size()), + num_outputs(graph->outputs().size()) {} ExecutionPlan(std::shared_ptr& graph, Gradient grad) : f(graph), graph(graph), grad(std::move(grad)), - grad_executor(this->grad.df) {} + grad_executor(this->grad.df), + num_inputs(graph->inputs().size()), + num_outputs(graph->outputs().size()) {} - variable_tensor_list run(variable_tensor_list&& stack) const { - if(grad) { - return runWithGrad(std::move(stack)); + void run(Stack & stack) const { + if (grad) { + return runWithGrad(stack); } - return runOneStage(f, std::move(stack)); + InterpreterState(f).runOneStage(stack); } + std::shared_ptr get_graph() const { return graph; } @@ -124,70 +144,73 @@ struct ExecutionPlan { } private: - // note: should be inplace to avoid allocations, but we have to switch from - // a list of tensor to a list of ivalues - std::vector unwrapVariables(variable_tensor_list && list) const { - return fmap(list, [](const Variable& v) -> IValue { - return v.defined() ? autograd::as_variable_ref(v).detach() : at::Tensor(); - }); - } - // note: should be inplace to avoid allocations, but we have to switch from - // a list of tensor to a list of ivalues - variable_tensor_list wrapTensors(tensor_list && list) const { - for(auto & v : list) { - v = autograd::make_variable(v, /*requires_grad=*/false); + void detachVariables(Stack & stack) const { + // It would be nice to use an ArrayRef here, but unfortunately those can only + // return const references, so we need to do a bunch of indexing ourselves. + const int64_t stack_size = stack.size(); + const int64_t stack_offset = stack_size - num_inputs; + for (int64_t i = stack_offset; i < stack_size; ++i) { + auto & v = stack[i]; + if (!v.isTensor()) continue; + auto t = std::move(v).toTensor(); + v = IValue{t.defined() ? 
autograd::as_variable_ref(t).detach() : std::move(t)}; } - return variable_tensor_list(std::move(list)); } // Capture (save) inputs that would be required to subsequently run backwards - void captureInputs(ExecutionPlanAutogradFunction & grad_fn, variable_tensor_list & inputs) const { - for(auto offset : grad.df_input_captured_inputs) { - grad_fn.captures.emplace_back(autograd::as_variable_ref(inputs[offset]), false); + void captureInputs(ExecutionPlanAutogradFunction & grad_fn, at::ArrayRef inputs) const { + for (size_t offset : grad.df_input_captured_inputs) { + grad_fn.capture(inputs[offset]); } } - void captureOutputs(ExecutionPlanAutogradFunction & grad_fn, variable_tensor_list & outputs) const { - for(auto offset : grad.df_input_captured_outputs) { - grad_fn.captures.emplace_back(autograd::as_variable_ref(outputs[offset]), true); + void captureOutputs(ExecutionPlanAutogradFunction & grad_fn, at::ArrayRef outputs) const { + for (size_t offset : grad.df_input_captured_outputs) { + grad_fn.capture(outputs[offset]); } } - variable_tensor_list runWithGrad(variable_tensor_list&& inputs) const { + // XXX: keep in mind that stack can be larger than the inputs we need! + void runWithGrad(Stack & stack) const { auto grad_fn = std::make_shared(grad_executor, grad.df_input_captured_inputs.size() + grad.df_input_captured_outputs.size()); - // hook up the outputs of df to the gradient functions of the inputs that require - // gradients - for(auto idx : grad.df_output_vjps) { - auto & v = autograd::as_variable_ref(inputs[idx]); - grad_fn->add_next_edge(v.gradient_edge()); + + { + auto inputs = last(stack, num_inputs); + // hook up the outputs of df to the gradient functions of the inputs that require gradients + for(auto idx : grad.df_output_vjps) { + auto v = Variable(inputs[idx].toTensor()); + grad_fn->add_next_edge(v.gradient_edge()); + } + captureInputs(*grad_fn, inputs); } - captureInputs(*grad_fn, inputs); - auto stack = unwrapVariables(std::move(inputs)); + detachVariables(stack); InterpreterState(f).runOneStage(stack); - variable_tensor_list outputs( - fmap(stack, [](IValue& v) { return std::move(v).toTensor(); })); - - // hookup the gradients for the output tensors that require gradients - // to the inputs to our gradient function df - // TODO - XXX - if any output is the same tensor multiple times, views have to be - // setup here. We need to refactor autograd until it is safe for - // tensors to be constructed without all the viewing infrastructure. - // this is currently intentionally not done here so we can get an idea of our - // perf before introducing overhead for correctness - for(auto idx : grad.df_input_vjps) { - // Note: we have to set this up in place, or we have to throw away and - // reallocate variables that were already created in wrapTensors. We - // should add an API for this. - auto& output = autograd::as_variable_ref(outputs[idx]); - autograd::create_gradient_edge(output, grad_fn); - output.set_requires_grad(true); + + { + auto outputs = last(stack, num_outputs); + // hookup the gradients for the output tensors that require gradients + // to the inputs to our gradient function df + // TODO - XXX - if any output is the same tensor multiple times, views have to be + // setup here. We need to refactor autograd until it is safe for + // tensors to be constructed without all the viewing infrastructure. 
+ // this is currently intentionally not done here so we can get an idea of our + // perf before introducing overhead for correctness + for(auto idx : grad.df_input_vjps) { + // Note: we have to set this up in place, or we have to throw away and + // reallocate variables that were already created in wrapTensors. We + // should add an API for this. + Variable output = outputs[idx].toTensor(); + autograd::create_gradient_edge(output, grad_fn); + output.set_requires_grad(true); + } + captureOutputs(*grad_fn, outputs); + // drop the temporary outputs so that we return the same number of + // outputs as if we were not also calculating gradient + const size_t num_temporary_outputs = num_outputs - grad.f_real_outputs; + stack.erase(stack.end() - num_temporary_outputs, stack.end()); } - captureOutputs(*grad_fn, outputs); - // drop the temporary outputs so that we return the same number of - // outputs as if we were not also calculating gradient - outputs.erase(outputs.begin() + grad.f_real_outputs, outputs.end()); - return outputs; } + Code f; // optimized graph for debugging and testing std::shared_ptr graph; @@ -195,6 +218,9 @@ struct ExecutionPlan { Gradient grad; // if(grad) is false when this is unused // executor for df, including code caches GraphExecutor grad_executor; + + const size_t num_inputs; + const size_t num_outputs; }; } // anonymous namespace @@ -210,6 +236,7 @@ struct GraphExecutorImpl { : graph(std::move(graph)) , optimize(optimize) , num_inputs(this->graph->inputs().size()) + , num_outputs(this->graph->outputs().size()) , symbolically_differentiable(symbolically_differentiable) , may_introduce_gradient(calcMayIntroduceGradient(this->graph->block())) {} GraphExecutorImpl(std::shared_ptr graph, bool optimize) @@ -223,34 +250,36 @@ struct GraphExecutorImpl { } // entry point where execution begins - variable_tensor_list run(variable_tensor_list inputs) { - if(inputs.size() != num_inputs) { + void run(Stack & stack) { + if(stack.size() < num_inputs) { std::stringstream ss; - ss << "expected " << num_inputs << " inputs but got " << inputs.size() << " inputs"; + ss << "expected " << num_inputs << " inputs but got " << stack.size() << " inputs"; throw std::runtime_error(ss.str()); } + auto inputs = last(stack, num_inputs); // the tracer has called a graph executor // there is no need to optimize, but we do need to splice the graph of // this excutor into the trace. Otherwise we might unroll control-flow // operations. if(tracer::isTracing()) { - return runTraced(std::move(inputs)); + return runTraced(stack); } // this is the fallback pathway, when we cannot differentiate if(!optimize || (!symbolically_differentiable && needsGradient(inputs))) { - return runFallback(std::move(inputs)); + return runFallback(stack); } // either we can symbolically differentiate, or we do not need a gradient. 
// go down the route where we treat the inputs as tensors // and fully optimize auto & implementation = getOrCompile(inputs); - return implementation.run(std::move(inputs)); + return implementation.run(stack); } - std::shared_ptr graphFor(const variable_tensor_list& inputs) const { + std::shared_ptr graphFor(const Stack& stack) const { + auto inputs = last(stack, num_inputs); ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs); if (!optimize || (!symbolically_differentiable && needsGradient(inputs))) { @@ -282,12 +311,15 @@ struct GraphExecutorImpl { private: friend struct GraphExecutor; - variable_tensor_list runTraced(variable_tensor_list inputs) { + void runTraced(Stack & stack) { auto state = tracer::getTracingState(); - auto input_values = fmap(inputs, tracer::getValueTrace); + auto inputs = last(stack, num_inputs); + auto input_values = fmap(inputs, [](const IValue & v) { + return tracer::getValueTrace(v.toTensor()); + }); ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs); - auto outputs = runFallback(std::move(inputs)); + runFallback(stack); auto all_dynamic = [](const at::ArrayRef xs) { for(Value* x : xs) { @@ -308,15 +340,18 @@ struct GraphExecutorImpl { } auto output_values = script::inlineCallTo(*state->graph, *local_graph, input_values); - for(size_t i = 0; i < outputs.size(); ++i) { - tracer::setValueTrace(outputs[i], output_values[i]); + auto outputs = last(stack, num_outputs); + for (size_t i = 0; i < outputs.size(); ++i) { + // We can't attach tracing states to scalars, so we have to skip them here + // TODO: Should we reinterpret them as scalar tensors instead? + if (!outputs[i].isTensor()) continue; + tracer::setValueTrace(outputs[i].toTensor(), output_values[i]); } - return outputs; } - variable_tensor_list runFallback(variable_tensor_list inputs) { + void runFallback(Stack & stack) { auto & fb = getOrCreateAutogradFallback(); - return runOneStage(fb, std::move(inputs)); + InterpreterState(fb).runOneStage(stack); } static bool calcMayIntroduceGradient(Block* b) { @@ -330,14 +365,16 @@ struct GraphExecutorImpl { } return false; } - bool needsGradient(const variable_tensor_list & inputs) const { + bool needsGradient(at::ArrayRef inputs) const { if (!autograd::GradMode::is_enabled()) { return false; } - if(may_introduce_gradient) + if (may_introduce_gradient) return true; - for (const auto & tensor : inputs) { - if(tensor.defined() && static_cast(tensor).requires_grad()) + for (const IValue & value : inputs) { + if (!value.isTensor()) continue; + auto t = value.toTensor(); + if (t.defined() && autograd::as_variable_ref(t).requires_grad()) return true; } return false; @@ -359,7 +396,7 @@ struct GraphExecutorImpl { autograd_fallback = Code(graph_); return autograd_fallback; } - const ExecutionPlan & getOrCompile(const variable_tensor_list & inputs) { + const ExecutionPlan & getOrCompile(at::ArrayRef inputs) { // outside lock guard, to minimize the time holding the lock on the fast path // ArgumentSpec even computes its hashCode here. 
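The comment above is the crux of getOrCompile: the spec, including its hash, is built before the lock is taken, so the critical section is only a map lookup and, on a miss, the compile step itself. A condensed sketch of that discipline with the executor's members abstracted into parameters purely for illustration:

#include <mutex>
#include <unordered_map>

template <typename Spec, typename Plan, typename Hash, typename Compile>
const Plan& get_or_compile(std::unordered_map<Spec, Plan, Hash>& cache,
                           std::mutex& compile_mutex,
                           const Spec& spec,   // constructed (and hashed) outside the lock
                           Compile compile_spec) {
  std::lock_guard<std::mutex> lock(compile_mutex);
  auto it = cache.find(spec);                  // cheap: precomputed hash + word compares
  if (it == cache.end()) {
    it = cache.emplace(spec, compile_spec(spec)).first;
  }
  return it->second;
}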
ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs); @@ -376,7 +413,7 @@ struct GraphExecutorImpl { bool argumentSpecRequiresGradient(const ArgumentSpec & spec) { for(size_t i = 0; i < spec.size(); ++i) { - if(spec.tensorInfo(i).requires_grad()) + if(spec.at(i).requires_grad()) return true; } return false; @@ -396,7 +433,7 @@ struct GraphExecutorImpl { std::vector requires_grads; requires_grads.reserve(spec.size()); for(size_t i = 0; i < spec.size(); i++) - requires_grads.push_back(spec.tensorInfo(i).requires_grad()); + requires_grads.push_back(spec.at(i).requires_grad()); Gradient gradient = differentiate(graph_, requires_grads); graph_ = gradient.f; @@ -410,8 +447,9 @@ struct GraphExecutorImpl { // true - do everything we can to make this graph run fast // false - do not modifiy the graph at all and just use the interpreter // to run the graph. Useful for debugging correctness issues in the implementation - bool optimize; - size_t num_inputs; + const bool optimize; + const size_t num_inputs; + const size_t num_outputs; // GraphExecutor optimizes more aggresively when we _know_ the graph will be // symbolically differentiable. @@ -450,15 +488,15 @@ GraphExecutor::GraphExecutor(std::shared_ptr graph, bool optimize) GraphExecutor::GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable) : pImpl(new GraphExecutorImpl(std::move(graph), optimize, symbolically_differentiable)) {} -variable_tensor_list GraphExecutor::run(variable_tensor_list && inputs) { - return pImpl->run(std::move(inputs)); +void GraphExecutor::run(Stack & inputs) { + return pImpl->run(inputs); } std::shared_ptr GraphExecutor::graph() const { return pImpl->graph; } -std::shared_ptr GraphExecutor::graphFor(const variable_tensor_list& inputs) const { +std::shared_ptr GraphExecutor::graphFor(const Stack& inputs) const { return pImpl->graphFor(inputs); } @@ -481,7 +519,7 @@ void specializeToSpec(const std::shared_ptr& graph_, const ArgumentSpec& // this must be first because later passes do not know what GradOfs are std::vector defined; for(size_t i = 0; i < spec.size(); ++i) { - defined.push_back(spec.tensorInfo(i).defined()); + defined.push_back(spec.at(i).defined()); } specializeUndef(*graph_, defined); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index d78076ab6484f5..4e862c9e0a1e44 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -38,12 +38,12 @@ struct TORCH_API GraphExecutor { GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. 
GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable); - variable_tensor_list run(variable_tensor_list && inputs); + void run(Stack & inputs); explicit operator bool() const { return pImpl != nullptr; } std::shared_ptr graph() const; - std::shared_ptr graphFor(const variable_tensor_list& inputs) const; + std::shared_ptr graphFor(const Stack& inputs) const; GraphExecutorState getDebugState(); private: std::shared_ptr pImpl; diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index a4bfdc2a5b8431..908404a43b649e 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -57,18 +57,19 @@ void initJITBindings(PyObject *module) { .def("_jit_pass_onnx", ToONNX) .def("_jit_pass_onnx_peephole", PeepholeOptimizeONNX) .def("_jit_pass_fuse", FuseGraph) - .def("_jit_pass_dce", [](std::shared_ptr& g){ + .def("_jit_pass_dce", [](std::shared_ptr& g) { return EliminateDeadCode(g); // overload resolution }) - .def("_jit_pass_cse", EliminateCommonSubexpression) + .def("_jit_pass_cse", [](std::shared_ptr& g) { + return EliminateCommonSubexpression(g); // overload resolution + }) .def("_jit_pass_peephole", PeepholeOptimize) .def("_jit_pass_canonicalize", [](const std::shared_ptr& g) { return Canonicalize(g); }) .def("_jit_pass_lint", LintGraph) .def("_jit_pass_shape_analysis", [](Graph& graph, py::tuple inputs, bool with_grad) { - auto tensor_inputs = createVariableTensorList(inputs); - PropagateInputShapes(graph, ArgumentSpec(with_grad, tensor_inputs)); + PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs))); }) .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) @@ -180,28 +181,15 @@ void initJITBindings(PyObject *module) { return ge.graph(); }) .def("graph_for", [](GraphExecutor& ge, py::args args) { - return ge.graphFor(createVariableTensorList(args)); + return ge.graphFor(createStack(args)); }) .def("get_debug_state", [](GraphExecutor& ge) { return ge.getDebugState(); }) .def("__call__", [](GraphExecutor& ge, py::args args) -> py::object { - auto inputs = createVariableTensorList(args); - auto outputs = ge.run(std::move(inputs)); - // if we don't tell pybind these are variables it chokes on the - // conversion. - // TODO: fix conversions to be sane and make sure this works. 
- if (outputs.size() == 0) { - return py::none(); - } else if (outputs.size() == 1) { - return py::cast(autograd::as_variable_ref(outputs[0])); - } else { - py::tuple tuple(outputs.size()); - for(size_t i = 0; i < outputs.size(); i++) { - tuple[i] = py::cast(autograd::as_variable_ref(outputs[i])); - } - return tuple; - } + auto stack = createStack(args); + ge.run(stack); + return wrapStack(std::move(stack)); }); initPythonIRBindings(module); diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 04664f62885e83..cf7dda32413c23 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -617,16 +617,9 @@ struct CodeImpl { auto executor = std::make_shared(node->g(attr::Subgraph)); graph_executors.emplace_back(executor.get()); - auto num_inputs = node->inputs().size(); return [=](Stack& stack) mutable { autograd::profiler::RecordFunction record("GraphExecutor"); - auto inputs = last(stack, num_inputs); - variable_tensor_list tinputs( - fmap(inputs, [](const IValue& v) { return v.toTensor(); })); - drop(stack, num_inputs); - //TODO: has graph executor work from a stack as well - variable_tensor_list toutputs = executor->run(variable_tensor_list(std::move(tinputs))); - stack.insert(stack.end(), toutputs.begin(), toutputs.end()); + executor->run(stack); return 0; }; } diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 81863baac9ce36..42a5be89e55e4b 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -4,6 +4,8 @@ #include +#include + namespace torch { namespace jit { // smart pointer to hold onto at::Retainable objects in a generic way diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index adcc5664179308..3b18699f94ffcd 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -455,7 +455,9 @@ void PropagateShapeOnBlock(Block * block, bool insert_expands) { void PropagateInputShapes(Graph & graph, const ArgumentSpec & spec) { JIT_ASSERT(graph.inputs().size() == spec.size()); for(size_t i = 0; i < spec.size(); ++i) { - graph.inputs()[i]->setType(spec.tensorInfo(i)); + auto argspec = spec.at(i); + if (!argspec.isTensor()) continue; + graph.inputs()[i]->setType(argspec); } PropagateShapeOnBlock(graph.block()); } diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 8b7e78a4d54384..415fc311086ac9 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -2,17 +2,10 @@ #include "torch/csrc/utils/pybind.h" -#include "torch/csrc/jit/variable_tensor_list.h" - namespace torch { namespace jit { -namespace { - -// we cannot use the default py:cast because it currently -// unwraps the data tensor in the conversion process -// TODO: replace with bs type -variable_tensor_list createVariableTensorList(py::tuple tuple, size_t reserve_extra_space = 0) { - variable_tensor_list result; +inline Stack createStack(const py::tuple& tuple, size_t reserve_extra_space = 0) { + Stack result; result.reserve(tuple.size() + reserve_extra_space); for(auto e : tuple) { result.push_back(py::cast(e)); @@ -20,6 +13,20 @@ variable_tensor_list createVariableTensorList(py::tuple tuple, size_t reserve_ex return result; } -} // namespace +inline py::object wrapStack(Stack&& outputs) { + if (outputs.size() == 0) { + return py::none(); + } else if (outputs.size() == 1) { + JIT_ASSERT(outputs[0].isTensor()); + return py::cast(autograd::as_variable_ref(std::move(outputs[0]).toTensor())); + } else { + py::tuple 
tuple(outputs.size()); + for(size_t i = 0; i < outputs.size(); i++) { + JIT_ASSERT(outputs[i].isTensor()); + tuple[i] = py::cast(autograd::as_variable_ref(std::move(outputs[i]).toTensor())); + } + return tuple; + } +} } } // namespace torch::jit diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index d4fdb529782a69..c9e41e8a7eee26 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -145,7 +145,7 @@ void initPythonIRBindings(PyObject * module_) { return ss.str(); }) .def("propagate_shapes", [](Graph& g, std::vector inputs, bool with_grad) { - PropagateInputShapes(g, ArgumentSpec(with_grad, variable_tensor_list(std::move(inputs)))); + PropagateInputShapes(g, ArgumentSpec(with_grad, fmap(inputs))); }) .def("export", [](const std::shared_ptr g, const std::vector& initializers, int64_t onnx_opset_version, bool defer_weight_export, diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 61100060f7f65f..576344427c0461 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -371,9 +371,9 @@ static void gatherParametersAndBuffers(std::vector & values, const } py::object runMethodFromPython(Method& m, py::args args) { - auto inputs = createVariableTensorList(args); - auto outputs = m.run(std::move(inputs)); - return unpackVariableTensorList(std::move(outputs)); + auto stack = createStack(args); + m.run(stack); + return wrapStack(std::move(stack)); } void initJitScriptBindings(PyObject* module) { @@ -502,7 +502,7 @@ void initJitScriptBindings(PyObject* module) { }) .def("graph_for", [](Module& self, py::args args) { if (self.find_method("forward")) { - return self.get_method("forward").graph_for(createVariableTensorList(args)); + return self.get_method("forward").graph_for(createStack(args)); } throw std::runtime_error("Attempted to call graph_for on a Module without a compiled forward()"); }) @@ -530,7 +530,7 @@ void initJitScriptBindings(PyObject* module) { .def("propagate_and_assign_input_and_output_shapes", &Method::propagate_and_assign_input_and_output_shapes) .def("params", &Method::params) .def("graph_for", [](Method& self, py::args args) { - return self.graph_for(createVariableTensorList(args)); + return self.graph_for(createStack(args)); }) .def("set_arg_and_return_types", [](Method &self, TypedDef &typed_def, bool method) { std::vector arg_type_args, return_type_args; diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 90ad6f75d1b38c..76518aaf1d26fa 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -54,13 +54,13 @@ struct Method { } } - variable_tensor_list run(variable_tensor_list && inputs) { - for(auto tp : member_inputs) { - inputs.push_back(*tp); + void run(Stack & stack) { + for(at::Tensor* tp : member_inputs) { + stack.push_back(*tp); } - return get_executor().run(std::move(inputs)); + get_executor().run(stack); } - std::shared_ptr graph_for(const variable_tensor_list& inputs) { + std::shared_ptr graph_for(const Stack& inputs) { return get_executor().graphFor(inputs); } std::shared_ptr graph() const { @@ -95,12 +95,15 @@ struct Method { std::shared_ptr propagate_shapes(std::vector inputs, bool with_grad=false) { auto retval = graph_->copy(); - for (auto inp : member_inputs) { - inputs.push_back(*inp); + Stack stack; + stack.reserve(inputs.size() + member_inputs.size()); + for (at::Tensor & i : inputs) { + stack.emplace_back(std::move(i)); + } + for (at::Tensor* inp : member_inputs) { + stack.push_back(*inp); } - 
PropagateInputShapes( - *retval, - ArgumentSpec(with_grad, variable_tensor_list(std::move(inputs)))); + PropagateInputShapes(*retval, ArgumentSpec(with_grad, std::move(stack))); return retval; } @@ -110,8 +113,7 @@ struct Method { inputs.push_back(*inp); } if (propagate) { - auto inputs_copy = inputs; - PropagateInputShapes(*retval, ArgumentSpec(with_grad, variable_tensor_list(std::move(inputs_copy)))); + PropagateInputShapes(*retval, ArgumentSpec(with_grad, fmap(inputs))); } JIT_ASSERT(retval->inputs().size() == inputs.size()); for (size_t i=0; i < retval->inputs().size(); ++i) { diff --git a/torch/csrc/jit/stack.h b/torch/csrc/jit/stack.h index 654c87088e02a8..2c74ae7e0a4c77 100644 --- a/torch/csrc/jit/stack.h +++ b/torch/csrc/jit/stack.h @@ -27,10 +27,10 @@ static inline IValue & peek(Stack & stack, size_t i, size_t N) { } // treat the last N elements of the stack as a list, looking up the // slice starting at index i and having length len -static inline at::ArrayRef peekSlice(Stack & stack, size_t i, size_t len, size_t N) { +static inline at::ArrayRef peekSlice(const Stack & stack, size_t i, size_t len, size_t N) { return at::ArrayRef(stack).slice(stack.size() - N + i, len); } -static inline at::ArrayRef last(Stack & stack, size_t N) { +static inline at::ArrayRef last(const Stack & stack, size_t N) { return peekSlice(stack, 0, N, N); } static inline void drop(Stack & stack, size_t n) { diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index ec889612a10471..ecb8c9b3779816 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -714,7 +714,8 @@ bool isEqual(at::IntList lhs, at::IntList rhs) { return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); } -bool isEqual(const TensorInfo & ti, const autograd::Variable & v) { +bool isEqual(const ArgumentInfo & ti, const autograd::Variable & v) { + REQUIRE(ti.isTensor()); if(!ti.defined()) return ti.defined() == v.defined(); return @@ -728,8 +729,8 @@ bool isEqual(const TensorInfo & ti, const autograd::Variable & v) { // work around the fact that variable_tensor_list doesn't duplicate all // of std::vector's constructors. // most constructors are never used in the implementation, just in our tests. 
-variable_tensor_list createVarList(std::vector && list) { - return variable_tensor_list(std::move(list)); +Stack createStack(std::vector && list) { + return Stack(std::make_move_iterator(list.begin()), std::make_move_iterator(list.end())); } void argumentSpecTest() { @@ -738,14 +739,14 @@ void argumentSpecTest() { auto & GF = at::CUDA(at::kFloat); auto & GD = at::CUDA(at::kDouble); - auto list = createVarList({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()}); + auto list = createStack({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()}); // make sure we have some non-standard strides - list[1].transpose_(0, 1); + list[1].toTensor().transpose_(0, 1); // same list but different backing values - auto list2 = createVarList({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()}); - list2[1].transpose_(0, 1); + auto list2 = createStack({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()}); + list2[1].toTensor().transpose_(0, 1); ArgumentSpec a(true, list); @@ -758,7 +759,7 @@ void argumentSpecTest() { REQUIRE(d.hashCode() == a.hashCode()); for(size_t i = 0; i < list.size(); ++i) { - REQUIRE(isEqual(a.tensorInfo(i), list[i])); + REQUIRE(isEqual(a.at(i), list[i].toTensor())); } ArgumentSpec no_grad(/*with_grad=*/false, list); REQUIRE(no_grad != a); @@ -770,7 +771,7 @@ void argumentSpecTest() { spec.insert(std::move(no_grad)); REQUIRE(spec.count(ArgumentSpec(true,list)) == 1); - list2[1].transpose_(0,1); + list2[1].toTensor().transpose_(0,1); ArgumentSpec c(true, list2); // same as list, except for one stride REQUIRE(!(c == a)); REQUIRE(spec.count(c) == 0); @@ -793,7 +794,7 @@ void shapeAnalysisTest() { auto w_hh = t_def(at::randn({4 * hidden_size, hidden_size}, at::kCUDA)); auto g = build_lstm(); - ArgumentSpec spec(false, createVarList({v(input), v(hx), v(cx), v(w_ih), v(w_hh) })); + ArgumentSpec spec(false, createStack({v(input), v(hx), v(cx), v(w_ih), v(w_hh) })); PropagateInputShapes(*g, spec); at::Tensor r0, r1; std::tie(r0, r1) = lstm(input, hx, cx, w_ih, w_hh); @@ -818,14 +819,15 @@ void testGraphExecutor() { auto w_ih = t_def(at::randn({4 * hidden_size, input_size}, at::kCUDA)); auto w_hh = t_def(at::randn({4 * hidden_size, hidden_size}, at::kCUDA)); - std::vector inputs = {v(input), v(hx), v(cx), v(w_ih), v(w_hh) }; auto g = build_lstm(); GraphExecutor executor(g); - auto outputs = executor.run(variable_tensor_list(std::move(inputs))); + auto stack = createStack({v(input), v(hx), v(cx), v(w_ih), v(w_hh)}); + executor.run(stack); + REQUIRE(stack.size() == 2); at::Tensor r0, r1; std::tie(r0, r1) = lstm(input, hx, cx, w_ih, w_hh); - REQUIRE(almostEqual(Variable(outputs[0]).data(), r0)); - REQUIRE(almostEqual(Variable(outputs[1]).data(), r1)); + REQUIRE(almostEqual(Variable(stack[0].toTensor()).data(), r0)); + REQUIRE(almostEqual(Variable(stack[1].toTensor()).data(), r1)); } void testBlocks(std::ostream & out) { diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 0b5d41f4de2978..5c998e3fc690bf 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -45,15 +45,13 @@ PreTraceInfo preRecordTrace(Symbol op, void postRecordTrace(const PreTraceInfo& info, at::ArrayRef outputs) { - auto assignOutput = [&info](const Variable & output, Value * value) { + for (size_t i = 0; i < outputs.size(); i++) { + auto & output = outputs[i]; + Value * value = info.n->addOutput(); if 
(output.defined()) { value->inferTypeFrom(output.data()); setValueTrace(output, value); } - }; - - for (size_t i = 0; i < outputs.size(); i++) { - assignOutput(outputs[i], info.n->addOutput()); } } diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py index c780807791407f..9f726d12e7a724 100644 --- a/torch/nn/modules/rnn.py +++ b/torch/nn/modules/rnn.py @@ -579,7 +579,7 @@ class RNNCell(RNNCellBase): Attributes: weight_ih: the learnable input-hidden weights, of shape - `(input_size x hidden_size)` + `(hidden_size x input_size)` weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)` bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
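For reference, the Stack-based calling convention these hunks converge on looks like the following from a caller's side, mirroring the updated testGraphExecutor above; the wrapper function, header set and output handling are assumptions for illustration, not code from this patch:

#include <vector>
#include <ATen/ATen.h>
#include "torch/csrc/jit/stack.h"
#include "torch/csrc/jit/graph_executor.h"

// Inputs are pushed as IValues, run() consumes them in place, and the
// outputs are left as the top entries of the same stack.
std::vector<at::Tensor> run_with_stack(torch::jit::GraphExecutor& executor,
                                       const std::vector<at::Tensor>& inputs,
                                       size_t num_outputs) {
  torch::jit::Stack stack(inputs.begin(), inputs.end());  // at::Tensor converts to IValue
  executor.run(stack);
  std::vector<at::Tensor> outputs;
  outputs.reserve(num_outputs);
  // outputs occupy the last num_outputs slots of the stack
  for (size_t i = stack.size() - num_outputs; i < stack.size(); ++i) {
    outputs.push_back(stack[i].toTensor());
  }
  return outputs;
}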