diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 9b6b28f2842b88..053a9be5e05487 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -100,6 +100,8 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Unknown reasons, need to debug rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/arg_ops_test.py") rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/piecewise_linear_transform_test.py") + rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py") + rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/unique_ops_test.py") # Need to go through roi ops to replace max(...) with fmaxf(...) rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/roi_align_rotated_op_test.py") @@ -107,12 +109,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") - - # These are fixed in rocm 1.8.2, re-enable them once our CI docker images are upgraded - rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/recurrent_net_executor_test.py") - rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py") - rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/conv_test.py") - rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/group_conv_test.py") fi # Python tests diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 6f3986b837d308..f6041932518db9 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -25,8 +25,6 @@ namespace onnx { namespace { -constexpr static int kKnownOpsetVersion = 6; - bool AlmostEqual(double a, double b) { constexpr static double kEps = 1e-15; return (fabs(a - b) < kEps); @@ -367,17 +365,19 @@ Caffe2Backend::get_special_operators() const { Caffe2Ops Caffe2Backend::CreateArgMaxMin( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { auto& attributes = onnx_node->attributes; if (!attributes.HasAttribute("axis")) { auto* attr = attributes.AddRewrittenAttribute("axis"); attr->set_i(0); } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreateCast(OnnxNode* onnx_node, int opset_version) { - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateCast( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); auto onnx_dtype = onnx_node->attributes.get("to", TensorProto::UNDEFINED); @@ -443,7 +443,7 @@ Caffe2Ops Caffe2Backend::CreateCast(OnnxNode* onnx_node, int opset_version) { Caffe2Ops Caffe2Backend::CreateConstant( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { CAFFE_ENFORCE_EQ(onnx_node->node.output_size(), 1); Caffe2Ops ret; @@ -486,7 +486,7 @@ Caffe2Ops Caffe2Backend::CreateConstant( // differently. 
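Taken together, the converter signature changes above replace the bare opset_version with a ConversionContext that bundles the opset version with a map of ONNX value infos, so type and shape information can influence conversion. On the Python side, a caller describes its inputs by serializing one ValueInfoProto per tensor, as the run_node changes later in this diff do. A minimal sketch of that pattern, assuming numpy inputs (the helper name is illustrative, not part of the change):

import numpy as np
from onnx import helper, mapping

def serialized_value_infos(named_arrays):
    # One serialized ValueInfoProto per input, carrying its dtype and shape.
    return [
        helper.make_tensor_value_info(
            name=name,
            elem_type=mapping.NP_TYPE_TO_TENSOR_TYPE[arr.dtype],
            shape=arr.shape,
        ).SerializeToString()
        for name, arr in named_arrays.items()
    ]

value_infos = serialized_value_infos({"C": np.zeros((1,), dtype=np.float32)})
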
Caffe2Ops Caffe2Backend::CreateConvPoolOpBase( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { const auto& node = onnx_node->node; auto& attributes = onnx_node->attributes; if (node.op_type().find("Global") == 0) { @@ -512,16 +512,18 @@ Caffe2Ops Caffe2Backend::CreateConvPoolOpBase( } } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreatePadPool( + OnnxNode* onnx_node, + const ConversionContext& ctx) { auto& node = onnx_node->node; auto& attributes = onnx_node->attributes; Caffe2Ops ret; // Pad bool padding = false; - const std::string pad_name = opset_version < 2 ? "paddings" : "pads"; + const std::string pad_name = ctx.opset_version() < 2 ? "paddings" : "pads"; const auto pad_input = dummy_->NewDummyName(); if (attributes.HasAttribute("count_include_pad") && attributes.HasAttribute(pad_name)) { @@ -561,7 +563,7 @@ Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) { } } // Pool - auto c2_ops = Caffe2Backend::CreateConvPoolOpBase(onnx_node, opset_version); + auto c2_ops = Caffe2Backend::CreateConvPoolOpBase(onnx_node, ctx); auto* pool_op = c2_ops.ops.Mutable(0); if (padding) { pool_op->set_input(0, pad_input); @@ -572,8 +574,10 @@ Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) { return ret; } -Caffe2Ops Caffe2Backend::CreateReshape(OnnxNode* onnx_node, int opset_version) { - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateReshape( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1); auto* op = c2_op.ops.Mutable(0); op->add_output(dummy_->NewDummyName()); @@ -583,7 +587,7 @@ Caffe2Ops Caffe2Backend::CreateReshape(OnnxNode* onnx_node, int opset_version) { Caffe2Ops Caffe2Backend::CreateReciprocal( OnnxNode* onnx_node, - int /*opset_version*/) { + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() != 1 || node.output_size() != 1) { CAFFE_THROW("Caffe2 Reciprocal should have 1 input and 1 output"); @@ -599,7 +603,9 @@ Caffe2Ops Caffe2Backend::CreateReciprocal( return ret; } -Caffe2Ops Caffe2Backend::CreateGather(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreateGather( + OnnxNode* onnx_node, + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() < 2 || node.output_size() < 1) { CAFFE_THROW("Caffe2 Gather should have 2 inputs and 1 output"); @@ -629,7 +635,9 @@ Caffe2Ops Caffe2Backend::CreateGather(OnnxNode* onnx_node, int opset_version) { return ret; } -Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreateGemm( + OnnxNode* onnx_node, + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() < 3 || node.output_size() < 1) { CAFFE_THROW("Caffe2 Gemm should have 3 inputs and 1 output"); @@ -667,7 +675,22 @@ Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) { auto trans_a = onnx_node->attributes.get("transA", 0L); auto trans_b = onnx_node->attributes.get("transB", 0L); auto broadcast = onnx_node->attributes.get("broadcast", 0L); - if ((!trans_a) && trans_b && broadcast) { + + bool use_fc = false; + if ((!trans_a) && trans_b) { + if (broadcast) { + use_fc = true; + } else { 
+ const auto input_c_vi_iter = ctx.value_infos().find(node.input(2)); + if (input_c_vi_iter != ctx.value_infos().end() && + input_c_vi_iter->second.type().tensor_type().shape().dim_size() == + 1) { + use_fc = true; + } + } + } + + if (use_fc) { auto* c2_op = ret.ops.Add(); BuildOperator(c2_op, "FC", {input_a, input_b, input_c}, {output}); } else { @@ -683,7 +706,7 @@ Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) { BuildOperator( c2_op, "MatMul", {input_a, input_b}, {ab}, {arg_trans_a, arg_trans_b}); c2_op = ret.ops.Add(); - if (opset_version >= 7) { + if (ctx.opset_version() >= 7) { BuildOperator(c2_op, "Add", {ab, input_c}, {output}); } else { caffe2::Argument arg_broadcast; @@ -696,10 +719,12 @@ Caffe2Ops Caffe2Backend::CreateGemm(OnnxNode* onnx_node, int opset_version) { return ret; } -Caffe2Ops Caffe2Backend::CreatePad(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreatePad( + OnnxNode* onnx_node, + const ConversionContext& ctx) { auto& attributes = onnx_node->attributes; ::google::protobuf::RepeatedField<::google::protobuf::int64> pads; - std::string pad_name = opset_version < 2 ? "paddings" : "pads"; + std::string pad_name = ctx.opset_version() < 2 ? "paddings" : "pads"; pads = attributes .get<::google::protobuf::RepeatedField<::google::protobuf::int64>>( pad_name); @@ -734,14 +759,16 @@ Caffe2Ops Caffe2Backend::CreatePad(OnnxNode* onnx_node, int opset_version) { attr->add_ints(pads.Get(6)); attr->add_ints(pads.Get(7)); - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } // TODO: Caffe2 Concat has an extra output. It should be only // used when doing training, so we should change Caffe2 to allow // 1 output. -Caffe2Ops Caffe2Backend::CreateConcat(OnnxNode* onnx_node, int opset_version) { - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateConcat( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1); auto* op = c2_op.ops.Mutable(0); op->add_output(dummy_->NewDummyName()); @@ -751,7 +778,7 @@ Caffe2Ops Caffe2Backend::CreateConcat(OnnxNode* onnx_node, int opset_version) { Caffe2Ops Caffe2Backend::CreateLogSoftmax( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() < 1 || node.output_size() < 1) { CAFFE_THROW("LogSoftmax should have 1 input and 1 output"); @@ -771,8 +798,10 @@ Caffe2Ops Caffe2Backend::CreateLogSoftmax( return ret; } -Caffe2Ops Caffe2Backend::CreateSlice(OnnxNode* onnx_node, int opset_version) { - auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateSlice( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); CAFFE_ENFORCE_EQ(op_tmp.ops.size(), 1); auto* op = op_tmp.ops.Mutable(0); std::unordered_map args; @@ -922,40 +951,42 @@ Caffe2Ops Caffe2Backend::CreateSlice(OnnxNode* onnx_node, int opset_version) { Caffe2Ops Caffe2Backend::CreateBatchNormalization( OnnxNode* onnx_node, - int opset_version) { - if (opset_version < 6) { + const ConversionContext& ctx) { + if (ctx.opset_version() < 6) { auto& attributes = onnx_node->attributes; attributes.remove("consumed_inputs"); } - if (opset_version >= 7) { + if (ctx.opset_version() >= 7) { auto& attributes = onnx_node->attributes; auto* attr = 
attributes.AddRewrittenAttribute("is_test"); attr->set_i(1); } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } Caffe2Ops Caffe2Backend::CreateSplit( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { auto& attributes = onnx_node->attributes; if (!attributes.HasAttribute("axis")) { auto* attr = attributes.AddRewrittenAttribute("axis"); attr->set_i(0); } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreateMatMul(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreateMatMul( + OnnxNode* onnx_node, + const ConversionContext& ctx) { const auto& node = onnx_node->node; if (node.input_size() != 2) { CAFFE_THROW("MatMul should have 2 inputs"); } - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1); auto* op = c2_op.ops.Mutable(0); auto* broadcast_arg = op->add_arg(); @@ -965,10 +996,12 @@ Caffe2Ops Caffe2Backend::CreateMatMul(OnnxNode* onnx_node, int opset_version) { return c2_op; } -Caffe2Ops Caffe2Backend::CreateUpsample(OnnxNode* onnx_node, int opset_version) { +Caffe2Ops Caffe2Backend::CreateUpsample( + OnnxNode* onnx_node, + const ConversionContext& ctx) { auto& attributes = onnx_node->attributes; attributes.remove("mode"); - if (opset_version >= 7) { + if (ctx.opset_version() >= 7) { const auto& scales = attributes.get<::google::protobuf::RepeatedField>("scales"); if (scales.size() != 4) { CAFFE_THROW("The scales argument should have size 4"); @@ -976,7 +1009,7 @@ Caffe2Ops Caffe2Backend::CreateUpsample(OnnxNode* onnx_node, int opset_version) CAFFE_THROW("The first two elements in the scales argument must be 1"); } attributes.remove("scales"); - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); auto* op = c2_op.ops.Mutable(0); auto* c2_height = op->add_arg(); c2_height->set_name("height_scale"); @@ -986,21 +1019,25 @@ Caffe2Ops Caffe2Backend::CreateUpsample(OnnxNode* onnx_node, int opset_version) c2_width->set_f(scales.Get(3)); return c2_op; } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreateDropout(OnnxNode* onnx_node, int opset_version) { - if (opset_version >= 7) { +Caffe2Ops Caffe2Backend::CreateDropout( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + if (ctx.opset_version() >= 7) { auto& attributes = onnx_node->attributes; auto* attr = attributes.AddRewrittenAttribute("is_test"); attr->set_i(1); } - return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + return CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } -Caffe2Ops Caffe2Backend::CreateLRN(OnnxNode* onnx_node, int opset_version) { - auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); +Caffe2Ops Caffe2Backend::CreateLRN( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); const auto& attributes = onnx_node->attributes; if (!attributes.HasAttribute("alpha")) { auto* arg = c2_op.ops.Mutable(0)->add_arg(); @@ -1052,7 +1089,7 @@ Caffe2Backend::AllNamesInGraph(const GraphProto &graph) { // and then fixing things up further. 
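The Gemm converter above is the main consumer of the new context: with transA == 0 and transB == 1 it emits a single FC either when the legacy broadcast attribute is set or when the value infos say the bias C is one-dimensional, and otherwise falls back to Scale, Scale, MatMul and Add. A minimal sketch of exercising this through the Python binding, mirroring the test_gemm_conversion test added later in this diff (the C extension import path is an assumption based on caffe2/python/onnx/backend.py):

from onnx import TensorProto
from onnx.helper import make_node, make_tensor_value_info
from caffe2.proto import caffe2_pb2
import caffe2.python._import_c_extension as C  # assumed binding module

node = make_node('Gemm', ['A', 'B', 'C'], ['Y'], alpha=2., beta=3., transB=True)
backend = C.Caffe2Backend()

def op_types(op_strs):
    types = []
    for s in op_strs:
        op = caffe2_pb2.OperatorDef()
        op.ParseFromString(s)
        types.append(op.type)
    return types

# No shape information: Gemm lowers to MatMul followed by Add.
_, ops = backend.convert_node(node.SerializeToString())
print(op_types(ops))   # expected: ['Scale', 'Scale', 'MatMul', 'Add']

# Declaring C as 1-D lets the converter emit a single FC instead.
c_is_1d = make_tensor_value_info('C', TensorProto.FLOAT, (1,)).SerializeToString()
_, ops = backend.convert_node(node.SerializeToString(), [c_is_1d])
print(op_types(ops))   # expected: ['Scale', 'Scale', 'FC']
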
Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops( OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx) { Caffe2Ops ret; auto* c2_op = ret.ops.Add(); @@ -1064,12 +1101,12 @@ Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops( const auto onnx_op_type = node.op_type(); auto broken_version = caffe2::get_default( get_broken_operators(), onnx_op_type, std::numeric_limits::max()); - if (broken_version <= opset_version) { + if (broken_version <= ctx.opset_version()) { CAFFE_THROW( "Don't know how to translate op ", onnx_op_type, " in ONNX operator set v", - opset_version, + ctx.opset_version(), " (I only support prior to v", broken_version); } @@ -1102,14 +1139,14 @@ Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops( Caffe2Ops Caffe2Backend::ConvertNode( const std::string& node_str, - int opset_version) { + const ConversionContext& ctx) { ::google::protobuf::RepeatedPtrField nodes; auto* n = nodes.Add(); ParseProtoFromLargeString(node_str, n); ModelProto init_model; ModelProto pred_model; OnnxNode onnx_node = OnnxNode(nodes.Get(0)); - return OnnxNodeToCaffe2Ops(init_model, pred_model, &onnx_node, opset_version); + return OnnxNodeToCaffe2Ops(init_model, pred_model, ctx, &onnx_node); } void Caffe2Backend::CheckOpSchemaArguments( @@ -1142,14 +1179,14 @@ void Caffe2Backend::CheckOpSchemaArguments( Caffe2Ops Caffe2Backend::OnnxNodeToCaffe2Ops( const ModelProto& init_model, const ModelProto& pred_model, - OnnxNode* onnx_node, - int opset_version) { + const ConversionContext& ctx, + OnnxNode* onnx_node) { Caffe2Ops res; if (get_special_operators().count(onnx_node->node.op_type())) { res = (this->*get_special_operators().at(onnx_node->node.op_type()))( - onnx_node, opset_version); + onnx_node, ctx); } else { - res = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); + res = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); } for (const auto& result_op: res.ops){ @@ -1198,6 +1235,17 @@ void Caffe2Backend::OnnxToCaffe2( name_set.insert(name_set_pred.begin(), name_set_pred.end()); dummy_->Reset(name_set); + ValueInfoMap graph_value_infos{}; + for (const auto& vi : pred_model.graph().input()) { + graph_value_infos[vi.name()].CopyFrom(vi); + } + for (const auto& vi : pred_model.graph().output()) { + graph_value_infos[vi.name()].CopyFrom(vi); + } + for (const auto& vi : pred_model.graph().value_info()) { + graph_value_infos[vi.name()].CopyFrom(vi); + } + size_t idx_extra = 0; auto converter = [&](const ModelProto& model, caffe2::NetDef* net) mutable { net->mutable_device_option()->CopyFrom(device_option); @@ -1230,9 +1278,16 @@ void Caffe2Backend::OnnxToCaffe2( " without enough extra preconverted string"); } } else { + ValueInfoMap value_infos{}; + for (const auto& name : node.input()) { + auto iter = graph_value_infos.find(name); + if (iter != graph_value_infos.end()) { + value_infos[name].CopyFrom(iter->second); + } + } auto onnx_node = OnnxNode(node); auto c2ops = OnnxNodeToCaffe2Ops( - init_model, pred_model, &onnx_node, opset_version); + init_model, pred_model, {value_infos, opset_version}, &onnx_node); init_net_tmp->mutable_op()->MergeFrom(c2ops.init_ops); net->mutable_op()->MergeFrom(c2ops.ops); net->mutable_external_input()->MergeFrom(c2ops.interface_blobs); diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index 437e572b8528b7..681ab5b30d10b0 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -11,6 +11,8 @@ #include #include +constexpr int kKnownOpsetVersion = 6; + namespace caffe2 { namespace onnx { @@ -19,6 +21,25 @@ using ::ONNX_NAMESPACE::GraphProto; 
using ::ONNX_NAMESPACE::ModelProto; using ::ONNX_NAMESPACE::NodeProto; using ::ONNX_NAMESPACE::TensorProto; +using ::ONNX_NAMESPACE::ValueInfoProto; + +using ValueInfoMap = std::unordered_map; + +class ConversionContext { + public: + ConversionContext(const ValueInfoMap& value_infos, int opset_version) + : value_infos_(value_infos), opset_version_(opset_version) {} + const ValueInfoMap& value_infos() const { + return value_infos_; + } + int opset_version() const { + return opset_version_; + } + + private: + const ValueInfoMap& value_infos_; + const int opset_version_; +}; // \brief This struct holds the converted ops after the onnx->c2 conversion. // Notice that for RNN ops, it may create ops in init_net. Hence we have the @@ -129,7 +150,9 @@ class Caffe2Backend { bool SupportOp(const std::string tyep) const; - Caffe2Ops ConvertNode(const std::string& node_str, int opset_version); + Caffe2Ops ConvertNode( + const std::string& node_str, + const ConversionContext& ctx); void BuildTensorFillingOp( caffe2::OperatorDef* c2_op, @@ -137,7 +160,8 @@ class Caffe2Backend { const std::string& name = ""); private: - using SpecialOpConverter = Caffe2Ops (Caffe2Backend::*)(OnnxNode*, int); + using SpecialOpConverter = + Caffe2Ops (Caffe2Backend::*)(OnnxNode*, const ConversionContext&); void OnnxToCaffe2( caffe2::NetDef* init_net, @@ -153,51 +177,56 @@ class Caffe2Backend { Caffe2Ops OnnxNodeToCaffe2Ops( const ModelProto& init_model, const ModelProto& pred_model, - OnnxNode* onnx_node, - int opset_version); + const ConversionContext& ctx, + OnnxNode* onnx_node); std::unordered_set AllNamesInGraph(const GraphProto& graph); - Caffe2Ops CommonOnnxNodeToCaffe2Ops(OnnxNode* onnx_node, int opset_version); - - Caffe2Ops CreateArgMaxMin(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CommonOnnxNodeToCaffe2Ops( + OnnxNode* onnx_node, + const ConversionContext& ctx); - Caffe2Ops CreateCast(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateArgMaxMin(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateConstant(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateCast(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateConvPoolOpBase(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateConstant(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreatePadPool(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateConvPoolOpBase( + OnnxNode* onnx_node, + const ConversionContext& ctx); - Caffe2Ops CreateReshape(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreatePadPool(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateGather(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateReshape(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateGemm(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateGather(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreatePad(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateGemm(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateConcat(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreatePad(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateLogSoftmax(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateConcat(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateSlice(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateLogSoftmax(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateSplit(OnnxNode* onnx_node, int opset_version); + Caffe2Ops 
CreateSlice(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateSplit(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateBatchNormalization(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateMatMul(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateBatchNormalization( + OnnxNode* onnx_node, + const ConversionContext& ctx); - Caffe2Ops CreateUpsample(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateMatMul(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateDropout(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateUpsample(OnnxNode* onnx_node, const ConversionContext& ctx); - Caffe2Ops CreateLRN(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateDropout(OnnxNode* onnx_node, const ConversionContext& ctx); + Caffe2Ops CreateLRN(OnnxNode* onnx_node, const ConversionContext& ctx); // LUT related getters const std::unordered_map& get_renamed_operators() diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 81bedce653612d..cb9932bc4542a2 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -763,12 +763,13 @@ def ftrl(w, nz, i, g, alpha): self.assertReferenceChecks(gc, op, [var, nz, indices, grad, alpha], ftrl) + # TODO: (bddppq) test_unique keeps running into segfault on rocm 1.8.2 @given(input=hu.tensor(max_value=20, max_dim=1, dtype=np.int32, elements=st.integers(min_value=0, max_value=10)), with_remapping=st.booleans(), - **hu.gcs) + **hu.gcs_no_hip) def test_unique(self, input, with_remapping, gc, dc): op = core.CreateOperator( "Unique", diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index e501a7d41d3ecc..f640f6db20eff8 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -252,7 +252,11 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): cpu_do = caffe2_pb2.DeviceOption() gpu_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA) hip_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.HIP) -device_options = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) + ([hip_do] if workspace.has_hip_support else []) +# (bddppq) Do not rely on this no_hip option! It's just used to +# temporarily skip some flaky tests on ROCM before it's getting more mature. 
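The gcs_no_hip bundle defined just below is used the same way as hu.gcs: a test opts out of HIP by swapping the keyword-argument bundle in its @given decorator, as the unique and spatial_bn tests in this diff do. A minimal sketch with an illustrative operator:

import numpy as np
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
from caffe2.python import core

class TestSomeOp(hu.HypothesisTestCase):
    @given(X=hu.tensor(dtype=np.float32),
           **hu.gcs_no_hip)  # CPU/CUDA device options only; HIP is skipped
    def test_copy(self, X, gc, dc):
        op = core.CreateOperator("Copy", ["X"], ["Y"])
        self.assertDeviceChecks(dc, op, [X], [0])
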
+_device_options_no_hip = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) +device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else []) + # Include device option for each GPU expanded_device_options = [cpu_do] + ( [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i) @@ -275,6 +279,7 @@ def gradient_checker_device_option(): gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do])) +gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip)) @contextlib.contextmanager diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 7a7d9440d1aa77..dab79b8b1fb0b4 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -212,34 +212,35 @@ def run_node(cls, node, inputs, device='CPU', opset_version=_known_opset_version super(Caffe2Backend, cls).run_node(node, inputs, device=device, outputs_info=outputs_info, opset_version=opset_version) + value_infos = [] device_option = get_device_option(Device(device)) ws = Workspace() with core.DeviceScope(device_option): # temporary! if isinstance(inputs, dict): for key, value in inputs.items(): ws.FeedBlob(key, value) + value_infos.append(onnx.helper.make_tensor_value_info( + name=key, + elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[value.dtype], + shape=value.shape).SerializeToString()) else: assert len(node.input) == len(inputs), "{}: expected {} but got {}".format( node.op_type, len(node.input), len(inputs)) for key, value in zip(node.input, inputs): ws.FeedBlob(key, value) + value_infos.append(onnx.helper.make_tensor_value_info( + name=key, + elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[value.dtype], + shape=value.shape).SerializeToString()) ops = [] cbackend = C.Caffe2Backend(cls._dummy_name) - ops_str = cbackend.convert_node(node.SerializeToString(), opset_version) + ops_str = cbackend.convert_node(node.SerializeToString(), value_infos, opset_version) for s in ops_str[0] + ops_str[1]: op = caffe2_pb2.OperatorDef() op.ParseFromString(s) op.device_option.CopyFrom(device_option) ops.append(op) - # For testing - if "ONNX_CAFFE2_DEBUG" in os.environ: - init_ops, ops2, _ = cls._onnx_node_to_caffe2_op( - None, None, node, opset_version or cls._known_opset_version) - ops2 = init_ops + ops2 - for op in ops2: - op.device_option.CopyFrom(device_option) - print("\nC++:\n{}\nPython:\n{}".format(ops, ops2)) ws.RunOperatorsOnce(ops) output_values = [ws.FetchBlob(name) for name in node.output] return namedtupledict('Outputs', node.output)(*output_values) @@ -708,82 +709,34 @@ def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs): model = onnx.shape_inference.infer_shapes(model) - # Check whether we have RNN related ops - pred_model = cls.optimize_onnx(model, predict=True) - rnn_nodes = [] - for node in pred_model.graph.node: - if node.op_type in {'LSTM', 'GRU', 'RNN'}: - rnn_nodes.append(node) - - # Build the C++ backend - # TODO: build a predictor that supports GPU - # And for RNN nets, we need to avoid adding init_net - use_cpp_backend = device == 'CPU' and not rnn_nodes - # use python backend for now - use_cpp_backend = False - if use_cpp_backend: - c2_rnn_ops = [] - if rnn_nodes: - init_model = cls.optimize_onnx(model, init=True) - for node in rnn_nodes: - c2ops = cls._onnx_node_to_caffe2_op( - init_model, pred_model, node, opset_version) - init_ops = [x.SerializeToString() for x in c2ops.init_ops] - ops = [x.SerializeToString() 
for x in c2ops.ops] - external_inputs = c2ops.interface_blobs - c2_rnn_ops.append(C.Caffe2Ops(init_ops, ops, external_inputs)) - del init_model - - cbackend = C.Caffe2Backend(cls._dummy_name) - if raw_values_dict: - cls._external_value_resolution_pass(model, raw_values_dict) - rep = cbackend.prepare(model.SerializeToString(), device, c2_rnn_ops) - # For testing - # Dump the net descriptions to file for comparison with the Python ones - if "ONNX_CAFFE2_DEBUG" in os.environ: - pred_net_str = rep.pred_net() - pn = caffe2_pb2.NetDef() - pn.ParseFromString(pred_net_str) - init_net_str = rep.init_net() - inn = caffe2_pb2.NetDef() - inn.ParseFromString(init_net_str) - with open("cpp.txt", "w") as f: - f.write("pred_net: \n{}".format(pn)) - - rep_wrapper = Caffe2CppRep(rep) - return rep_wrapper - else: - ws = Workspace() - device_option = get_device_option(Device(device)) + ws = Workspace() + device_option = get_device_option(Device(device)) - init_net, predict_net = cls._onnx_model_to_caffe2_net(model, device, opset_version, False) + init_net, predict_net = cls._onnx_model_to_caffe2_net(model, device, opset_version, False) - if raw_values_dict: - cls._external_value_resolution_pass(model, raw_values_dict) + if raw_values_dict: + cls._external_value_resolution_pass(model, raw_values_dict) - # Directly load initializer data into blobs in workspace - cls._direct_initialize_parameters( - model.graph.initializer, - ws, - device_option, - ) + # Directly load initializer data into blobs in workspace + cls._direct_initialize_parameters( + model.graph.initializer, + ws, + device_option, + ) - initialized = {init.name for init in model.graph.initializer} + initialized = {init.name for init in model.graph.initializer} - cls._direct_initialize_inputs( - model.graph.input, - initialized, - ws, - device_option, - ) + cls._direct_initialize_inputs( + model.graph.input, + initialized, + ws, + device_option, + ) - uninitialized = [value_info.name for value_info in model.graph.input if value_info.name not in initialized] + uninitialized = [value_info.name for value_info in model.graph.input if value_info.name not in initialized] - if "ONNX_CAFFE2_DEBUG" in os.environ: - with open("python.txt", "w") as f: - f.write("pred_net: \n{}".format(predict_net)) - retval = Caffe2Rep(init_net, predict_net, ws, uninitialized) - return retval + retval = Caffe2Rep(init_net, predict_net, ws, uninitialized) + return retval @classmethod @@ -791,7 +744,20 @@ def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs): def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version): cbackend = C.Caffe2Backend(cls._dummy_name) if cbackend.support_onnx_import(node_def.op_type): - op_strs = cbackend.convert_node(node_def.SerializeToString(), opset_version) + + # extract value infos from pred model (value infos of + # node's inputs that are in init model should be all + # available in pred model) + value_infos = [] + for name in node_def.input: + if pred_model is not None: + for vi in itertools.chain(pred_model.graph.input, + pred_model.graph.output, + pred_model.graph.value_info): + if vi.name == name: + value_infos.append(vi.SerializeToString()) + + op_strs = cbackend.convert_node(node_def.SerializeToString(), value_infos, opset_version) init_ops = [] for s in op_strs[0]: op = caffe2_pb2.OperatorDef() diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index 97d824e05897a5..e526d74f73921a 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ 
b/caffe2/python/onnx/tests/c2_ref_test.py @@ -1,4 +1,4 @@ -## @package onnx +# @package onnx # Module caffe2.python.onnx.tests.c2_ref_test from __future__ import absolute_import @@ -39,14 +39,14 @@ def test_dummy_name(self): def test_check_arguments(self): b2 = C.Caffe2Backend() - node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"]) - b2.convert_node(node_def.SerializeToString(), 6) + node_def = make_node("Add", inputs=["X", "Y"], outputs=["Z"]) + b2.convert_node(node_def.SerializeToString()) - bad_node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"], foo = 42, bar = 56) + bad_node_def = make_node("Add", inputs=["X", "Y"], outputs=["Z"], foo=42, bar=56) with self.assertRaisesRegexp( - RuntimeError, - ".*?Don't know how to map unexpected argument (foo|bar) \(from operator .*?\).*$"): - b2.convert_node(bad_node_def.SerializeToString(), 6) + RuntimeError, + ".*?Don't know how to map unexpected argument (foo|bar) \(from operator .*?\).*$"): + b2.convert_node(bad_node_def.SerializeToString()) def test_relu_graph(self): X = np.random.randn(3, 2).astype(np.float32) @@ -199,6 +199,54 @@ def test_gemm(self): output["Y"], alpha * np.dot(A, B) + beta * C) + def test_gemm_conversion(self): + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + alpha=2., + beta=3., + transB=True) + + backend = C.Caffe2Backend() + + # without broadcast and without shape info, gemm will be + # converted to matmul + add + _, op_strs = backend.convert_node(node_def.SerializeToString()) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) + + # with shape info (that indicates C is 1D), gemm will be + # converted to FC + _, op_strs = backend.convert_node(node_def.SerializeToString( + ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()]) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) + + # or with broadcast, gemm will be converted to fc + node_def = make_node( + 'Gemm', + ['A', 'B', 'C'], + ["Y"], + transB=True, + broadcast=1) + + _, op_strs = backend.convert_node(node_def.SerializeToString()) + op_names = [] + for s in op_strs: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op_names.append(op.type) + self.assertEqual(op_names, ['FC']) + def test_tensor_filling_ops(self): for dtype in [ onnx.TensorProto.FLOAT, @@ -267,7 +315,6 @@ def test_tensor_filling_ops_c_backend(self): np.testing.assert_almost_equal(output[0], vals) np.testing.assert_almost_equal(ws.FetchBlob(op.output[0]), vals) - def test_slice(self): X = np.random.randn(1, 2, 3).astype(np.float32) starts = np.array([0, 1, 0], dtype=np.int32) diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index cbc83bed116c4a..6854be44164b49 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -28,14 +28,11 @@ class TestSpatialBN(hu.HypothesisTestCase): order=st.sampled_from(["NCHW", "NHWC"]), epsilon=st.floats(min_value=1e-5, max_value=1e-2), inplace=st.sampled_from([True, False]), - **hu.gcs) + # Currently HIP SpatialBN only supports 2D + **hu.gcs_no_hip) def test_spatialbn_test_mode_3d( self, size, input_channels, batch_size, seed, order, epsilon, inplace, gc, dc): - # Currently HIP SpatialBN only supports 2D - if 
_run_in_hip(gc, dc): - return - op = core.CreateOperator( "SpatialBN", ["X", "scale", "bias", "mean", "var"], @@ -77,14 +74,11 @@ def reference_spatialbn_test(X, scale, bias, mean, var): order=st.sampled_from(["NCHW", "NHWC"]), epsilon=st.floats(min_value=1e-5, max_value=1e-2), inplace=st.sampled_from([True, False]), - **hu.gcs) + # Currently HIP SpatialBN only supports 2D + **hu.gcs_no_hip) def test_spatialbn_test_mode_1d( self, size, input_channels, batch_size, seed, order, epsilon, inplace, gc, dc): - # Currently HIP SpatialBN only supports 2D - if _run_in_hip(gc, dc): - return - op = core.CreateOperator( "SpatialBN", ["X", "scale", "bias", "mean", "var"], @@ -249,14 +243,11 @@ def test_spatialbn_train_mode_gradient_check( seed=st.integers(0, 65535), order=st.sampled_from(["NCHW", "NHWC"]), epsilon=st.floats(min_value=1e-5, max_value=1e-2), - **hu.gcs) + # Currently HIP SpatialBN only supports 2D + **hu.gcs_no_hip) def test_spatialbn_train_mode_gradient_check_1d( self, size, input_channels, batch_size, seed, order, epsilon, gc, dc): - # Currently HIP SpatialBN only supports 2D - if _run_in_hip(gc, dc): - return - op = core.CreateOperator( "SpatialBN", ["X", "scale", "bias", "mean", "var"], diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 70bc635193f19c..04df247d821daf 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -793,13 +793,21 @@ void addObjectMethods(py::module& m) { "convert_node", [](caffe2::onnx::Caffe2Backend& instance, const py::bytes& node_str, + const std::vector& value_infos_bytes, int opset_version) -> std::vector> { // Note that we return two lists of serialized ops. The first set is // init_ops and the second set is ops for pred net. When converting // RNN related op, it is possible that we will create ops in the // init_net. 
Hence the return structure here + caffe2::onnx::ValueInfoMap value_infos{}; + for (const auto& vi_bytes : value_infos_bytes) { + ::ONNX_NAMESPACE::ValueInfoProto vi{}; + vi.ParseFromString(vi_bytes); + auto name = vi.name(); + value_infos.emplace(std::move(name), std::move(vi)); + } auto c2ops = instance.ConvertNode( - node_str.cast(), opset_version); + node_str.cast(), {value_infos, opset_version}); std::vector> vals; vals.emplace_back(); auto& init_vals = vals.back(); @@ -816,12 +824,15 @@ void addObjectMethods(py::module& m) { normal_vals.emplace_back(py::bytes(out)); } return vals; - }) + }, + py::arg("node_str"), + py::arg("value_infos_bytes") = std::vector{}, + py::arg("opset_version") = kKnownOpsetVersion) .def( - "_build_tensor_filling_op", - [](caffe2::onnx::Caffe2Backend& instance, - const py::bytes& tensor_proto_str, - const std::string& name="") -> py::bytes { + "_build_tensor_filling_op", + [](caffe2::onnx::Caffe2Backend& instance, + const py::bytes& tensor_proto_str, + const std::string& name = "") -> py::bytes { caffe2::OperatorDef op; ::ONNX_NAMESPACE::TensorProto tp; ParseProtoFromLargeString(tensor_proto_str, &tp); @@ -829,7 +840,7 @@ void addObjectMethods(py::module& m) { std::string out; op.SerializeToString(&out); return py::bytes(out); - }); + }); py::class_(m, "Predictor") .def( diff --git a/docs/Makefile b/docs/Makefile index 2a63943f00f0ab..4a56c12ca22d89 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -23,6 +23,13 @@ docset: html cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png convert $(SPHINXPROJ).docset/icon@2x.png -resize 16x16 $(SPHINXPROJ).docset/icon.png +html-stable: + # stable differs from `make html` in two ways: + # 1) The stable logo is used instead of the unstable logo + # 2) There will not be a link to the stable docs. + # See conf.py for more details. + RELEASE=1 make html + .PHONY: help Makefile docset # Catch-all target: route all unknown targets to Sphinx using the new diff --git a/docs/source/conf.py b/docs/source/conf.py index b48a5ad27362a9..1eaaa3b9086d96 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os +import os # import sys # sys.path.insert(0, os.path.abspath('.')) import torch @@ -28,6 +28,8 @@ warnings.warn('unable to load "torchvision" package') import sphinx_rtd_theme +RELEASE = os.environ.get('RELEASE', False) + # -- General configuration ------------------------------------------------ @@ -54,6 +56,8 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] +if RELEASE: + templates_path = ['_templates-stable'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: @@ -122,6 +126,9 @@ } html_logo = '_static/img/pytorch-logo-dark-unstable.png' +if RELEASE: + html_logo = '_static/img/pytorch-logo-dark.svg' + # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/tools/cwrap/plugins/Broadcast.py b/tools/cwrap/plugins/Broadcast.py deleted file mode 100644 index 5b0a74167839c4..00000000000000 --- a/tools/cwrap/plugins/Broadcast.py +++ /dev/null @@ -1,362 +0,0 @@ -from . 
import CWrapPlugin -from string import Template - -# Arguments to the Broadcast Plugin: -# broadcast: args_to_broadcast_against [inplace] [fallback] -# [args_to_broadcast_against]: either a single argument (e.g. "arg1") or a comma-separated -# list of two arguments (e.g. "tensor1,tensor2") indicating -# arguments to broadcast specified argument (usually "self") against -# [inplace] will generate code for in-place function, which doesn't allow the in-place -# argument to be broadcast -# [fallback] if tensors aren't broadcastable, preserves "element number" pointwise behavior, -# where only number of elements need to match, and tensors are viewed as 1-dimensional. -# [dims] specify if the tensors shouldn't be broadcast to a specific tensor or tensors, but a combination -# of individual dimension sizes of a set of tensors. For example: addbmm(C,A,B) a.k.a. [C + A @ B] -# broadcasts C to the first dimension of A and the second dimension of B. Each dimension is specified as -# [arg].dim[#] and dimensions are comma-separated. So, to specify that the tensor should be -# broadcast to 3-dimensions with sizes: -# tensor0->size[0] x tensor1->size[1] x tensor2->size[2] -# you would write: -# dims:tensor0.dim0,tensor1.dim1,tensor2.dim2 -# [types] if the tensors should be of different types than THTensor, specify as X where -# the actual type to use is THXTensor (i.e. Byte for THByteTensor). If the type -# should be THTensor, use 'Real' - -# For out of place: -# Two args: expand the two args together -# Three args (fused kernels): (e.g. addcmul) expand all three args together -# Sketch of proof that this is the same: -# consider addcmul, under expansion we want: a + (b * c) = (a + b * c) [all expanded together] -# Let e(i, j) be the expansion of i with j, e(i, j, k) be the expansion of i with j,k -# -# Then a + (b * c) = e(a, e(b,c) * e(c,b)) + e(e(b,c) * e(c,b), a) -# = e(a, e(b,c)) + e(e(b,c) * e(c,b), a) (only size matters for second param) -# = e(a,b,c) + e(e(b,c) * e(c,b), a) (by associativity of max in expand) -# = e(a,b,c) + e(b,c,a) * e(c,b,a) (see L1) -# which is a + b * c all expanded together -# -# L1: Show e(i * j, a) = e(i,a) * e(j,a) where i,j have same size -# Consider any index _{ s_0, ..., s_n} -# e(i * j, a) = (i*j)_{f(s_0), ...,f(s_n)} where f is the expansion of that dimension with a -# = i_{f(s_0), ..., f(s_n)} * j_{f(s_0), ..., f(s_n)} by definition of pointwise operator -# = e(i,a) * e(j,a) - - -class Broadcast(CWrapPlugin): - - # Save and restore passed in arguments in case later plugins use - POST_TEMPLATE = Template( - """${arg_op_other} = ${arg_op_other}_save;\n""") - - def getPreArgStringTemplate(self, type=None): - if type is None: - ret = """THTensor *${arg_op_other}_save = ${arg_op_other}; - THTensorPtr ${arg_op_other}_guard(nullptr);\n""" - else: - cpu_t = "TH" + type + "Tensor" - gpu_t = "THCuda" + type + "Tensor" - ret = ("#if !IS_CUDA\n" + - cpu_t + " *${arg_op_other}_save = ${arg_op_other};\n" + - cpu_t + "Ptr ${arg_op_other}_guard(nullptr);\n" + - "#else\n" + - gpu_t + " *${arg_op_other}_save = ${arg_op_other};\n" + - "THPPointer<" + gpu_t + "> ${arg_op_other}_guard(nullptr);\n" + - "#endif\n") - return Template(ret) - - def getNewForExpand(self, type): - if type is None: - ret = """THTensor_(new)(LIBRARY_STATE_NOARGS);\n""" - else: - cpu_t = "TH" + type + "Tensor" - gpu_t = "THCuda" + type + "Tensor" - ret = ("#if !IS_CUDA\n" + - cpu_t + "_new(LIBRARY_STATE_NOARGS);\n" + - "#else\n" + - gpu_t + "_new(LIBRARY_STATE_NOARGS);\n" + - "#endif\n") - return ret - - def 
getExpandTemplate(self, same_size_check, expand_call, success_code, raise_errors): - if not raise_errors: - return Template( - "bool try_expand = !" + same_size_check + "\n" + - "if (try_expand) {\n" + - "bool expand_success = false;\n" + - "try {\n" + - expand_call + - "\nexpand_success = true;\n" + - "}\n" - "catch (std::exception &e) {}\n" + - "if(expand_success) {\n" + - success_code + - "\n}" + - "\n}\n") - else: - return Template( - "bool try_expand = !" + same_size_check + "\n" + - "if (try_expand) {\n" + - expand_call + "\n" + - success_code + "\n" - "}\n") - - def getOutPlacePreExpand2Template(self, type_op_a, type_op_other, raise_errors): - size_check = """THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other}->size, ${arg_op_other}->nDimension);""" - expand_code = ("${arg_op_a}_guard = \n" + self.getNewForExpand(type_op_a) + "\n" + - "${arg_op_other}_guard = \n" + self.getNewForExpand(type_op_other) + "\n" + - """expand_outplace2(LIBRARY_STATE ${arg_op_a}_guard.get(), ${arg_op_other}_guard.get(), - ${arg_op_a}, ${arg_op_other}, - \"${op_a}\", \"${op_other}\", !${raise_errors});""") - success_code = """${arg_op_a} = ${arg_op_a}_guard.get(); - ${arg_op_other} = ${arg_op_other}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - def getOutPlacePreExpand3Template(self, type_op_a, type_op_other1, type_op_other2, raise_errors): - size_check = """(THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other1}->size, ${arg_op_other1}->nDimension) && - THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other2}->size, ${arg_op_other2}->nDimension));""" - expand_code = ("${arg_op_a}_guard = \n" + self.getNewForExpand(type_op_a) + "\n" + - "${arg_op_other1}_guard = \n" + self.getNewForExpand(type_op_other1) + "\n" + - "${arg_op_other2}_guard = \n" + self.getNewForExpand(type_op_other2) + "\n" + - """expand_outplace3(LIBRARY_STATE ${arg_op_a}_guard.get(), - ${arg_op_other1}_guard.get(), ${arg_op_other2}_guard.get(), - ${arg_op_a}, ${arg_op_other1}, ${arg_op_other2}, - \"${op_a}\", \"${op_other1}\", \"${op_other2}\", !${raise_errors});""") - success_code = """${arg_op_a} = ${arg_op_a}_guard.get(); - ${arg_op_other1} = ${arg_op_other1}_guard.get(); - ${arg_op_other2} = ${arg_op_other2}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - OUT_PLACE_PRE_EXPAND_PRE_DIM_TEMPLATE = Template( - """if(THTensor_(nDimension)(LIBRARY_STATE ${arg_op_dim}) <= ${arg_op_dim_value}) { - THError("Argument %s requires at least %d dimensions, but only has %d", - "${op_dim}", ${arg_op_dim_value} + 1, THTensor_(nDimension)(LIBRARY_STATE ${arg_op_dim})); - } - int64_t ${arg_op_a}_dim${idx}_size = THTensor_(size)(LIBRARY_STATE ${arg_op_dim}, ${arg_op_dim_value});\n""") - - OUT_PLACE_PRE_EXPAND1_DIM_TEMPLATE = Template( - """THLongStoragePtr ${arg_op_a}_storage(THLongStorage_newWithSize1(${arg_op_a}_dim0_size));\n""") - - OUT_PLACE_PRE_EXPAND2_DIM_TEMPLATE = Template( - """THLongStoragePtr ${arg_op_a}_storage( - THLongStorage_newWithSize2(${arg_op_a}_dim0_size, ${arg_op_a}_dim1_size));\n""") - - OUT_PLACE_PRE_EXPAND3_DIM_TEMPLATE = Template( - """THLongStoragePtr ${arg_op_a}_storage( - THLongStorage_newWithSize3(${arg_op_a}_dim0_size, ${arg_op_a}_dim1_size, ${arg_op_a}_dim2_size));\n""") - - def getOutPlacePreExpandPostDimTemplate(self, type_op_a, raise_errors): - size_check = """THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, 
- ${arg_op_a}_storage->data, ${arg_op_a}_storage->size);""" - expand_code = ("${arg_op_a}_guard = \n" + self.getNewForExpand(type_op_a) + "\n" + - """expand(LIBRARY_STATE ${arg_op_a}_guard.get(), ${arg_op_a}, ${arg_op_a}_storage);""") - success_code = """${arg_op_a} = ${arg_op_a}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - OUT_PLACE_PRE_TEMPLATE = Template( - """${code_arg_op_a}${code_arg_op_other1}${code_arg_op_other2} - ${expand_code}""") - - def getInPlacePreExpand1Template(self, type_op_other, raise_errors): - size_check = """THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other}->size, ${arg_op_other}->nDimension);""" - expand_code = ("${arg_op_other}_guard = \n" + self.getNewForExpand(type_op_other) + "\n" + - """expand_inplace1(LIBRARY_STATE ${arg_op_other}_guard.get(), ${arg_op_other}, ${arg_op_a}, - \"${op_other}\", \"${op_a}\", !${raise_errors});""") - success_code = """${arg_op_other} = ${arg_op_other}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - def getInPlacePreExpand2Template(self, type_op_other1, type_op_other2, raise_errors): - size_check = """(THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other1}->size, ${arg_op_other1}->nDimension) && - THSize_isSameSizeAs(${arg_op_a}->size, ${arg_op_a}->nDimension, - ${arg_op_other2}->size, ${arg_op_other2}->nDimension));""" - expand_code = ("${arg_op_other1}_guard = \n" + self.getNewForExpand(type_op_other1) + "\n" + - "${arg_op_other2}_guard = \n" + self.getNewForExpand(type_op_other2) + "\n" + - """expand_inplace2(LIBRARY_STATE ${arg_op_other1}_guard.get(), ${arg_op_other2}_guard.get(), - ${arg_op_other1}, ${arg_op_other2}, ${arg_op_a}, - \"${op_other1}\", \"${op_other2}\", \"${op_a}\", !${raise_errors});""") - success_code = """${arg_op_other1} = ${arg_op_other1}_guard.get(); - ${arg_op_other2} = ${arg_op_other2}_guard.get();""" - return self.getExpandTemplate(size_check, expand_code, success_code, raise_errors) - - IN_PLACE_PRE_TEMPLATE = Template( - """${code_arg_op_other1}${code_arg_op_other2} - ${expand_code}""") - - def initialize(self, cwrap): - self.cwrap = cwrap - - # Arguments: - # [0]: name of tensor to broadcast with (possibly two comma separated) - # [1] inplace (optional). In place operations only broadcast on second tensor argument - # [2] fallback (optional). 
Will fallback to applying to tensor of equal nElem if broadcast fails - def process_option_code_template(self, template, option): - new_code_pre = [] - new_code_post = [] - for _, arg in enumerate(option['arguments']): - if 'broadcast' not in arg: - continue - - params = arg.get('broadcast').split(" ") - op_a = arg.get('assign_name', arg['name']) - in_place = "inplace" in params - raise_errors = "false" if "fallback" in params else "true" - - param_others = params[0].split(",") - if len(param_others) > 2: - raise ValueError('Broadcast only supports up to 2 secondary parameters') - op_b = param_others[0] - op_c = param_others[1] if len(param_others) == 2 else None - arg_op_b = "arg_" + op_b - arg_op_a = "arg_" + op_a - arg_op_c = ("arg_" + op_c) if op_c else None - - dims_kvs = [] - for p in params: - if p.startswith("dims:"): - assert(raise_errors == "true") - if len(dims_kvs) != 0: - raise ValueError("multiple specifications of dims") - dims = p[len("dims:"):].split(",") - for dim in dims: - batchdim = dim.split(".") - assert len(batchdim) == 2 - assert batchdim[1].startswith("dim") - dim_val = batchdim[1][len("dim"):] - dims_kvs.append({"op": batchdim[0], "arg_op": "arg_" + batchdim[0], "val": dim_val}) - - assert len(dims_kvs) <= 3 - for p in params[1:]: - if p != "inplace" and p != "fallback" and not p.startswith("dims:") and not p.startswith("types:"): - raise ValueError("invalid parameter {}".format(p)) - - type_op_b = None - type_op_c = None - for p in params: - if p.startswith("types:"): - if not in_place and len(dims_kvs) > 0: - raise ValueError("type specification not supported yet for out-of-place functions " - "that specify explicit dimensions") - types = p[len("types:"):].split(",") - assert(len(types) == (2 if op_c else 1)) - type_op_b = None if types[0] == "Real" else types[0] - if op_c: - type_op_c = None if types[1] == "Real" else types[1] - - op_b_mapping = { - "op_a": op_a, - "op_other": op_b, - "arg_op_a": arg_op_a, - "arg_op_other": arg_op_b, - "raise_errors": raise_errors - } - op_c_mapping = { - "op_a": op_a, - "op_other": op_c, - "arg_op_a": arg_op_a, - "arg_op_other": arg_op_c, - "raise_errors": raise_errors - } - raise_errors_s = raise_errors == "true" - - if in_place: - code_arg_op_other1 = self.getPreArgStringTemplate(type=type_op_b).substitute(op_b_mapping) - code_arg_op_other2 = ( - self.getPreArgStringTemplate(type=type_op_c).substitute(op_c_mapping) if op_c else "") - - if op_c: - expand_code = self.getInPlacePreExpand2Template(type_op_b, type_op_c, raise_errors_s).substitute( - op_b_mapping, - op_other1=op_b, - op_other2=op_c, - arg_op_other1=arg_op_b, - arg_op_other2=arg_op_c) - else: - expand_code = self.getInPlacePreExpand1Template(type_op_b, raise_errors_s).substitute(op_b_mapping) - - new_code_pre.append(self.IN_PLACE_PRE_TEMPLATE.substitute( - arg_op_a=arg_op_a, - code_arg_op_other1=code_arg_op_other1, - code_arg_op_other2=code_arg_op_other2, - expand_code=expand_code, - raise_errors=raise_errors)) - new_code_pre.append("") - - post_code = self.POST_TEMPLATE.substitute(op_b_mapping) - if op_c: - post_code += self.POST_TEMPLATE.substitute(op_c_mapping) - - new_code_post.append(post_code) - new_code_post.append("") - else: - if len(dims_kvs) != 0: - code_arg_op_a = self.getPreArgStringTemplate().substitute(arg_op_other=arg_op_a) - code_arg_op_other1 = "" - code_arg_op_other2 = "" - expand_code = "" - for idx, kv in enumerate(dims_kvs): - expand_code += self.OUT_PLACE_PRE_EXPAND_PRE_DIM_TEMPLATE.substitute( - arg_op_a=arg_op_a, - op_dim=kv["op"], - 
arg_op_dim=kv["arg_op"], - arg_op_dim_value=kv["val"], - idx=idx) - - if len(dims_kvs) == 1: - expand_code += self.OUT_PLACE_PRE_EXPAND1_DIM_TEMPLATE.substitute( - arg_op_a=arg_op_a, - arg_op_dim0=dims_kvs[0]["arg_op"]) - elif len(dims_kvs) == 2: - expand_code += self.OUT_PLACE_PRE_EXPAND2_DIM_TEMPLATE.substitute( - arg_op_a=arg_op_a, - arg_op_dim0=dims_kvs[0]["arg_op"], - arg_op_dim1=dims_kvs[1]["arg_op"]) - else: - expand_code += self.OUT_PLACE_PRE_EXPAND3_DIM_TEMPLATE.substitute( - arg_op_a=arg_op_a, - arg_op_dim0=dims_kvs[0]["arg_op"], - arg_op_dim1=dims_kvs[1]["arg_op"], - arg_op_dim2=dims_kvs[2]["arg_op"]) - expand_code += self.getOutPlacePreExpandPostDimTemplate(None, raise_errors_s).substitute( - arg_op_a=arg_op_a, - raise_errors=raise_errors) - post_code = self.POST_TEMPLATE.substitute(arg_op_other=arg_op_a) - - else: - code_arg_op_a = self.getPreArgStringTemplate().substitute(arg_op_other=arg_op_a) - code_arg_op_other1 = self.getPreArgStringTemplate(type=type_op_b).substitute(op_b_mapping) - code_arg_op_other2 = (self.getPreArgStringTemplate(type=type_op_c).substitute(op_c_mapping) - if op_c else "") - - if op_c: - expand_template = self.getOutPlacePreExpand3Template(None, type_op_b, type_op_c, raise_errors_s) - expand_code = expand_template.substitute( - op_b_mapping, - op_other1=op_b, - op_other2=op_c, - arg_op_other1=arg_op_b, - arg_op_other2=arg_op_c) - - else: - expand_code = self.getOutPlacePreExpand2Template(None, type_op_b, raise_errors_s).substitute( - op_b_mapping) - - post_code = self.POST_TEMPLATE.substitute(arg_op_other=arg_op_a) - post_code += self.POST_TEMPLATE.substitute(op_b_mapping) - post_code += self.POST_TEMPLATE.substitute(op_c_mapping) if op_c else "" - - new_code_pre.append(self.OUT_PLACE_PRE_TEMPLATE.substitute( - code_arg_op_a=code_arg_op_a, - code_arg_op_other1=code_arg_op_other1, - code_arg_op_other2=code_arg_op_other2, - expand_code=expand_code)) - new_code_pre.append("") - - new_code_post.append(post_code) - new_code_post.append("") - - template = new_code_pre + template + new_code_post - return template diff --git a/tools/cwrap/plugins/__init__.py b/tools/cwrap/plugins/__init__.py index 7efb4a51bf1ce6..53789a0bed989a 100644 --- a/tools/cwrap/plugins/__init__.py +++ b/tools/cwrap/plugins/__init__.py @@ -432,4 +432,3 @@ def process_pre_arg_assign(self, template, option): from .AutoGPU import AutoGPU from .CuDNNPlugin import CuDNNPlugin from .WrapDim import WrapDim -from .Broadcast import Broadcast diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index a3e3c0f40f48c0..69b5036766e998 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -4,6 +4,7 @@ #include #include "torch/csrc/autograd/variable.h" #include "torch/csrc/utils/hash.h" +#include "torch/csrc/jit/stack.h" #include "torch/csrc/jit/variable_tensor_list.h" namespace torch { namespace jit { @@ -16,14 +17,15 @@ namespace torch { namespace jit { // since it is used along the hot-path of the JIT to check if the code // we have created is valid for the given inputs. 
-// TensorInfoPOD is only used internally in ArgumentSpec -// API users should use TensorInfo -struct TensorInfoPOD { +// ArgumentInfoPOD is only used internally in ArgumentSpec +// API users should use ArgumentInfo +struct ArgumentInfoPOD { // total size is 64-bit - unsigned type : 8; + unsigned is_tensor : 8; // all other fields are invalid if this is false + unsigned type : 8; // scalar type unsigned defined : 1; unsigned requires_grad : 1; - signed device : 22; + signed device : 14; uint32_t total_dims; // all TensorInfoPODs are in ArgumentSpec's tensor_info() array. // total_dims is the total number of dimensions seen so far // in all previous members of tensor_info(), including this tensor @@ -32,34 +34,38 @@ struct TensorInfoPOD { // for tensor 0, the offset is always 0 }; -static_assert(sizeof(TensorInfoPOD) == sizeof(int64_t), - "TensorInfoPOD must be 64-bit struct for ArgumentSpec encoding to work"); +static_assert(sizeof(ArgumentInfoPOD) == sizeof(int64_t), + "ArgumentInfoPOD must be 64-bit struct for ArgumentSpec encoding to work"); -struct TensorInfo; +struct ArgumentInfo; struct ArgumentSpec { - // note: tensors must always be variables - ArgumentSpec(bool with_grad, const variable_tensor_list & tensors) - : hash_code(0), ntensors(tensors.size()) { - int all_dims = 0; - for(size_t i = 0; i < ntensors; i++) { - all_dims += tensors[i].defined() ? tensors[i].ndimension() : 0; + ArgumentSpec(bool with_grad, at::ArrayRef inputs) + : hash_code(0), ninputs(inputs.size()) { + int32_t all_dims = 0; + const int32_t num_inputs = inputs.size(); + for (int32_t i = 0; i < num_inputs; i++) { + if (!inputs[i].isTensor()) continue; + auto tensor = inputs[i].toTensor(); + all_dims += tensor.defined() ? tensor.ndimension() : 0; } // allocate enough room for all TensorPODs and dimensions - data.resize(ntensors + all_dims*2); + data.resize(ninputs + all_dims*2); // and reinterpret our data array as these structs - TensorInfoPOD * pods = reinterpret_cast(data.data()); + ArgumentInfoPOD * pods = reinterpret_cast(data.data()); int64_t * next_dim = sizes_strides(); - int total_dims = 0; - for(size_t i = 0; i < ntensors; i++) { - const auto & t = tensors[i]; + int32_t total_dims = 0; + for(int32_t i = 0; i < num_inputs; i++) { auto & pod = pods[i]; + pod.is_tensor = static_cast(inputs[i].isTensor()); + if (!pod.is_tensor) continue; + at::Tensor t = inputs[i].toTensor(); pod.defined = t.defined(); - if(t.defined()) { - pod.type = static_cast(t.type().scalarType()); + if (pod.defined) { + pod.type = static_cast(t.type().scalarType()); pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); - pod.requires_grad = with_grad && static_cast(t).requires_grad(); + pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); total_dims += t.ndimension(); auto sizes = t.sizes(); std::copy(sizes.begin(),sizes.end(), next_dim); @@ -73,51 +79,54 @@ struct ArgumentSpec { } // we precompute the hash_code to minimize the time inside of hash // table operations where we may need to hold a compiler cache lock. 
- hash_code = hash_combine(0, ntensors); + hash_code = hash_combine(0, ninputs); for(auto d : data) { hash_code = hash_combine(hash_code, d); } } - // equality is fast: check ntensors, and then check the raw array data, + // equality is fast: check ninputs, and then check the raw array data, // there are no size/stride indirections bool operator==(const ArgumentSpec & spec) const { - return ntensors == spec.ntensors && data == spec.data; + return ninputs == spec.ninputs && data == spec.data; } bool operator!=(const ArgumentSpec & spec) const { return !(*this == spec); } - friend struct TensorInfo; - TensorInfo tensorInfo(size_t i) const; + friend struct ArgumentInfo; + ArgumentInfo at(size_t i) const; size_t size() const { - return ntensors; + return ninputs; } size_t hashCode() const { return hash_code; } private: - ArrayRef tensor_info() const { - return ArrayRef(reinterpret_cast(data.data()), ntensors); + ArrayRef tensor_info() const { + return ArrayRef(reinterpret_cast(data.data()), ninputs); } - // the start of the sizes_strides information, which comes after the TensorInfoPOD list. + // the start of the sizes_strides information, which comes after the ArgumentInfoPOD list. const int64_t* sizes_strides() const { - return data.data() + ntensors; + return data.data() + ninputs; } int64_t* sizes_strides() { - return data.data() + ntensors; + return data.data() + ninputs; } size_t hash_code; // precomputed on construction - uint32_t ntensors; - // layout is ntensors of TensorPOD (each 64-bit) followed by their size and stride info + int32_t ninputs; + // layout is ninputs of TensorPOD (each 64-bit) followed by their size and stride info // for 3 tensors: [t0POD][t1POD][t2POD][t0 sizes][t0 strides][t1 sizes][t1 strides][t2 sizes][t2 strides] std::vector data; }; -// public view of compressed TensorInfo -struct TensorInfo { - TensorInfo(const ArgumentSpec & spec, const int i) +// public view of compressed ArgumentInfo +struct ArgumentInfo { + ArgumentInfo(const ArgumentSpec & spec, const int i) : spec(spec), i(i) {} + bool isTensor() const { + return pod(i).is_tensor; + } at::ScalarType type() const { return at::ScalarType(pod(i).type); } @@ -148,20 +157,20 @@ struct TensorInfo { } private: // offsetinto sizes_strides() array where the sizes start for tensor j - // [valid range] valid range is [0, ntensors] - // (i.e. you can ask for the offset at ntensors, which would be the offset of the next tensor if it existed) + // [valid range] valid range is [0, ninputs] + // (i.e. 
you can ask for the offset at ninputs, which would be the offset of the next tensor if it existed) int sizes_strides_offset(int j) const { if(j == 0) return 0; return 2*pod(j - 1).total_dims; } - const TensorInfoPOD & pod(int j) const { + const ArgumentInfoPOD & pod(int j) const { return spec.tensor_info().at(j); } const ArgumentSpec & spec; const int i; }; -inline std::ostream & operator<<(std::ostream & out, const TensorInfo & info) { +inline std::ostream & operator<<(std::ostream & out, const ArgumentInfo & info) { if(!info.defined()) { return out << ""; } @@ -178,14 +187,14 @@ inline std::ostream& operator<<(std::ostream & out, const ArgumentSpec & spec) { for(size_t i = 0; i < spec.size(); ++i) { if (i > 0) out << ", "; - out << spec.tensorInfo(i); + out << spec.at(i); } out << "}"; return out; } -inline TensorInfo ArgumentSpec::tensorInfo(size_t i) const { - return TensorInfo(*this, i); +inline ArgumentInfo ArgumentSpec::at(size_t i) const { + return ArgumentInfo(*this, i); } }} diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 0324d1f3e44b80..2c595ffd679c27 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -34,6 +34,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -51,38 +52,51 @@ using autograd::variable_list; struct ExecutionPlanAutogradFunction : public autograd::Function { ExecutionPlanAutogradFunction(GraphExecutor graph, size_t capture_size) : graph(std::move(graph)) { - captures.reserve(capture_size); + is_var_capture.reserve(capture_size); + var_captures.reserve(capture_size); + ivalue_captures.reserve(capture_size); } + virtual variable_list apply(variable_list&& inputs) override { - // TODO: expensive copies here to convert to/from tensor_list - // TODO: because inputs is passed by const reference there is no - // way to release tensors incrementally as this runs - variable_tensor_list all_inputs; - all_inputs.reserve(captures.size() + inputs.size()); - all_inputs.insert(all_inputs.end(), inputs.begin(), inputs.end()); - for(auto & sv : captures) { - all_inputs.push_back(sv.unpack(this->shared_from_this())); + Stack stack; + stack.reserve(is_var_capture.size() + inputs.size()); + stack.insert(stack.end(), std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); + auto var_capture_it = var_captures.begin(); + auto ivalue_capture_it = ivalue_captures.begin(); + for (bool is_var : is_var_capture) { + if (is_var) { + stack.push_back(var_capture_it->unpack(this->shared_from_this())); + ++var_capture_it; + } else { + stack.push_back(*ivalue_capture_it); + ++ivalue_capture_it; + } + } + graph.run(stack); + return fmap(stack, [](IValue & val) { + return autograd::Variable(std::move(val).toTensor()); + }); + } + + void capture(const IValue & val) { + const bool is_tensor = val.isTensor(); + is_var_capture.push_back(is_tensor); + if (is_tensor) { + var_captures.emplace_back(Variable(val.toTensor()), false); + } else { + ivalue_captures.push_back(val); } - auto tensors = graph.run(std::move(all_inputs)); - // TODO: another copy that needs to be removed - return autograd::variable_list(tensors.begin(), tensors.end()); } private: friend struct ExecutionPlan; GraphExecutor graph; - std::vector captures; -}; - -// helper to run interpreter on variables until we switch -// everything to IValue -inline variable_tensor_list runOneStage(const Code & code, variable_tensor_list inputs) { - std::vector stack(inputs.begin(), inputs.end()); - 
InterpreterState(code).runOneStage(stack); - return variable_tensor_list(fmap(stack, [](IValue& v) { - return std::move(v).toTensor(); - })); -} + // INVARIANT: is_var_capture.size() == var_captures.size() + ivalue_captures.size() + std::vector is_var_capture; + std::vector var_captures; + std::vector ivalue_captures; +}; // an optimized way of executing the subgraph computed directly on // tensors rather than Variables. @@ -91,19 +105,25 @@ inline variable_tensor_list runOneStage(const Code & code, variable_tensor_list // to the output Variables if present. struct ExecutionPlan { ExecutionPlan(std::shared_ptr& graph) - : f(graph), graph(graph) {} + : f(graph), + graph(graph), + num_inputs(graph->inputs().size()), + num_outputs(graph->outputs().size()) {} ExecutionPlan(std::shared_ptr& graph, Gradient grad) : f(graph), graph(graph), grad(std::move(grad)), - grad_executor(this->grad.df) {} + grad_executor(this->grad.df), + num_inputs(graph->inputs().size()), + num_outputs(graph->outputs().size()) {} - variable_tensor_list run(variable_tensor_list&& stack) const { - if(grad) { - return runWithGrad(std::move(stack)); + void run(Stack & stack) const { + if (grad) { + return runWithGrad(stack); } - return runOneStage(f, std::move(stack)); + InterpreterState(f).runOneStage(stack); } + std::shared_ptr get_graph() const { return graph; } @@ -124,70 +144,73 @@ struct ExecutionPlan { } private: - // note: should be inplace to avoid allocations, but we have to switch from - // a list of tensor to a list of ivalues - std::vector unwrapVariables(variable_tensor_list && list) const { - return fmap(list, [](const Variable& v) -> IValue { - return v.defined() ? autograd::as_variable_ref(v).detach() : at::Tensor(); - }); - } - // note: should be inplace to avoid allocations, but we have to switch from - // a list of tensor to a list of ivalues - variable_tensor_list wrapTensors(tensor_list && list) const { - for(auto & v : list) { - v = autograd::make_variable(v, /*requires_grad=*/false); + void detachVariables(Stack & stack) const { + // It would be nice to use an ArrayRef here, but unfortunately those can only + // return const references, so we need to do a bunch of indexing ourselves. + const int64_t stack_size = stack.size(); + const int64_t stack_offset = stack_size - num_inputs; + for (int64_t i = stack_offset; i < stack_size; ++i) { + auto & v = stack[i]; + if (!v.isTensor()) continue; + auto t = std::move(v).toTensor(); + v = IValue{t.defined() ? 
autograd::as_variable_ref(t).detach() : std::move(t)}; } - return variable_tensor_list(std::move(list)); } // Capture (save) inputs that would be required to subsequently run backwards - void captureInputs(ExecutionPlanAutogradFunction & grad_fn, variable_tensor_list & inputs) const { - for(auto offset : grad.df_input_captured_inputs) { - grad_fn.captures.emplace_back(autograd::as_variable_ref(inputs[offset]), false); + void captureInputs(ExecutionPlanAutogradFunction & grad_fn, at::ArrayRef inputs) const { + for (size_t offset : grad.df_input_captured_inputs) { + grad_fn.capture(inputs[offset]); } } - void captureOutputs(ExecutionPlanAutogradFunction & grad_fn, variable_tensor_list & outputs) const { - for(auto offset : grad.df_input_captured_outputs) { - grad_fn.captures.emplace_back(autograd::as_variable_ref(outputs[offset]), true); + void captureOutputs(ExecutionPlanAutogradFunction & grad_fn, at::ArrayRef outputs) const { + for (size_t offset : grad.df_input_captured_outputs) { + grad_fn.capture(outputs[offset]); } } - variable_tensor_list runWithGrad(variable_tensor_list&& inputs) const { + // XXX: keep in mind that stack can be larger than the inputs we need! + void runWithGrad(Stack & stack) const { auto grad_fn = std::make_shared(grad_executor, grad.df_input_captured_inputs.size() + grad.df_input_captured_outputs.size()); - // hook up the outputs of df to the gradient functions of the inputs that require - // gradients - for(auto idx : grad.df_output_vjps) { - auto & v = autograd::as_variable_ref(inputs[idx]); - grad_fn->add_next_edge(v.gradient_edge()); + + { + auto inputs = last(stack, num_inputs); + // hook up the outputs of df to the gradient functions of the inputs that require gradients + for(auto idx : grad.df_output_vjps) { + auto v = Variable(inputs[idx].toTensor()); + grad_fn->add_next_edge(v.gradient_edge()); + } + captureInputs(*grad_fn, inputs); } - captureInputs(*grad_fn, inputs); - auto stack = unwrapVariables(std::move(inputs)); + detachVariables(stack); InterpreterState(f).runOneStage(stack); - variable_tensor_list outputs( - fmap(stack, [](IValue& v) { return std::move(v).toTensor(); })); - - // hookup the gradients for the output tensors that require gradients - // to the inputs to our gradient function df - // TODO - XXX - if any output is the same tensor multiple times, views have to be - // setup here. We need to refactor autograd until it is safe for - // tensors to be constructed without all the viewing infrastructure. - // this is currently intentionally not done here so we can get an idea of our - // perf before introducing overhead for correctness - for(auto idx : grad.df_input_vjps) { - // Note: we have to set this up in place, or we have to throw away and - // reallocate variables that were already created in wrapTensors. We - // should add an API for this. - auto& output = autograd::as_variable_ref(outputs[idx]); - autograd::create_gradient_edge(output, grad_fn); - output.set_requires_grad(true); + + { + auto outputs = last(stack, num_outputs); + // hookup the gradients for the output tensors that require gradients + // to the inputs to our gradient function df + // TODO - XXX - if any output is the same tensor multiple times, views have to be + // setup here. We need to refactor autograd until it is safe for + // tensors to be constructed without all the viewing infrastructure. 
+ // this is currently intentionally not done here so we can get an idea of our + // perf before introducing overhead for correctness + for(auto idx : grad.df_input_vjps) { + // Note: we have to set this up in place, or we have to throw away and + // reallocate variables that were already created in wrapTensors. We + // should add an API for this. + Variable output = outputs[idx].toTensor(); + autograd::create_gradient_edge(output, grad_fn); + output.set_requires_grad(true); + } + captureOutputs(*grad_fn, outputs); + // drop the temporary outputs so that we return the same number of + // outputs as if we were not also calculating gradient + const size_t num_temporary_outputs = num_outputs - grad.f_real_outputs; + stack.erase(stack.end() - num_temporary_outputs, stack.end()); } - captureOutputs(*grad_fn, outputs); - // drop the temporary outputs so that we return the same number of - // outputs as if we were not also calculating gradient - outputs.erase(outputs.begin() + grad.f_real_outputs, outputs.end()); - return outputs; } + Code f; // optimized graph for debugging and testing std::shared_ptr graph; @@ -195,6 +218,9 @@ struct ExecutionPlan { Gradient grad; // if(grad) is false when this is unused // executor for df, including code caches GraphExecutor grad_executor; + + const size_t num_inputs; + const size_t num_outputs; }; } // anonymous namespace @@ -210,6 +236,7 @@ struct GraphExecutorImpl { : graph(std::move(graph)) , optimize(optimize) , num_inputs(this->graph->inputs().size()) + , num_outputs(this->graph->outputs().size()) , symbolically_differentiable(symbolically_differentiable) , may_introduce_gradient(calcMayIntroduceGradient(this->graph->block())) {} GraphExecutorImpl(std::shared_ptr graph, bool optimize) @@ -223,34 +250,36 @@ struct GraphExecutorImpl { } // entry point where execution begins - variable_tensor_list run(variable_tensor_list inputs) { - if(inputs.size() != num_inputs) { + void run(Stack & stack) { + if(stack.size() < num_inputs) { std::stringstream ss; - ss << "expected " << num_inputs << " inputs but got " << inputs.size() << " inputs"; + ss << "expected " << num_inputs << " inputs but got " << stack.size() << " inputs"; throw std::runtime_error(ss.str()); } + auto inputs = last(stack, num_inputs); // the tracer has called a graph executor // there is no need to optimize, but we do need to splice the graph of // this excutor into the trace. Otherwise we might unroll control-flow // operations. if(tracer::isTracing()) { - return runTraced(std::move(inputs)); + return runTraced(stack); } // this is the fallback pathway, when we cannot differentiate if(!optimize || (!symbolically_differentiable && needsGradient(inputs))) { - return runFallback(std::move(inputs)); + return runFallback(stack); } // either we can symbolically differentiate, or we do not need a gradient. 
// go down the route where we treat the inputs as tensors // and fully optimize auto & implementation = getOrCompile(inputs); - return implementation.run(std::move(inputs)); + return implementation.run(stack); } - std::shared_ptr graphFor(const variable_tensor_list& inputs) const { + std::shared_ptr graphFor(const Stack& stack) const { + auto inputs = last(stack, num_inputs); ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs); if (!optimize || (!symbolically_differentiable && needsGradient(inputs))) { @@ -282,12 +311,15 @@ struct GraphExecutorImpl { private: friend struct GraphExecutor; - variable_tensor_list runTraced(variable_tensor_list inputs) { + void runTraced(Stack & stack) { auto state = tracer::getTracingState(); - auto input_values = fmap(inputs, tracer::getValueTrace); + auto inputs = last(stack, num_inputs); + auto input_values = fmap(inputs, [](const IValue & v) { + return tracer::getValueTrace(v.toTensor()); + }); ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs); - auto outputs = runFallback(std::move(inputs)); + runFallback(stack); auto all_dynamic = [](const at::ArrayRef xs) { for(Value* x : xs) { @@ -308,15 +340,18 @@ struct GraphExecutorImpl { } auto output_values = script::inlineCallTo(*state->graph, *local_graph, input_values); - for(size_t i = 0; i < outputs.size(); ++i) { - tracer::setValueTrace(outputs[i], output_values[i]); + auto outputs = last(stack, num_outputs); + for (size_t i = 0; i < outputs.size(); ++i) { + // We can't attach tracing states to scalars, so we have to skip them here + // TODO: Should we reinterpret them as scalar tensors instead? + if (!outputs[i].isTensor()) continue; + tracer::setValueTrace(outputs[i].toTensor(), output_values[i]); } - return outputs; } - variable_tensor_list runFallback(variable_tensor_list inputs) { + void runFallback(Stack & stack) { auto & fb = getOrCreateAutogradFallback(); - return runOneStage(fb, std::move(inputs)); + InterpreterState(fb).runOneStage(stack); } static bool calcMayIntroduceGradient(Block* b) { @@ -330,14 +365,16 @@ struct GraphExecutorImpl { } return false; } - bool needsGradient(const variable_tensor_list & inputs) const { + bool needsGradient(at::ArrayRef inputs) const { if (!autograd::GradMode::is_enabled()) { return false; } - if(may_introduce_gradient) + if (may_introduce_gradient) return true; - for (const auto & tensor : inputs) { - if(tensor.defined() && static_cast(tensor).requires_grad()) + for (const IValue & value : inputs) { + if (!value.isTensor()) continue; + auto t = value.toTensor(); + if (t.defined() && autograd::as_variable_ref(t).requires_grad()) return true; } return false; @@ -359,7 +396,7 @@ struct GraphExecutorImpl { autograd_fallback = Code(graph_); return autograd_fallback; } - const ExecutionPlan & getOrCompile(const variable_tensor_list & inputs) { + const ExecutionPlan & getOrCompile(at::ArrayRef inputs) { // outside lock guard, to minimize the time holding the lock on the fast path // ArgumentSpec even computes its hashCode here. 
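The comment above is the crux of getOrCompile: the spec, including its hash, is built before the lock is taken, so the critical section is only a map lookup and, on a miss, the compile step itself. A condensed sketch of that discipline with the executor's members abstracted into parameters purely for illustration:

#include <mutex>
#include <unordered_map>

template <typename Spec, typename Plan, typename Hash, typename Compile>
const Plan& get_or_compile(std::unordered_map<Spec, Plan, Hash>& cache,
                           std::mutex& compile_mutex,
                           const Spec& spec,   // constructed (and hashed) outside the lock
                           Compile compile_spec) {
  std::lock_guard<std::mutex> lock(compile_mutex);
  auto it = cache.find(spec);                  // cheap: precomputed hash + word compares
  if (it == cache.end()) {
    it = cache.emplace(spec, compile_spec(spec)).first;
  }
  return it->second;
}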
ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs); @@ -376,7 +413,7 @@ struct GraphExecutorImpl { bool argumentSpecRequiresGradient(const ArgumentSpec & spec) { for(size_t i = 0; i < spec.size(); ++i) { - if(spec.tensorInfo(i).requires_grad()) + if(spec.at(i).requires_grad()) return true; } return false; @@ -396,7 +433,7 @@ struct GraphExecutorImpl { std::vector requires_grads; requires_grads.reserve(spec.size()); for(size_t i = 0; i < spec.size(); i++) - requires_grads.push_back(spec.tensorInfo(i).requires_grad()); + requires_grads.push_back(spec.at(i).requires_grad()); Gradient gradient = differentiate(graph_, requires_grads); graph_ = gradient.f; @@ -410,8 +447,9 @@ struct GraphExecutorImpl { // true - do everything we can to make this graph run fast // false - do not modifiy the graph at all and just use the interpreter // to run the graph. Useful for debugging correctness issues in the implementation - bool optimize; - size_t num_inputs; + const bool optimize; + const size_t num_inputs; + const size_t num_outputs; // GraphExecutor optimizes more aggresively when we _know_ the graph will be // symbolically differentiable. @@ -450,15 +488,15 @@ GraphExecutor::GraphExecutor(std::shared_ptr graph, bool optimize) GraphExecutor::GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable) : pImpl(new GraphExecutorImpl(std::move(graph), optimize, symbolically_differentiable)) {} -variable_tensor_list GraphExecutor::run(variable_tensor_list && inputs) { - return pImpl->run(std::move(inputs)); +void GraphExecutor::run(Stack & inputs) { + return pImpl->run(inputs); } std::shared_ptr GraphExecutor::graph() const { return pImpl->graph; } -std::shared_ptr GraphExecutor::graphFor(const variable_tensor_list& inputs) const { +std::shared_ptr GraphExecutor::graphFor(const Stack& inputs) const { return pImpl->graphFor(inputs); } @@ -481,7 +519,7 @@ void specializeToSpec(const std::shared_ptr& graph_, const ArgumentSpec& // this must be first because later passes do not know what GradOfs are std::vector defined; for(size_t i = 0; i < spec.size(); ++i) { - defined.push_back(spec.tensorInfo(i).defined()); + defined.push_back(spec.at(i).defined()); } specializeUndef(*graph_, defined); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index d78076ab6484f5..4e862c9e0a1e44 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -38,12 +38,12 @@ struct TORCH_API GraphExecutor { GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. 
GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable); - variable_tensor_list run(variable_tensor_list && inputs); + void run(Stack & inputs); explicit operator bool() const { return pImpl != nullptr; } std::shared_ptr graph() const; - std::shared_ptr graphFor(const variable_tensor_list& inputs) const; + std::shared_ptr graphFor(const Stack& inputs) const; GraphExecutorState getDebugState(); private: std::shared_ptr pImpl; diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index a4bfdc2a5b8431..908404a43b649e 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -57,18 +57,19 @@ void initJITBindings(PyObject *module) { .def("_jit_pass_onnx", ToONNX) .def("_jit_pass_onnx_peephole", PeepholeOptimizeONNX) .def("_jit_pass_fuse", FuseGraph) - .def("_jit_pass_dce", [](std::shared_ptr& g){ + .def("_jit_pass_dce", [](std::shared_ptr& g) { return EliminateDeadCode(g); // overload resolution }) - .def("_jit_pass_cse", EliminateCommonSubexpression) + .def("_jit_pass_cse", [](std::shared_ptr& g) { + return EliminateCommonSubexpression(g); // overload resolution + }) .def("_jit_pass_peephole", PeepholeOptimize) .def("_jit_pass_canonicalize", [](const std::shared_ptr& g) { return Canonicalize(g); }) .def("_jit_pass_lint", LintGraph) .def("_jit_pass_shape_analysis", [](Graph& graph, py::tuple inputs, bool with_grad) { - auto tensor_inputs = createVariableTensorList(inputs); - PropagateInputShapes(graph, ArgumentSpec(with_grad, tensor_inputs)); + PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs))); }) .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) @@ -180,28 +181,15 @@ void initJITBindings(PyObject *module) { return ge.graph(); }) .def("graph_for", [](GraphExecutor& ge, py::args args) { - return ge.graphFor(createVariableTensorList(args)); + return ge.graphFor(createStack(args)); }) .def("get_debug_state", [](GraphExecutor& ge) { return ge.getDebugState(); }) .def("__call__", [](GraphExecutor& ge, py::args args) -> py::object { - auto inputs = createVariableTensorList(args); - auto outputs = ge.run(std::move(inputs)); - // if we don't tell pybind these are variables it chokes on the - // conversion. - // TODO: fix conversions to be sane and make sure this works. 
- if (outputs.size() == 0) { - return py::none(); - } else if (outputs.size() == 1) { - return py::cast(autograd::as_variable_ref(outputs[0])); - } else { - py::tuple tuple(outputs.size()); - for(size_t i = 0; i < outputs.size(); i++) { - tuple[i] = py::cast(autograd::as_variable_ref(outputs[i])); - } - return tuple; - } + auto stack = createStack(args); + ge.run(stack); + return wrapStack(std::move(stack)); }); initPythonIRBindings(module); diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 04664f62885e83..cf7dda32413c23 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -617,16 +617,9 @@ struct CodeImpl { auto executor = std::make_shared(node->g(attr::Subgraph)); graph_executors.emplace_back(executor.get()); - auto num_inputs = node->inputs().size(); return [=](Stack& stack) mutable { autograd::profiler::RecordFunction record("GraphExecutor"); - auto inputs = last(stack, num_inputs); - variable_tensor_list tinputs( - fmap(inputs, [](const IValue& v) { return v.toTensor(); })); - drop(stack, num_inputs); - //TODO: has graph executor work from a stack as well - variable_tensor_list toutputs = executor->run(variable_tensor_list(std::move(tinputs))); - stack.insert(stack.end(), toutputs.begin(), toutputs.end()); + executor->run(stack); return 0; }; } diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 81863baac9ce36..42a5be89e55e4b 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -4,6 +4,8 @@ #include +#include + namespace torch { namespace jit { // smart pointer to hold onto at::Retainable objects in a generic way diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index adcc5664179308..3b18699f94ffcd 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -455,7 +455,9 @@ void PropagateShapeOnBlock(Block * block, bool insert_expands) { void PropagateInputShapes(Graph & graph, const ArgumentSpec & spec) { JIT_ASSERT(graph.inputs().size() == spec.size()); for(size_t i = 0; i < spec.size(); ++i) { - graph.inputs()[i]->setType(spec.tensorInfo(i)); + auto argspec = spec.at(i); + if (!argspec.isTensor()) continue; + graph.inputs()[i]->setType(argspec); } PropagateShapeOnBlock(graph.block()); } diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 8b7e78a4d54384..415fc311086ac9 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -2,17 +2,10 @@ #include "torch/csrc/utils/pybind.h" -#include "torch/csrc/jit/variable_tensor_list.h" - namespace torch { namespace jit { -namespace { - -// we cannot use the default py:cast because it currently -// unwraps the data tensor in the conversion process -// TODO: replace with bs type -variable_tensor_list createVariableTensorList(py::tuple tuple, size_t reserve_extra_space = 0) { - variable_tensor_list result; +inline Stack createStack(const py::tuple& tuple, size_t reserve_extra_space = 0) { + Stack result; result.reserve(tuple.size() + reserve_extra_space); for(auto e : tuple) { result.push_back(py::cast(e)); @@ -20,6 +13,20 @@ variable_tensor_list createVariableTensorList(py::tuple tuple, size_t reserve_ex return result; } -} // namespace +inline py::object wrapStack(Stack&& outputs) { + if (outputs.size() == 0) { + return py::none(); + } else if (outputs.size() == 1) { + JIT_ASSERT(outputs[0].isTensor()); + return py::cast(autograd::as_variable_ref(std::move(outputs[0]).toTensor())); + } else { + py::tuple 
tuple(outputs.size()); + for(size_t i = 0; i < outputs.size(); i++) { + JIT_ASSERT(outputs[i].isTensor()); + tuple[i] = py::cast(autograd::as_variable_ref(std::move(outputs[i]).toTensor())); + } + return tuple; + } +} } } // namespace torch::jit diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index d4fdb529782a69..c9e41e8a7eee26 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -145,7 +145,7 @@ void initPythonIRBindings(PyObject * module_) { return ss.str(); }) .def("propagate_shapes", [](Graph& g, std::vector inputs, bool with_grad) { - PropagateInputShapes(g, ArgumentSpec(with_grad, variable_tensor_list(std::move(inputs)))); + PropagateInputShapes(g, ArgumentSpec(with_grad, fmap(inputs))); }) .def("export", [](const std::shared_ptr g, const std::vector& initializers, int64_t onnx_opset_version, bool defer_weight_export, diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 61100060f7f65f..576344427c0461 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -371,9 +371,9 @@ static void gatherParametersAndBuffers(std::vector & values, const } py::object runMethodFromPython(Method& m, py::args args) { - auto inputs = createVariableTensorList(args); - auto outputs = m.run(std::move(inputs)); - return unpackVariableTensorList(std::move(outputs)); + auto stack = createStack(args); + m.run(stack); + return wrapStack(std::move(stack)); } void initJitScriptBindings(PyObject* module) { @@ -502,7 +502,7 @@ void initJitScriptBindings(PyObject* module) { }) .def("graph_for", [](Module& self, py::args args) { if (self.find_method("forward")) { - return self.get_method("forward").graph_for(createVariableTensorList(args)); + return self.get_method("forward").graph_for(createStack(args)); } throw std::runtime_error("Attempted to call graph_for on a Module without a compiled forward()"); }) @@ -530,7 +530,7 @@ void initJitScriptBindings(PyObject* module) { .def("propagate_and_assign_input_and_output_shapes", &Method::propagate_and_assign_input_and_output_shapes) .def("params", &Method::params) .def("graph_for", [](Method& self, py::args args) { - return self.graph_for(createVariableTensorList(args)); + return self.graph_for(createStack(args)); }) .def("set_arg_and_return_types", [](Method &self, TypedDef &typed_def, bool method) { std::vector arg_type_args, return_type_args; diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 90ad6f75d1b38c..76518aaf1d26fa 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -54,13 +54,13 @@ struct Method { } } - variable_tensor_list run(variable_tensor_list && inputs) { - for(auto tp : member_inputs) { - inputs.push_back(*tp); + void run(Stack & stack) { + for(at::Tensor* tp : member_inputs) { + stack.push_back(*tp); } - return get_executor().run(std::move(inputs)); + get_executor().run(stack); } - std::shared_ptr graph_for(const variable_tensor_list& inputs) { + std::shared_ptr graph_for(const Stack& inputs) { return get_executor().graphFor(inputs); } std::shared_ptr graph() const { @@ -95,12 +95,15 @@ struct Method { std::shared_ptr propagate_shapes(std::vector inputs, bool with_grad=false) { auto retval = graph_->copy(); - for (auto inp : member_inputs) { - inputs.push_back(*inp); + Stack stack; + stack.reserve(inputs.size() + member_inputs.size()); + for (at::Tensor & i : inputs) { + stack.emplace_back(std::move(i)); + } + for (at::Tensor* inp : member_inputs) { + stack.push_back(*inp); } - 
PropagateInputShapes( - *retval, - ArgumentSpec(with_grad, variable_tensor_list(std::move(inputs)))); + PropagateInputShapes(*retval, ArgumentSpec(with_grad, std::move(stack))); return retval; } @@ -110,8 +113,7 @@ struct Method { inputs.push_back(*inp); } if (propagate) { - auto inputs_copy = inputs; - PropagateInputShapes(*retval, ArgumentSpec(with_grad, variable_tensor_list(std::move(inputs_copy)))); + PropagateInputShapes(*retval, ArgumentSpec(with_grad, fmap(inputs))); } JIT_ASSERT(retval->inputs().size() == inputs.size()); for (size_t i=0; i < retval->inputs().size(); ++i) { diff --git a/torch/csrc/jit/stack.h b/torch/csrc/jit/stack.h index 654c87088e02a8..2c74ae7e0a4c77 100644 --- a/torch/csrc/jit/stack.h +++ b/torch/csrc/jit/stack.h @@ -27,10 +27,10 @@ static inline IValue & peek(Stack & stack, size_t i, size_t N) { } // treat the last N elements of the stack as a list, looking up the // slice starting at index i and having length len -static inline at::ArrayRef peekSlice(Stack & stack, size_t i, size_t len, size_t N) { +static inline at::ArrayRef peekSlice(const Stack & stack, size_t i, size_t len, size_t N) { return at::ArrayRef(stack).slice(stack.size() - N + i, len); } -static inline at::ArrayRef last(Stack & stack, size_t N) { +static inline at::ArrayRef last(const Stack & stack, size_t N) { return peekSlice(stack, 0, N, N); } static inline void drop(Stack & stack, size_t n) { diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index ec889612a10471..ecb8c9b3779816 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -714,7 +714,8 @@ bool isEqual(at::IntList lhs, at::IntList rhs) { return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); } -bool isEqual(const TensorInfo & ti, const autograd::Variable & v) { +bool isEqual(const ArgumentInfo & ti, const autograd::Variable & v) { + REQUIRE(ti.isTensor()); if(!ti.defined()) return ti.defined() == v.defined(); return @@ -728,8 +729,8 @@ bool isEqual(const TensorInfo & ti, const autograd::Variable & v) { // work around the fact that variable_tensor_list doesn't duplicate all // of std::vector's constructors. // most constructors are never used in the implementation, just in our tests. 
-variable_tensor_list createVarList(std::vector && list) { - return variable_tensor_list(std::move(list)); +Stack createStack(std::vector && list) { + return Stack(std::make_move_iterator(list.begin()), std::make_move_iterator(list.end())); } void argumentSpecTest() { @@ -738,14 +739,14 @@ void argumentSpecTest() { auto & GF = at::CUDA(at::kFloat); auto & GD = at::CUDA(at::kDouble); - auto list = createVarList({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()}); + auto list = createStack({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()}); // make sure we have some non-standard strides - list[1].transpose_(0, 1); + list[1].toTensor().transpose_(0, 1); // same list but different backing values - auto list2 = createVarList({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()}); - list2[1].transpose_(0, 1); + auto list2 = createStack({ var(CF, {1}, true), var(CD, {1, 2}, false) , var(GF, {}, true), var(GD, {4,5,6}, false), undef()}); + list2[1].toTensor().transpose_(0, 1); ArgumentSpec a(true, list); @@ -758,7 +759,7 @@ void argumentSpecTest() { REQUIRE(d.hashCode() == a.hashCode()); for(size_t i = 0; i < list.size(); ++i) { - REQUIRE(isEqual(a.tensorInfo(i), list[i])); + REQUIRE(isEqual(a.at(i), list[i].toTensor())); } ArgumentSpec no_grad(/*with_grad=*/false, list); REQUIRE(no_grad != a); @@ -770,7 +771,7 @@ void argumentSpecTest() { spec.insert(std::move(no_grad)); REQUIRE(spec.count(ArgumentSpec(true,list)) == 1); - list2[1].transpose_(0,1); + list2[1].toTensor().transpose_(0,1); ArgumentSpec c(true, list2); // same as list, except for one stride REQUIRE(!(c == a)); REQUIRE(spec.count(c) == 0); @@ -793,7 +794,7 @@ void shapeAnalysisTest() { auto w_hh = t_def(at::randn({4 * hidden_size, hidden_size}, at::kCUDA)); auto g = build_lstm(); - ArgumentSpec spec(false, createVarList({v(input), v(hx), v(cx), v(w_ih), v(w_hh) })); + ArgumentSpec spec(false, createStack({v(input), v(hx), v(cx), v(w_ih), v(w_hh) })); PropagateInputShapes(*g, spec); at::Tensor r0, r1; std::tie(r0, r1) = lstm(input, hx, cx, w_ih, w_hh); @@ -818,14 +819,15 @@ void testGraphExecutor() { auto w_ih = t_def(at::randn({4 * hidden_size, input_size}, at::kCUDA)); auto w_hh = t_def(at::randn({4 * hidden_size, hidden_size}, at::kCUDA)); - std::vector inputs = {v(input), v(hx), v(cx), v(w_ih), v(w_hh) }; auto g = build_lstm(); GraphExecutor executor(g); - auto outputs = executor.run(variable_tensor_list(std::move(inputs))); + auto stack = createStack({v(input), v(hx), v(cx), v(w_ih), v(w_hh)}); + executor.run(stack); + REQUIRE(stack.size() == 2); at::Tensor r0, r1; std::tie(r0, r1) = lstm(input, hx, cx, w_ih, w_hh); - REQUIRE(almostEqual(Variable(outputs[0]).data(), r0)); - REQUIRE(almostEqual(Variable(outputs[1]).data(), r1)); + REQUIRE(almostEqual(Variable(stack[0].toTensor()).data(), r0)); + REQUIRE(almostEqual(Variable(stack[1].toTensor()).data(), r1)); } void testBlocks(std::ostream & out) { diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 0b5d41f4de2978..5c998e3fc690bf 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -45,15 +45,13 @@ PreTraceInfo preRecordTrace(Symbol op, void postRecordTrace(const PreTraceInfo& info, at::ArrayRef outputs) { - auto assignOutput = [&info](const Variable & output, Value * value) { + for (size_t i = 0; i < outputs.size(); i++) { + auto & output = outputs[i]; + Value * value = info.n->addOutput(); if 
(output.defined()) { value->inferTypeFrom(output.data()); setValueTrace(output, value); } - }; - - for (size_t i = 0; i < outputs.size(); i++) { - assignOutput(outputs[i], info.n->addOutput()); } } diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py index c780807791407f..9f726d12e7a724 100644 --- a/torch/nn/modules/rnn.py +++ b/torch/nn/modules/rnn.py @@ -579,7 +579,7 @@ class RNNCell(RNNCellBase): Attributes: weight_ih: the learnable input-hidden weights, of shape - `(input_size x hidden_size)` + `(hidden_size x input_size)` weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)` bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
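For reference, the Stack-based calling convention these hunks converge on looks like the following from a caller's side, mirroring the updated testGraphExecutor above; the wrapper function, header set and output handling are assumptions for illustration, not code from this patch:

#include <vector>
#include <ATen/ATen.h>
#include "torch/csrc/jit/stack.h"
#include "torch/csrc/jit/graph_executor.h"

// Inputs are pushed as IValues, run() consumes them in place, and the
// outputs are left as the top entries of the same stack.
std::vector<at::Tensor> run_with_stack(torch::jit::GraphExecutor& executor,
                                       const std::vector<at::Tensor>& inputs,
                                       size_t num_outputs) {
  torch::jit::Stack stack(inputs.begin(), inputs.end());  // at::Tensor converts to IValue
  executor.run(stack);
  std::vector<at::Tensor> outputs;
  outputs.reserve(num_outputs);
  // outputs occupy the last num_outputs slots of the stack
  for (size_t i = stack.size() - num_outputs; i < stack.size(); ++i) {
    outputs.push_back(stack[i].toTensor());
  }
  return outputs;
}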