
Commit 84b7daa

zdevito authored and soumith committed
Relax verify of VariableFlags (#4191)
* Fix another leak in pybind11 code. This time it is caused by an upstream pybind11 bug: pybind/pybind11#1216. This change makes the code take a non-buggy pathway.

* Relax verify of VariableFlags

If we trace with a defined tensor but later see a run with an undefined tensor, we now allow that run to happen, replacing the tensor with zeros. This also fixes a bug where stage 0 tensors were not checked against their verify flags.

This change does _not_ handle all bad situations that can happen. For instance, if the first thing traced has an undefined tensor but a later tensor is defined, then it will fail because the graph itself does not contain the trace for the derivative of that tensor. However, it is possible to work around this latter case by dry-running the function:

    z = Variable(..., requires_grad=True)
    x, y = f(z)
    (x.sum() + y.sum()).backward()
1 parent: fc8ad6f

6 files changed (82 additions, 15 deletions)
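For context, the failure mode and the dry-run workaround described above look roughly like the following in user code. This is an illustrative sketch only: f stands in for a JIT-compiled two-output function, and the shapes and Variable-style API follow the snippet in the commit message.

    import torch
    from torch.autograd import Variable

    # Hypothetical two-output function standing in for a traced/compiled one.
    # If the first recorded call only used x, the trace would lack the
    # derivative path for y and a later call that needs it would fail.
    def f(z):
        return z * 2, z * 3

    z = Variable(torch.randn(3, 4), requires_grad=True)

    # Dry run: touch both outputs once so the recorded trace contains
    # derivatives for both of them (the workaround from the commit message).
    x, y = f(z)
    (x.sum() + y.sum()).backward()

    # Later calls may ignore y; with this change an undefined grad for y
    # is replaced by zeros instead of tripping the VariableFlags check.
    x, _ = f(z)
    x.sum().backward()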

torch/csrc/jit/interpreter.cpp (10 additions, 0 deletions)

@@ -478,6 +478,13 @@ struct InterpreterStateImpl {
     outputs.clear();
     loadTensorsFromRegisters(stage.outputs, outputs);
   }
+  const TensorType & tensorTypeForInput(size_t i) const {
+    size_t graph_i = i;
+    for(size_t s = 0; s < current_stage; s++)
+      graph_i += function->stages[s].inputs.size;
+    JIT_ASSERTM(graph_i < function->graph->inputs().size(), "Input out of range");
+    return *function->graph->inputs().at(graph_i)->type()->expect<TensorType>();
+  }
   int get(const ListHandle<int> & list, int i) {
     return int_data[list.start + i];
   };
@@ -532,6 +539,9 @@ void InterpreterState::runOneStage(
     std::vector<at::Tensor> & outputs) {
   return pImpl->runOneStage(inputs, outputs);
 }
+const TensorType & InterpreterState::tensorTypeForInput(size_t i) const {
+  return pImpl->tensorTypeForInput(i);
+}
 InterpreterState InterpreterState::clone() const {
   return InterpreterState(new InterpreterStateImpl(*pImpl));
 }
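The new tensorTypeForInput helper maps an input index of the current stage to its position in the flat list of graph inputs by skipping over the inputs of all earlier stages. A minimal Python sketch of that index arithmetic (the names are illustrative, not the real interpreter data structures):

    def graph_input_index(stage_input_counts, current_stage, i):
        """Offset input i of the current stage past the inputs of all
        earlier stages, mirroring the loop in tensorTypeForInput."""
        graph_i = i
        for s in range(current_stage):
            graph_i += stage_input_counts[s]
        return graph_i

    # Stage 0 has 3 inputs and stage 1 has 2: input 1 of stage 1 is
    # graph input 3 + 1 = 4.
    assert graph_input_index([3, 2], current_stage=1, i=1) == 4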

torch/csrc/jit/interpreter.h (3 additions, 1 deletion)

@@ -6,14 +6,15 @@ namespace at {
 struct Tensor;
 }
 namespace torch { namespace jit {
-
+
 // The interpreter run Graphs with Tensor inputs and Tensor outputs
 // a separate component in the autograd handles unwrapping and wrapping
 // variable objects for use in the interpreter.
 
 struct CodeImpl;
 struct InterpreterStateImpl;
 struct Graph;
+struct TensorType;
 
 struct Code {
   Code()
@@ -36,6 +37,7 @@ struct InterpreterState {
   void runOneStage(
       const std::vector<at::Tensor> & inputs,
       std::vector<at::Tensor> & outputs);
+  const TensorType & tensorTypeForInput(size_t i) const;
   ~InterpreterState();
   // create a copy of InterpreterState with its current state
   // used when retain_graph=True so that stages can be re-run

torch/csrc/jit/interpreter_autograd_function.cpp (36 additions, 6 deletions)

@@ -5,6 +5,14 @@ namespace torch { namespace jit {
 
 using namespace torch::jit::tracer;
 
+static at::Tensor zeroTensorWithType(const TensorType & type) {
+  auto device = (type.device() < 0)? at::kCPU : at::kCUDA;
+  auto & at_type = at::getType(device, type.scalarType());
+  // note: this has to be a contiguous tensor of zeros, because the fusion engine
+  // specialized to what is normally here which might be fully dense
+  return at_type.zeros(type.sizes());
+}
+
 autograd::variable_list InterpreterAutogradFunction::apply(
     const autograd::variable_list& inputs) {
   // Initial correctness checks.
@@ -19,15 +27,31 @@ autograd::variable_list InterpreterAutogradFunction::apply(
   const auto & details = stage_details_[stage_];
 
   // Validate inputs
-  for (std::size_t i = 0; i < (std::size_t)num_inputs; ++i) {
-    if (!details.input_flags[i].verify(inputs[i])) {
-      throw std::runtime_error("JIT interpreter received inputs with different "
-                               "flags than it was compiled for.");
+  std::vector<at::Tensor> tinputs;
+  tinputs.reserve(inputs.size());
+  TORCH_ASSERT(inputs.size() == num_inputs);
+  TORCH_ASSERT(inputs.size() == details.input_flags.size());
+  for (std::size_t i = 0; i < (std::size_t)inputs.size(); ++i) {
+    if(stage_ > 0 && !inputs[i].defined() && !details.input_flags[i].was_null) {
+      // [Temporary workaround for variants] until tracer produces all variants:
+      // if you have a function x, y = fn(z) and only use x then gradient for y
+      // will be undefined. If you reuse the same trace with and _sometimes_ use y
+      // then in the cases where you don't use it, the grad_y input in stage 1
+      // will be undefined. To ensure we can continue, we create a 0 gradient,
+      // using trace information to figure out what shape it should be
+      tinputs.push_back(zeroTensorWithType(interp_.tensorTypeForInput(i)));
+    } else if(!details.input_flags[i].verify(inputs[i])) {
+      std::stringstream ss;
+      ss << "JIT interpreter received inputs with different "
+         << "flags than it was compiled for. Compiled with " << details.input_flags[i]
+         << " but found " << VariableFlags::of(inputs[i]) << "\n";
+      throw std::runtime_error(ss.str());
+    } else {
+      tinputs.push_back(inputs[i].data());
     }
   }
 
   // Run the interpreter
-  auto tinputs = fmap(inputs, [](const autograd::Variable& i) { return i.data(); });
   std::vector<at::Tensor> toutputs;
   InterpreterState interp = (keep_graph_) ? interp_.clone() : interp_;
   interp.runOneStage(tinputs, toutputs);
@@ -57,7 +81,13 @@ autograd::variable_list InterpreterAutogradFunction::apply(
   }
   // Add grad_fns corresponding to inputs
   for (auto & input : inputs) {
-    if (!input.requires_grad()) continue; // See Note [Null-edge pruning]
+    if (!input.requires_grad()) {
+      continue; // See Note [Null-edge pruning]
+    } else if (!input.defined()) {
+      // See Note [Temporary workaround for variants]
+      grad_fn->next_functions.emplace_back();
+      continue;
+    }
     grad_fn->next_functions.emplace_back(
         input.grad_fn() ? input.grad_fn() : input.grad_accumulator(),
         input.output_nr());
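When a later stage receives an undefined gradient that the trace recorded as defined, the interpreter now substitutes a contiguous tensor of zeros built from the shape, scalar type, and device stored in the traced TensorType. A rough Python analogue of zeroTensorWithType, assuming the traced metadata has already been unpacked into plain values (torch.zeros stands in for the at_type.zeros call):

    import torch

    def zero_tensor_with_type(sizes, dtype, device_index):
        """Rough analogue of zeroTensorWithType: a negative device index
        means CPU in the traced metadata, otherwise a CUDA device."""
        device = 'cpu' if device_index < 0 else 'cuda'
        # Contiguous zeros on purpose: the fusion engine is specialized to
        # the layout normally seen here, which may be fully dense.
        return torch.zeros(sizes, dtype=dtype, device=device)

    # Stand-in for a missing grad input with traced shape [4, 5] on CPU.
    grad_y = zero_tensor_with_type([4, 5], torch.float32, -1)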

torch/csrc/jit/interpreter_autograd_function.h (5 additions, 1 deletion)

@@ -20,7 +20,11 @@ struct InterpreterAutogradFunction : public autograd::Function {
       const std::vector<StageDetails>& stage_details)
     : interp_(code)
     , stage_details_(stage_details)
-    , stage_(0) {}
+    , stage_(0) {
+      // stage 0 isn't run through the autograd, so we set this
+      // here just in case it is used
+      num_inputs = stage_details.at(0).input_flags.size();
+    }
 
   InterpreterAutogradFunction(InterpreterState interp,
       const std::vector<StageDetails>& stage_details,

torch/csrc/jit/python_compiled_function.cpp (20 additions, 6 deletions)

@@ -200,18 +200,32 @@ CompiledFunction::TraceForKey* getTraceFor(CompiledFunction& fn,
 
 } // anonymous namespace
 
+static py::tuple tuple_tail(const py::tuple & tup) {
+  py::tuple r(tup.size() - 1);
+  for(int i = 1; i < tup.size(); i++) {
+    r[i-1] = tup[i];
+  }
+  return r;
+}
+
 void initCompilerMixin(PyObject *module) {
   auto m = py::handle(module).cast<py::module>();
   py::class_<CompiledFunction>(m, "CompiledFunction", py::dynamic_attr())
     .def(py::init<int, bool, bool, py::object, std::string>())
-    .def("__call__", [](CompiledFunction& fn, py::args args) -> py::object {
-      return fn.call(args);
+    .def("__call__", [](py::args args_) -> py::object {
+      auto fn = py::cast<CompiledFunction*>(args_[0]);
+      auto args = tuple_tail(args_);
+      return fn->call(args);
     })
-    .def("has_trace_for", [](CompiledFunction& fn, py::args args) -> bool {
-      return getTraceFor(fn, args) != nullptr;
+    .def("has_trace_for", [](py::args args_) -> bool {
+      auto fn = py::cast<CompiledFunction*>(args_[0]);
+      auto args = tuple_tail(args_);
+      return getTraceFor(*fn, args) != nullptr;
     })
-    .def("graph_for", [](CompiledFunction& fn, py::args args) -> py::object {
-      auto trace = getTraceFor(fn, args);
+    .def("graph_for", [](py::args args_) -> py::object {
+      auto fn = py::cast<CompiledFunction*>(args_[0]);
+      auto args = tuple_tail(args_);
+      auto trace = getTraceFor(*fn, args);
       return trace ? py::cast(trace->graph_) : py::none();
     })
     .def("clear_cache", [](CompiledFunction& fn) {

torch/csrc/jit/variable_flags.h (8 additions, 1 deletion)

@@ -1,5 +1,5 @@
 #pragma once
-
+#include <iostream>
 namespace torch { namespace autograd {
 struct Variable;
 }}
@@ -15,4 +15,11 @@ struct VariableFlags {
   bool was_null;
 };
 
+static inline std::ostream & operator<<(std::ostream & out, const VariableFlags& v) {
+  return out
+    << "(requires_grad=" << v.requires_grad
+    << ", is_volatile=" << v.is_volatile
+    << ", was_null=" << v.was_null << ")";
+}
+
 }}
