From ae1c13e2f36110af8304562d1c69000dcf7ff7cf Mon Sep 17 00:00:00 2001
From: Roman Levenstein
Date: Mon, 18 Dec 2017 21:14:49 -0800
Subject: [PATCH 1/2] Fix debug printing

---
 src/glow/Backends/Interpreter/InterpreterNodes.cpp | 4 ++--
 src/glow/Optimizer/IROptimizer.cpp                 | 9 +++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/glow/Backends/Interpreter/InterpreterNodes.cpp b/src/glow/Backends/Interpreter/InterpreterNodes.cpp
index 22c8757c6e..6050e21f8f 100644
--- a/src/glow/Backends/Interpreter/InterpreterNodes.cpp
+++ b/src/glow/Backends/Interpreter/InterpreterNodes.cpp
@@ -1197,10 +1197,10 @@ void Interpreter::fwdDeallocActivationInst(bool isTrain,
 /// tensor.
 void Interpreter::fwdDebugPrintInst(bool isTrain, const DebugPrintInst *I) {
   auto *V = I->getSrc();
+  llvm::outs() << I->getName() << ": ";
   // Dump the content of a value.
   V->dump();
   llvm::outs() << "\n";
-  auto WH = getWeightHandle(V);
-  WH.dump();
+  dumpImpl(getTensor(V));
   llvm::outs() << "\n";
 }

diff --git a/src/glow/Optimizer/IROptimizer.cpp b/src/glow/Optimizer/IROptimizer.cpp
index da7f960609..ab91c84f36 100644
--- a/src/glow/Optimizer/IROptimizer.cpp
+++ b/src/glow/Optimizer/IROptimizer.cpp
@@ -884,19 +884,24 @@ static void performDebugInstrumentation(Module &M) {
       it = next;
       continue;
     }
+    auto instrName = (*it)->getName();
     for (auto const &Op : (*it)->getOperands()) {
       // Dump inputs of the current instruction before the instruction.
       if (Op.second != OperandKind::Out) {
-        std::string name = "print_input_";
+        std::string name = "debug_print.before.";
         name += Op.first->getName();
+        name += ".";
+        name += instrName;
         auto *dumpInstr = new DebugPrintInst(&M, name, Op.first);
         M.insertInstruction(it, dumpInstr);
       }

       // Dump outputs of the current instruction after the instruction.
       if (Op.second != OperandKind::In) {
-        std::string name = "print_output_";
+        std::string name = "debug_print.after.";
         name += Op.first->getName();
+        name += ".";
+        name += instrName;
         auto *dumpInstr = new DebugPrintInst(&M, name, Op.first);
         M.insertInstruction(next, dumpInstr);
       }

From 34dd9614edfe74658ed4369723d180f81bbc1e2e Mon Sep 17 00:00:00 2001
From: Roman Levenstein
Date: Mon, 18 Dec 2017 21:18:46 -0800
Subject: [PATCH 2/2] Implement a new tensorview instruction

This instruction is used to create an unowned tensor with a required
dimensionality from an existing tensor. The result is just a view of the
existing tensor and does not allocate any new memory. A tensorview is
essentially a typecast applied to a tensor.
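Conceptually, a tensor view behaves like the sketch below: a second shape descriptor over the same storage. This is a minimal stand-alone model for illustration only; OwnedTensor, TensorView and makeView are hypothetical names, not Glow classes.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

struct OwnedTensor {
  std::vector<float> data;   // owns the element storage
  std::vector<size_t> dims;
};

struct TensorView {
  float *data;               // aliases OwnedTensor::data, owns nothing
  std::vector<size_t> dims;  // the new shape
};

// A view is a "typecast": same elements, different dimensionality.
TensorView makeView(OwnedTensor &src, std::vector<size_t> dims) {
  size_t n = 1;
  for (size_t d : dims)
    n *= d;
  assert(n == src.data.size() && "a view must not change the element count");
  return TensorView{src.data.data(), std::move(dims)};
}

int main() {
  OwnedTensor t{std::vector<float>(2 * 3 * 4), {2, 3, 4}};
  TensorView v = makeView(t, {2, 12}); // no copy, no new element storage
  v.data[0] = 1.0f;
  assert(t.data[0] == 1.0f);           // writes through the view alias t
  return 0;
}
```

The element-count check in makeView corresponds to the TensorViewInst::verify assertions added by this patch.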
---
 include/glow/IR/IRBuilder.h                        |  4 ++
 src/glow/Backends/Interpreter/Interpreter.cpp      | 19 ++++++++
 src/glow/Backends/Interpreter/Interpreter.h        |  5 +++
 .../Backends/Interpreter/InterpreterNodes.cpp      | 45 ++++++++++++-------
 src/glow/Backends/OpenCL/OpenCL.cpp                | 13 ++++++
 src/glow/IR/IR.cpp                                 |  3 +-
 src/glow/IR/IRBuilder.cpp                          | 23 +++++++++-
 src/glow/IR/IRGen.cpp                              | 35 ++++++++++++---
 src/glow/IR/Instrs.cpp                             | 12 +++++
 src/glow/Optimizer/IROptimizer.cpp                 | 10 ++++-
 tests/unittests/basicIRTest.cpp                    |  4 +-
 tools/ClassGen/InstrGen.cpp                        |  5 +++
 12 files changed, 152 insertions(+), 26 deletions(-)

diff --git a/include/glow/IR/IRBuilder.h b/include/glow/IR/IRBuilder.h
index b56ce6e183..cffb69f406 100644
--- a/include/glow/IR/IRBuilder.h
+++ b/include/glow/IR/IRBuilder.h
@@ -59,6 +59,10 @@ class IRBuilder {
   ReshapeInst *createReshapeOp(Value *input, llvm::ArrayRef<size_t> shape);

+  TensorViewInst *createTensorView(ElemKind elemKind,
+                                   llvm::ArrayRef<size_t> dims, Value *src,
+                                   llvm::StringRef name);
+
   TransposeInst *createTransposeOp(Value *input,
                                    llvm::ArrayRef<unsigned> shuffle);

diff --git a/src/glow/Backends/Interpreter/Interpreter.cpp b/src/glow/Backends/Interpreter/Interpreter.cpp
index 882ccb79de..ca395f0faa 100644
--- a/src/glow/Backends/Interpreter/Interpreter.cpp
+++ b/src/glow/Backends/Interpreter/Interpreter.cpp
@@ -10,6 +10,7 @@
 #include "llvm/Support/Casting.h"

 using namespace glow;
+using llvm::isa;

 Interpreter::~Interpreter() { clear(); }

@@ -70,6 +71,24 @@ Tensor *Interpreter::getOrCreateTensor(const Value *v) {
   return it->second;
 }

+Tensor *Interpreter::getOrCreateUnownedTensor(const Value *v,
+                                              const Value *src) {
+  assert(isa<TensorViewInst>(v) && "Expected a tensor view");
+
+  // Pick the tensor.
+  auto it = tensors_.find(v);
+
+  // Release unowned tensors before re-creating them.
+  if (it != tensors_.end()) {
+    deleteTensor(v);
+  }
+
+  auto *T = new Tensor();
+  *T = getTensor(src)->getUnowned(v->dims());
+  tensors_[v] = T;
+  return T;
+}
+
 void Interpreter::deleteTensor(const Value *v) {
   auto it = tensors_.find(v);
   if (it == tensors_.end()) {

diff --git a/src/glow/Backends/Interpreter/Interpreter.h b/src/glow/Backends/Interpreter/Interpreter.h
index 24df2dcfd8..55c04c1587 100644
--- a/src/glow/Backends/Interpreter/Interpreter.h
+++ b/src/glow/Backends/Interpreter/Interpreter.h
@@ -61,6 +61,11 @@ class Interpreter final : public Backend {
   /// \returns a tensor for \p v.
   Tensor *getOrCreateTensor(const Value *v);

+  /// Allocate an unowned tensor to back the value \p v. The source tensor of
+  /// the unowned tensor is provided by \p src.
+  /// \returns a tensor for \p v.
+  Tensor *getOrCreateUnownedTensor(const Value *v, const Value *src);
+
   /// If a tensor is allocated for \p v then delete it.
   void deleteTensor(const Value *v);
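The delete-then-recreate step in getOrCreateUnownedTensor matters because the interpreter may execute the same instruction repeatedly and the source tensor can be re-created between runs. A rough stand-alone model of this bookkeeping; Tensor, tensors and getOrCreateUnowned here are stand-ins, not the real Interpreter members:

```cpp
#include <map>
#include <memory>

struct Tensor {
  float *alias = nullptr;            // unowned case: points into another tensor
  std::unique_ptr<float[]> storage;  // owned case: holds the elements
};

std::map<const void *, std::unique_ptr<Tensor>> tensors;

Tensor *getOrCreateUnowned(const void *view, const void *src) {
  // Drop a stale view first: the map owns the wrapper object, but the wrapper
  // does not own the elements, so erasing it never frees the source storage.
  tensors.erase(view);
  auto t = std::make_unique<Tensor>();
  t->alias = tensors.at(src)->storage.get();
  Tensor *raw = t.get();
  tensors[view] = std::move(t);
  return raw;
}

int main() {
  int srcKey, viewKey; // addresses stand in for IR Value* keys
  auto src = std::make_unique<Tensor>();
  src->storage = std::make_unique<float[]>(8);
  tensors[&srcKey] = std::move(src);
  Tensor *v = getOrCreateUnowned(&viewKey, &srcKey); // safe to call repeatedly
  v->alias[0] = 1.0f; // the write is visible through the source's storage
  return 0;
}
```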
diff --git a/src/glow/Backends/Interpreter/InterpreterNodes.cpp b/src/glow/Backends/Interpreter/InterpreterNodes.cpp
index 6050e21f8f..fe65f7b5d2 100644
--- a/src/glow/Backends/Interpreter/InterpreterNodes.cpp
+++ b/src/glow/Backends/Interpreter/InterpreterNodes.cpp
@@ -384,23 +384,27 @@ void Interpreter::fwdFullyConnectedInst(bool isTrain,
   auto inW = getWeightHandle(I->getSrc());
   auto outW = getWeightHandle(I->getDest());

-  auto odim = flattenCdr(outW.dims());
-  auto idim = flattenCdr(inW.dims());
-  assert(odim.first == idim.first && "Mismatch batch size");
+  // outW and inW are 2-dimensional: the first dimension is the batch size
+  // and the second one is the depth.
+  auto OutWidth = outW.dims()[0];
+  auto OutDepth = outW.dims()[1];
+  auto InDepth = inW.dims()[0];
+  auto InWidth = inW.dims()[1];
+
+  assert(OutWidth == InDepth && "Mismatch batch size");

   auto filterW = getWeightHandle(I->getFilter());
   auto biasW = getWeightHandle(I->getBias());

-  size_t inputSize = idim.second;
+  size_t inputSize = InWidth;

-  for (size_t n = 0; n < odim.first; n++) {
-    size_t base = inW.getElementPtr({n});
+  for (size_t n = 0; n < OutWidth; n++) {

-    for (size_t i = 0; i < odim.second; i++) {
+    for (size_t i = 0; i < OutDepth; i++) {
       float sum = 0;
       for (size_t j = 0; j < inputSize; j++) {
-        sum += inW.raw(base + j) * filterW.at({i, j});
+        sum += inW.at({n, j}) * filterW.at({i, j});
       }

       sum += biasW.at({i});
@@ -415,8 +419,14 @@ void Interpreter::fwdFullyConnectedGradInst(bool isTrain,
   auto inG = getWeightHandle(I->getSrcGrad());
   auto outG = getWeightHandle(I->getDestGrad());

-  auto odim = flattenCdr(outG.dims());
-  auto idim = flattenCdr(inW.dims());
+  assert(inW.dims().size() == 2);
+  assert(inG.dims().size() == 2);
+
+  // outG and inW are 2-dimensional: the first dimension is the batch size
+  // and the second one is the depth.
+  auto OutWidth = outG.dims()[0];
+  auto OutDepth = outG.dims()[1];
+  auto InWidth = inW.dims()[1];

   auto filterW = getWeightHandle(I->getFilter());
   auto filterG = getWeightHandle(I->getFilterGrad());
@@ -426,20 +436,19 @@ void Interpreter::fwdFullyConnectedGradInst(bool isTrain,
   filterG.clear();
   inG.clear();

-  size_t inSize = idim.second;
+  size_t inSize = InWidth;

-  for (size_t n = 0; n < odim.first; n++) {
-    size_t base = inW.getElementPtr({n});
+  for (size_t n = 0; n < OutWidth; n++) {

     // Compute the gradient:
-    for (size_t i = 0; i < odim.second; i++) {
+    for (size_t i = 0; i < OutDepth; i++) {
       float chainGrad = outG.at({n, i});

       for (size_t j = 0, e = inSize; j < e; j++) {
         // Input gradient:
-        inG.raw(base + j) += filterW.at({i, j}) * chainGrad;
+        inG.at({n, j}) += filterW.at({i, j}) * chainGrad;
         // Param gradient:
-        filterG.at({i, j}) += inW.raw(base + j) * chainGrad;
+        filterG.at({i, j}) += inW.at({n, j}) * chainGrad;
       }

       biasG.at({i}) += chainGrad;
@@ -586,6 +595,10 @@ void Interpreter::fwdReshapeInst(bool isTrain, const ReshapeInst *I) {
   outT->copyRawFrom(inT);
 }

+void Interpreter::fwdTensorViewInst(bool isTrain, const TensorViewInst *I) {
+  getOrCreateUnownedTensor(I, I->getSrc());
+}
+
 void Interpreter::fwdZeroInst(bool isTrain, const glow::ZeroInst *I) {
   auto *T = getTensor(I->getDest());
   T->zero();
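With the source guaranteed to be 2-D, the forward pass reduces to a plain per-row dot product: out[n][i] = sum_j in[n][j] * filter[i][j] + bias[i]. A self-contained sketch of the same loop structure, as a hypothetical free function rather than the Interpreter's Handle API:

```cpp
#include <cstddef>
#include <vector>

// out[n][i] = sum_j in[n][j] * filter[i][j] + bias[i]
void fullyConnectedFwd(const std::vector<std::vector<float>> &in,
                       const std::vector<std::vector<float>> &filter,
                       const std::vector<float> &bias,
                       std::vector<std::vector<float>> &out) {
  for (size_t n = 0; n < out.size(); n++) {       // batch rows
    for (size_t i = 0; i < out[n].size(); i++) {  // output depth
      float sum = 0;
      for (size_t j = 0; j < in[n].size(); j++) { // input width
        sum += in[n][j] * filter[i][j];
      }
      out[n][i] = sum + bias[i];
    }
  }
}
```

Dropping the raw-offset arithmetic (getElementPtr/raw) in favor of 2-D at({n, j}) indexing is what the new tensor view makes possible.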
diff --git a/src/glow/Backends/OpenCL/OpenCL.cpp b/src/glow/Backends/OpenCL/OpenCL.cpp
index f859ae2792..500ac9cb39 100644
--- a/src/glow/Backends/OpenCL/OpenCL.cpp
+++ b/src/glow/Backends/OpenCL/OpenCL.cpp
@@ -365,6 +365,13 @@ void OCLBackend::doForwardPass(bool isTrain) {
       continue;
     }

+    if (auto *TV = dyn_cast<TensorViewInst>(I)) {
+      assert(tensors_[TV] == tensors_[TV->getSrc()] &&
+             "Memory address for a tensor_view should be the same as the "
+             "address of its origin");
+      continue;
+    }
+
     if (isa<CopyInst>(I) || isa<ReshapeInst>(I)) {
       auto *dest = I->getOperand(0).first;
       auto *src = I->getOperand(1).first;
@@ -457,6 +464,12 @@ void OCLBackend::init() {
       continue;
     }

+    if (auto *TV = llvm::dyn_cast<TensorViewInst>(I)) {
+      assert(!tensors_.count(TV) && "Allocation already made!");
+      tensors_[TV] = tensors_[TV->getSrc()];
+      continue;
+    }
+
     if (auto *D = llvm::dyn_cast<DeallocActivationInst>(I)) {
       auto *A = D->getAlloc();
       assert(tensors_.count(A) && "Invalid deallocation!");

diff --git a/src/glow/IR/IR.cpp b/src/glow/IR/IR.cpp
index a8a8295536..cd5173ea3e 100644
--- a/src/glow/IR/IR.cpp
+++ b/src/glow/IR/IR.cpp
@@ -394,7 +394,8 @@ static void nameInstr(std::unordered_set<std::string> &usedNames, Named *named,
 Module::Module(Graph *G) : G_(G), name_(G->getName()) {}

 static bool hasResultValue(Instruction *I) {
-  return I->getKind() == Instruction::Kind::AllocActivationInstKind;
+  return I->getKind() == Instruction::Kind::AllocActivationInstKind ||
+         I->getKind() == Instruction::Kind::TensorViewInstKind;
 }

 void Module::nameInstructions() {

diff --git a/src/glow/IR/IRBuilder.cpp b/src/glow/IR/IRBuilder.cpp
index 6318957217..70432348f9 100644
--- a/src/glow/IR/IRBuilder.cpp
+++ b/src/glow/IR/IRBuilder.cpp
@@ -101,12 +101,20 @@ FullyConnectedInst *IRBuilder::createFullyConnectedOp(Value *input,
                                                       Value *bias,
                                                       size_t outDepth) {
   TypeRef T = input->getType();
   auto idim = flattenCdr(input->dims());

+  Value *inputview = input;
+  // Create a tensor view only if the dimensionality needs to be changed.
+  if (input->dims().size() != 2)
+    inputview =
+        createTensorView(input->getElementType(), {idim.first, idim.second},
+                         input, "fctensorview");
+
   auto *dest = createAllocActivationInst("fcres", T->getElementType(),
                                          {idim.first, outDepth});

-  return createFullyConnectedInst("fc", dest, input, filter, bias, outDepth);
+  return createFullyConnectedInst("fc", dest, inputview, filter, bias,
+                                  outDepth);
 }

 ReluInst *IRBuilder::createRELUOp(Value *input) {
@@ -137,6 +145,19 @@ ReshapeInst *IRBuilder::createReshapeOp(Value *input,
   return createReshapeInst("reshape", res, input, shape);
 }

+/// Creates a tensorview instruction with the following parameters:
+/// \param elemKind the type of elements in a tensor
+/// \param dims dimensions of the view, such that the number of elements
+/// in the view is the same as the number of elements in the source tensor
+/// \p src
+/// \param src the source tensor used to create the unowned tensor.
+TensorViewInst *IRBuilder::createTensorView(ElemKind elemKind,
+                                            llvm::ArrayRef<size_t> dims,
+                                            Value *src, llvm::StringRef name) {
+  auto ty = getModule().getGraph()->uniqueType(Type(elemKind, dims));
+  return createTensorViewInst(name, src, ty);
+}
+
 TransposeInst *IRBuilder::createTransposeOp(Value *input,
                                             llvm::ArrayRef<unsigned> shuffle) {
   llvm::SmallVector<size_t, 6> shape;
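createFullyConnectedOp relies on flattenCdr to pick the 2-D view shape. Here is a stand-alone model of it, assuming the semantics implied by the call sites above, namely {N, d1, d2, ...} collapsing to {N, d1 * d2 * ...}; the real helper lives elsewhere in Glow and may differ in signature:

```cpp
#include <cstddef>
#include <utility>
#include <vector>

std::pair<size_t, size_t> flattenCdr(const std::vector<size_t> &dims) {
  size_t rest = 1;
  for (size_t i = 1; i < dims.size(); i++)
    rest *= dims[i];
  return {dims[0], rest};
}

int main() {
  auto idim = flattenCdr({8, 12, 12, 64}); // {8, 9216}
  // A tensor view with dims {8, 9216} aliases the same 8 * 9216 floats as
  // the original {8, 12, 12, 64} tensor, so no data is moved.
  return idim.first == 8 && idim.second == 9216 ? 0 : 1;
}
```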
diff --git a/src/glow/IR/IRGen.cpp b/src/glow/IR/IRGen.cpp
index f9a4b46910..dca901a314 100644
--- a/src/glow/IR/IRGen.cpp
+++ b/src/glow/IR/IRGen.cpp
@@ -70,7 +70,10 @@ struct IRGenVisitor : NodeWalker {
     }
     assert(!generatedNodeDest_.count(N) &&
            "Already generated code for this node");
-    assert(isa<AllocActivationInst>(v) && "The value must be an activation");
+    auto *dest = v;
+    if (auto *zn = dyn_cast<ZeroInst>(v))
+      dest = zn->getDest();
+    assert(isa<AllocActivationInst>(dest) && "The value must be an activation");
     generatedNodeDest_[N] = v;
   }

@@ -180,19 +183,41 @@ struct IRGenVisitor : NodeWalker {
     case glow::Kinded::Kind::FullyConnectedGradNodeKind: {
       auto *FCG = cast<FullyConnectedGradNode>(N);
-      auto *inW = valueForNode(FCG->getInput());
+      auto *InW = valueForNode(FCG->getInput());
+      // FullyConnected works with tensor views, so create them.
+      auto *InWview = InW;
+      // Create a tensor view only if the dimensionality needs to be changed.
+      if (InWview->dims().size() != 2) {
+        auto idim = flattenCdr(InW->dims());
+        InWview = builder_.createTensorView(InWview->getElementType(),
+                                            {idim.first, idim.second}, InW,
+                                            "inWtensorview");
+      }
       auto *filterW = valueForNode(FCG->getFilter());
       auto *outW = valueForNode(FCG->getGradOfOriginalOutputNamedOutput());
       auto biasX = FCG->getBias();

-      auto *InG = builder_.createAllocActivationInst("inG", inW->getType());
+      Value *InG = builder_.createAllocActivationInst("inG", InW->getType());
+      // Create a tensor view for the @out parameter InG.
+      Value *InGview = InG;
+      if (InGview->dims().size() != 2) {
+        auto idim = flattenCdr(InG->dims());
+        // The tensor view is the first use of InG and takes InG as an @in
+        // parameter. But InG is not initialized yet to be used as an @in
+        // parameter, and the IR verifier would complain about it. Therefore,
+        // we initialize InG with zeros to make the verifier happy.
+        builder_.createZeroInst("zero", InG);
+        InGview = builder_.createTensorView(InGview->getElementType(),
+                                            {idim.first, idim.second}, InG,
+                                            "inGtensorview");
+      }
       auto *FilterG =
           builder_.createAllocActivationInst("filterG", filterW->getType());
       auto *BiasG = builder_.createAllocActivationInst("biasG", biasX.getType());

-      builder_.createFullyConnectedGradInst(N->getName(), inW, filterW, outW,
-                                            InG, FilterG, BiasG,
+      builder_.createFullyConnectedGradInst(N->getName(), InWview, filterW,
+                                            outW, InGview, FilterG, BiasG,
                                             FCG->getDepth());

       registerIR(FCG->getGradOfInputNamedInput(), InG);

diff --git a/src/glow/IR/Instrs.cpp b/src/glow/IR/Instrs.cpp
index 3cc608f507..08f20ebabe 100644
--- a/src/glow/IR/Instrs.cpp
+++ b/src/glow/IR/Instrs.cpp
@@ -134,6 +134,11 @@ void FullyConnectedInst::verify() const {
   llvm::ArrayRef<size_t> expB = {Depth_};
   assert(B->dims() == expB && "Invalid output shape");
   (void)expB;
+
+  assert(src->dims().size() == 2 &&
+         "Src of a FullyConnectedInst should be 2-dimensional");
+  assert(dest->dims().size() == 2 &&
+         "Dest of a FullyConnectedInst should be 2-dimensional");
 }

 void BatchedMatMulInst::verify() const {
@@ -178,6 +183,13 @@ void ReshapeInst::verify() const {
          "Reshape into a different size");
 }

+void TensorViewInst::verify() const {
+  assert(getOperand(0).first->getType()->size() == getType()->size() &&
+         "TensorView view size should be the same as Src size");
+  assert(getOperand(0).first->getElementType() == getType()->getElementType() &&
+         "TensorView view element type should be the same as Src element type");
+}
+
 void TransposeInst::verify() const {
   auto *dest = getOperand(0).first;
   auto *src = getOperand(1).first;

diff --git a/src/glow/Optimizer/IROptimizer.cpp b/src/glow/Optimizer/IROptimizer.cpp
index ab91c84f36..d20f3f39eb 100644
--- a/src/glow/Optimizer/IROptimizer.cpp
+++ b/src/glow/Optimizer/IROptimizer.cpp
@@ -69,7 +69,7 @@ static void calculateLiveness(Module &M, LivenessMap &liveness) {
 /// Hoists Dealloc instructions right after their last use.
 static void hoistDealloc(Module &M) {
   // Maps activation instructions to their last non-dealloc user.
-  std::unordered_map<AllocActivationInst *, InstrIterator> lastUser;
+  std::unordered_map<Value *, InstrIterator> lastUser;
   auto &instrs = M.getInstrs();

   // Record the last use of each dealloc.
@@ -82,6 +82,14 @@ static void hoistDealloc(Module &M) {
       if (auto alloc = dyn_cast<AllocActivationInst>(op)) {
         lastUser[alloc] = it;
       }
+
+      if (auto tensorView = dyn_cast<TensorViewInst>(op)) {
+        // Consider any use of a tensor_view to also be a use of its source
+        // tensor. This is required to make sure that the lifetime of a
+        // tensor_view is always enclosed inside the lifetime of its source
+        // tensor.
+        lastUser[tensorView->getSrc()] = it;
+      }
     }
   }
diff --git a/tests/unittests/basicIRTest.cpp b/tests/unittests/basicIRTest.cpp
index 34ebdc6315..dd298f7375 100644
--- a/tests/unittests/basicIRTest.cpp
+++ b/tests/unittests/basicIRTest.cpp
@@ -85,7 +85,6 @@ TEST(IR, allInstrs) {

   auto *I3 = builder.createWeightVar(ElemKind::FloatTy, {1, 12, 12, 64});
   auto *I4 = builder.createWeightVar(ElemKind::FloatTy, {1, 12, 12, 3});
-  auto *I5 = builder.createWeightVar(ElemKind::FloatTy, {1, 32});
   auto *I6 = builder.createWeightVar(ElemKind::FloatTy, {2, 12, 12, 64});
   auto *I7 = builder.createWeightVar(T1, "I7");
   auto *I8 = builder.createWeightVar(ElemKind::FloatTy, {1, 24, 3, 24}, "I8");
@@ -109,12 +108,13 @@ TEST(IR, allInstrs) {
   builder.createCopyInst("", I1, I0);
   builder.createConvolutionInst("", I3, I1, F0, B0, 7, 2, 3, 64);
   builder.createPoolMaxInst("", I4, I0, XY, 7, 2, 3);
-  builder.createFullyConnectedInst("", I5, I0, F1, B1, 32);
+  builder.createFullyConnectedOp(I0, F1, B1, 32);
   builder.createReluInst("", I1, I0);
   builder.createSigmoidInst("", I1, I0);
   builder.createTanhInst("", I1, I0);
   builder.createSoftMaxInst("", I1, I0, I7, E0);
   builder.createTransposeInst("", I8, I2, {0, 3, 1, 2});
+  builder.createTensorView(ElemKind::FloatTy, {1, 24, 3, 24}, I2, "I2_view");
   builder.createInsertTensorInst("", I6, I3, {0, 0, 0, 0});
   builder.createBatchNormalizationInst("", I1, I0, S0, S0, S0, S0, 3, 0.01,
                                        0.9);

diff --git a/tools/ClassGen/InstrGen.cpp b/tools/ClassGen/InstrGen.cpp
index 344a467437..75a3e67823 100644
--- a/tools/ClassGen/InstrGen.cpp
+++ b/tools/ClassGen/InstrGen.cpp
@@ -31,6 +31,11 @@ int main(int argc, char **argv) {
       .addMember(MemberType::TypeRef, "Ty")
       .setType("Ty");

+  BB.newInstr("TensorView")
+      .addOperand("Src", OperandKind::In)
+      .addMember(MemberType::TypeRef, "Ty")
+      .setType("Ty");
+
   BB.newInstr("DeallocActivation")
       .addOperand("Src", OperandKind::Out)
       .addExtraMethod("AllocActivationInst *getAlloc() const { return