From d13dab1d30a66c954f6a8c1095d49af014659b3f Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Wed, 10 Jul 2024 21:36:04 +0200
Subject: [PATCH 01/17] dpu: llvm: DPUTargetLowering::LowerOperation: cleaner
 default case

---
 llvm/lib/Target/DPU/DPUTargetLowering.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
index 95ed30c7086ec..701d050338ba3 100644
--- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
@@ -382,23 +382,22 @@ SDValue DPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerVAARG(Op, DAG);
 
   default: {
-    const char *NodeName = getTargetNodeName(Op.getOpcode());
     LLVM_DEBUG({
       dbgs() << "FAIL: ";
       Op.dump(&DAG);
-    });
-    if (NodeName != nullptr) {
-      LLVM_DEBUG(dbgs() << "\tnode name = " << NodeName << "\n");
-    }
-    for (unsigned eachOp = 0; eachOp < Op.getNumOperands(); eachOp++) {
-      LLVM_DEBUG({
+      dbgs() << "\n";
+      const char *NodeName = getTargetNodeName(Op.getOpcode());
+      if (NodeName != nullptr) {
+        dbgs() << "\tnode name = " << NodeName << "\n";
+      }
+      for (unsigned eachOp = 0; eachOp < Op.getNumOperands(); eachOp++) {
         dbgs() << "\toperand #" << std::to_string(eachOp) << " = ";
         Op.getOperand(eachOp).dump(&DAG);
+      }
       });
-    }
-  }
     report_fatal_error("NOT implemented: lowering of such a type of SDValue");
   }
+  }
 }
 
 const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const {

From 7a5ebcb064f87cdd0edd7e5a54c2c66d5498d225 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Wed, 10 Jul 2024 21:45:52 +0200
Subject: [PATCH 02/17] dpu: llvm:
 DPUMergeComboInstrPass::runOnMachineFunction: improve tracking of change made

---
 llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index 998d4f0d4bcc5..135fd3b7c5c40 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -822,8 +822,17 @@ bool DPUMergeComboInstrPass::runOnMachineFunction(MachineFunction &MF) {
     MachineBasicBlock *MBB = &MFI;
 
     LLVM_DEBUG(MBB->dump());
-    changeMade |= mergeComboInstructionsInMBB(MBB, InstrInfo);
+    bool local_change = mergeComboInstructionsInMBB(MBB, InstrInfo);
+    if (local_change) {
+      LLVM_DEBUG({
+        dbgs() << "\nchanged to:\n";
+        MBB->dump();
+      });
+      changeMade = true;
+    }
   }
 
+  LLVM_DEBUG(dbgs() << "********** DPU/MergeComboInstrPass: " << MF.getName()
+                    << " done: changeMade = " << changeMade << " **********\n\n");
   return changeMade;
 }

From 210bb44ae5c79d6091194775a466a366016fc7bd Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Thu, 11 Jul 2024 21:59:41 +0200
Subject: [PATCH 03/17] dpu: llvm: add constant register for lowering

---
 llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp | 53 ++++++++++++++++++++++++-
 llvm/lib/Target/DPU/DPURegisterInfo.cpp | 16 ++++++++
 llvm/lib/Target/DPU/DPURegisterInfo.h   |  2 +
 llvm/lib/Target/DPU/DPURegisterInfo.td  |  4 +-
 4 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
index c501d43ed7a89..00adb6c2b9f6e 100644
--- a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
@@ -387,10 +387,61 @@ void DPUDAGToDAGISel::Select(SDNode *Node) {
     return;
   }
 
+  EVT VT = Node->getValueType(0);
+
   switch (Opcode) {
+  case ISD::Constant: {
+    LLVM_DEBUG({dbgs() << "a constant: "; Node->dump();});
+    if (VT == MVT::i32) {
+      // Materialize some constants as copies from constant register.
+      // This allows the coalescer to propagate these into other instructions.
+      ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+      if (ConstNode->isNullValue()) {
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+					     DPU::ZERO, MVT::i32);
+	ReplaceNode(Node, New.getNode());
+	return;
+      } else if (ConstNode->isOne()) {
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+					     DPU::ONE, MVT::i32);
+	ReplaceNode(Node, New.getNode());
+	return;
+      } else if (ConstNode->isAllOnesValue()) {
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+					     DPU::LNEG, MVT::i32);
+	ReplaceNode(Node, New.getNode());
+	return;
+      } else {
+	const ConstantInt *Cst = ConstNode->getConstantIntValue();
+	if (Cst->isMinValue(/* signed = */ true)) {
+	  SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+					       DPU::MNEG, MVT::i32);
+	  ReplaceNode(Node, New.getNode());
+	  return;
+	}
+      }
+    } else if (VT == MVT::i64) {
+      ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+      if (ConstNode->isNullValue()) {
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+					     DPU::ZERO, MVT::i32);
+	auto *NewMove = CurDAG->getMachineNode(DPU::MOVE_Srr, SDLoc(Node), VT,
+					       New);
+	ReplaceNode(Node, NewMove);
+	return;
+      } else if (ConstNode->isOne()) {
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+					     DPU::ONE, MVT::i32);
+	auto *NewMove = CurDAG->getMachineNode(DPU::MOVE_Srr, SDLoc(Node), VT,
+					       New);
+	ReplaceNode(Node, NewMove);
+	return;
+      }
+    }
+    break;
+  }
   case ISD::FrameIndex: {
     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
-    EVT VT = Node->getValueType(0);
     SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
     unsigned Opc = DPU::ADDrri;
     if (Node->hasOneUse()) {
diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.cpp b/llvm/lib/Target/DPU/DPURegisterInfo.cpp
index 778ac2343a5c4..e66b657ea91f3 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.cpp
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.cpp
@@ -167,3 +167,19 @@ DPURegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
                                       CallingConv::ID /*CC*/) const {
   return CSR_RegMask;
 }
+
+bool DPURegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
+  switch(PhysReg) {
+  default:
+    return false;
+  case DPU::ZERO:
+  case DPU::ONE:
+  case DPU::LNEG:
+  case DPU::MNEG:
+  // The four value registers above are the only ones reported as constant.
+  // ID, ID2, ID4 and ID8 are not: their cases are disabled, so they fall to
+  // `default` and return false.
+  // NOTE(review): presumably left out on purpose -- confirm before enabling.
+    return true;
+  }
+}
diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.h b/llvm/lib/Target/DPU/DPURegisterInfo.h
index 5d769d6a0d9d7..25d9c575a3967 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.h
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.h
@@ -37,6 +37,8 @@ struct DPURegisterInfo : public DPUGenRegisterInfo {
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID) const override;
 
+  bool isConstantPhysReg(MCRegister PhysReg) const override;
+
   bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
   }
diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.td b/llvm/lib/Target/DPU/DPURegisterInfo.td
index caa0d84670555..5bf3c3776500a 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.td
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.td
@@ -139,7 +139,9 @@ def MAJ_ID8:   DPUReg<31, "ID8", [], [ID8]>;
 // that can be used as an instruction operand.
 // Hide the reserved registers, so that we are very sure that the compiler will
 // not do anything with them.
-def GP_REG    : RegisterClass<"DPU", [i32], 32, (add (sequence "R%u", 0, 23), (sequence "MAJ_R%u", 0, 23))>;
+def GP_REG    : RegisterClass<"DPU", [i32], 32, (add (sequence "R%u", 0, 23), (sequence "MAJ_R%u", 0, 23), ZERO, ONE,
+// LNEG,    <-- there is an issue with this one:        lsr_add r2, lneg, r2, 3  seems to be understood as sats r2, r2 ... encoding problem???
+ MNEG)>;
 def CONST_REG : RegisterClass<"DPU", [i32], 32, (add ZERO, ONE, LNEG, MNEG, MAJ_ZERO, MAJ_ONE, MAJ_LNEG, MAJ_MNEG)>;
 def ID_REG    : RegisterClass<"DPU", [i32], 32, (add ID, ID2, ID4, ID8, MAJ_ID, MAJ_ID2, MAJ_ID4, MAJ_ID8)>;
 def ZERO_REG  : RegisterClass<"DPU", [i32], 32, (add ZERO, MAJ_ZERO)>;

From ad6f57a35603745de7e000fc64829c8c43b16c09 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Fri, 12 Jul 2024 20:13:53 +0200
Subject: [PATCH 04/17] dpu: llvm: correcting IR generation

---
 llvm/lib/Target/DPU/DPUFrameLowering.cpp | 3 ++-
 llvm/lib/Target/DPU/DPUInstrInfo.cpp     | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUFrameLowering.cpp b/llvm/lib/Target/DPU/DPUFrameLowering.cpp
index 8bf3c6c06650b..026354d10e304 100644
--- a/llvm/lib/Target/DPU/DPUFrameLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUFrameLowering.cpp
@@ -85,7 +85,8 @@ void DPUFrameLowering::emitPrologue(MachineFunction &MF,
         .addCFIIndex(CFIIndex)
         .setMIFlag(MachineInstr::FrameSetup);
 
-    BuildMI(MBB, MBBI, DL, DPUII.get(DPU::SDrir), DPU::R22)
+    BuildMI(MBB, MBBI, DL, DPUII.get(DPU::SDrir))
+        .addReg(DPU::R22)
         .addImm(StackSize - STACK_SIZE_FOR_D22)
         .addReg(DPU::D22);
     BuildMI(MBB, MBBI, DL, DPUII.get(DPU::ADDrri), DPU::R22)
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index db957f97bcaa9..810ab6e792af2 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -106,9 +106,9 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::JUMPr)).addReg(DPU::R23);
     break;
   case DPU::CALLi:
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri))
-        .addReg(DPU::R23)
-        .add(MI.getOperand(0));
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri), DPU::R23)
+        .add(MI.getOperand(0))
+        .copyImplicitOps(MI);
     break;
   case DPU::CALLr:
     BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr))

From cd1553f87f02f1b2f523d3068b243526a78a50fa Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Tue, 16 Jul 2024 10:21:25 +0200
Subject: [PATCH 05/17] dpu: llvm: register simplify and remove lneg,mneg

---
 llvm/lib/Target/DPU/DPURegisterInfo.cpp |  22 ++---
 llvm/lib/Target/DPU/DPURegisterInfo.td  | 120 ++++++++++++++----------
 2 files changed, 79 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.cpp b/llvm/lib/Target/DPU/DPURegisterInfo.cpp
index e66b657ea91f3..ad7db1f538ada 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.cpp
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.cpp
@@ -61,17 +61,17 @@ BitVector DPURegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   reserved.set(DPU::ID2);
   reserved.set(DPU::ID4);
   reserved.set(DPU::ID8);
-  reserved.set(DPU::MAJ_D22);
-  reserved.set(DPU::MAJ_R22);
-  reserved.set(DPU::MAJ_R23);
-  reserved.set(DPU::MAJ_ZERO);
-  reserved.set(DPU::MAJ_ONE);
-  reserved.set(DPU::MAJ_LNEG);
-  reserved.set(DPU::MAJ_MNEG);
-  reserved.set(DPU::MAJ_ID);
-  reserved.set(DPU::MAJ_ID2);
-  reserved.set(DPU::MAJ_ID4);
-  reserved.set(DPU::MAJ_ID8);
+  // reserved.set(DPU::MAJ_D22);
+  // reserved.set(DPU::MAJ_R22);
+  // reserved.set(DPU::MAJ_R23);
+  // reserved.set(DPU::MAJ_ZERO);
+  // reserved.set(DPU::MAJ_ONE);
+  // reserved.set(DPU::MAJ_LNEG);
+  // reserved.set(DPU::MAJ_MNEG);
+  // reserved.set(DPU::MAJ_ID);
+  // reserved.set(DPU::MAJ_ID2);
+  // reserved.set(DPU::MAJ_ID4);
+  // reserved.set(DPU::MAJ_ID8);
   return reserved;
 }
 
diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.td b/llvm/lib/Target/DPU/DPURegisterInfo.td
index 5bf3c3776500a..cbd6215ca8d88 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.td
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.td
@@ -56,31 +56,31 @@ def R22 : DPUReg<22, "r22">, DwarfRegNum<[22]>;
 // R23: reserved as the return address for functions
 def R23 : DPUReg<23, "r23">, DwarfRegNum<[23]>;
 
-// Thread data registers
-def MAJ_R0 : DPUReg<0, "R0", [], [R0]>, DwarfRegNum<[0]>;
-def MAJ_R1 : DPUReg<1, "R1", [], [R1]>, DwarfRegNum<[1]>;
-def MAJ_R2 : DPUReg<2, "R2", [], [R2]>, DwarfRegNum<[2]>;
-def MAJ_R3 : DPUReg<3, "R3", [], [R3]>, DwarfRegNum<[3]>;
-def MAJ_R4 : DPUReg<4, "R4", [], [R4]>, DwarfRegNum<[4]>;
-def MAJ_R5 : DPUReg<5, "R5", [], [R5]>, DwarfRegNum<[5]>;
-def MAJ_R6 : DPUReg<6, "R6", [], [R6]>, DwarfRegNum<[6]>;
-def MAJ_R7 : DPUReg<7, "R7", [], [R7]>, DwarfRegNum<[7]>;
-def MAJ_R8 : DPUReg<8, "R8", [], [R8]>, DwarfRegNum<[8]>;
-def MAJ_R9 : DPUReg<9, "R9", [], [R9]>, DwarfRegNum<[9]>;
-def MAJ_R10 : DPUReg<10, "R10", [], [R10]>, DwarfRegNum<[10]>;
-def MAJ_R11 : DPUReg<11, "R11", [], [R11]>, DwarfRegNum<[11]>;
-def MAJ_R12 : DPUReg<12, "R12", [], [R12]>, DwarfRegNum<[12]>;
-def MAJ_R13 : DPUReg<13, "R13", [], [R13]>, DwarfRegNum<[13]>;
-def MAJ_R14 : DPUReg<14, "R14", [], [R14]>, DwarfRegNum<[14]>;
-def MAJ_R15 : DPUReg<15, "R15", [], [R15]>, DwarfRegNum<[15]>;
-def MAJ_R16 : DPUReg<16, "R16", [], [R16]>, DwarfRegNum<[16]>;
-def MAJ_R17 : DPUReg<17, "R17", [], [R17]>, DwarfRegNum<[17]>;
-def MAJ_R18 : DPUReg<18, "R18", [], [R18]>, DwarfRegNum<[18]>;
-def MAJ_R19 : DPUReg<19, "R19", [], [R19]>, DwarfRegNum<[19]>;
-def MAJ_R20 : DPUReg<20, "R20", [], [R20]>, DwarfRegNum<[20]>;
-def MAJ_R21 : DPUReg<21, "R21", [], [R21]>, DwarfRegNum<[21]>;
-def MAJ_R22 : DPUReg<22, "R22", [], [R22]>, DwarfRegNum<[22]>;
-def MAJ_R23 : DPUReg<23, "R23", [], [R23]>, DwarfRegNum<[23]>;
+// // Thread data registers
+// def MAJ_R0 : DPUReg<0, "R0", [], [R0]>, DwarfRegNum<[0]>;
+// def MAJ_R1 : DPUReg<1, "R1", [], [R1]>, DwarfRegNum<[1]>;
+// def MAJ_R2 : DPUReg<2, "R2", [], [R2]>, DwarfRegNum<[2]>;
+// def MAJ_R3 : DPUReg<3, "R3", [], [R3]>, DwarfRegNum<[3]>;
+// def MAJ_R4 : DPUReg<4, "R4", [], [R4]>, DwarfRegNum<[4]>;
+// def MAJ_R5 : DPUReg<5, "R5", [], [R5]>, DwarfRegNum<[5]>;
+// def MAJ_R6 : DPUReg<6, "R6", [], [R6]>, DwarfRegNum<[6]>;
+// def MAJ_R7 : DPUReg<7, "R7", [], [R7]>, DwarfRegNum<[7]>;
+// def MAJ_R8 : DPUReg<8, "R8", [], [R8]>, DwarfRegNum<[8]>;
+// def MAJ_R9 : DPUReg<9, "R9", [], [R9]>, DwarfRegNum<[9]>;
+// def MAJ_R10 : DPUReg<10, "R10", [], [R10]>, DwarfRegNum<[10]>;
+// def MAJ_R11 : DPUReg<11, "R11", [], [R11]>, DwarfRegNum<[11]>;
+// def MAJ_R12 : DPUReg<12, "R12", [], [R12]>, DwarfRegNum<[12]>;
+// def MAJ_R13 : DPUReg<13, "R13", [], [R13]>, DwarfRegNum<[13]>;
+// def MAJ_R14 : DPUReg<14, "R14", [], [R14]>, DwarfRegNum<[14]>;
+// def MAJ_R15 : DPUReg<15, "R15", [], [R15]>, DwarfRegNum<[15]>;
+// def MAJ_R16 : DPUReg<16, "R16", [], [R16]>, DwarfRegNum<[16]>;
+// def MAJ_R17 : DPUReg<17, "R17", [], [R17]>, DwarfRegNum<[17]>;
+// def MAJ_R18 : DPUReg<18, "R18", [], [R18]>, DwarfRegNum<[18]>;
+// def MAJ_R19 : DPUReg<19, "R19", [], [R19]>, DwarfRegNum<[19]>;
+// def MAJ_R20 : DPUReg<20, "R20", [], [R20]>, DwarfRegNum<[20]>;
+// def MAJ_R21 : DPUReg<21, "R21", [], [R21]>, DwarfRegNum<[21]>;
+// def MAJ_R22 : DPUReg<22, "R22", [], [R22]>, DwarfRegNum<[22]>;
+// def MAJ_R23 : DPUReg<23, "R23", [], [R23]>, DwarfRegNum<[23]>;
 
 // Thread data registers, extended to 64 bits.
 let SubRegIndices = [sub_32bit, sub_32bit_hi], CoveredBySubRegs = 1 in {
@@ -97,39 +97,39 @@ let SubRegIndices = [sub_32bit, sub_32bit_hi], CoveredBySubRegs = 1 in {
   def D20 : DPUReg<20, "d20", [R21, R20]>;
   def D22 : DPUReg<22, "d22", [R23, R22]>;
 
-  def MAJ_D0 : DPUReg<0, "D0", [MAJ_R1, MAJ_R0], [D0]>;
-  def MAJ_D2 : DPUReg<2, "D2", [MAJ_R3, MAJ_R2], [D2]>;
-  def MAJ_D4 : DPUReg<4, "D4", [MAJ_R5, MAJ_R4], [D4]>;
-  def MAJ_D6 : DPUReg<6, "D6", [MAJ_R7, MAJ_R6], [D6]>;
-  def MAJ_D8 : DPUReg<8, "D8", [MAJ_R9, MAJ_R8], [D8]>;
-  def MAJ_D10 : DPUReg<10, "D10", [MAJ_R11, MAJ_R10], [D10]>;
-  def MAJ_D12 : DPUReg<12, "D12", [MAJ_R13, MAJ_R12], [D12]>;
-  def MAJ_D14 : DPUReg<14, "D14", [MAJ_R15, MAJ_R14], [D14]>;
-  def MAJ_D16 : DPUReg<16, "D16", [MAJ_R17, MAJ_R16], [D16]>;
-  def MAJ_D18 : DPUReg<18, "D18", [MAJ_R19, MAJ_R18], [D18]>;
-  def MAJ_D20 : DPUReg<20, "D20", [MAJ_R21, MAJ_R20], [D20]>;
-  def MAJ_D22 : DPUReg<22, "D22", [MAJ_R23, MAJ_R22], [D22]>;
+  // def MAJ_D0 : DPUReg<0, "D0", [MAJ_R1, MAJ_R0], [D0]>;
+  // def MAJ_D2 : DPUReg<2, "D2", [MAJ_R3, MAJ_R2], [D2]>;
+  // def MAJ_D4 : DPUReg<4, "D4", [MAJ_R5, MAJ_R4], [D4]>;
+  // def MAJ_D6 : DPUReg<6, "D6", [MAJ_R7, MAJ_R6], [D6]>;
+  // def MAJ_D8 : DPUReg<8, "D8", [MAJ_R9, MAJ_R8], [D8]>;
+  // def MAJ_D10 : DPUReg<10, "D10", [MAJ_R11, MAJ_R10], [D10]>;
+  // def MAJ_D12 : DPUReg<12, "D12", [MAJ_R13, MAJ_R12], [D12]>;
+  // def MAJ_D14 : DPUReg<14, "D14", [MAJ_R15, MAJ_R14], [D14]>;
+  // def MAJ_D16 : DPUReg<16, "D16", [MAJ_R17, MAJ_R16], [D16]>;
+  // def MAJ_D18 : DPUReg<18, "D18", [MAJ_R19, MAJ_R18], [D18]>;
+  // def MAJ_D20 : DPUReg<20, "D20", [MAJ_R21, MAJ_R20], [D20]>;
+  // def MAJ_D22 : DPUReg<22, "D22", [MAJ_R23, MAJ_R22], [D22]>;
 }
 
 // Constant registers.
 def ZERO: DPUReg<24, "zero">;
-def MAJ_ZERO: DPUReg<24, "ZERO", [], [ZERO]>;
+// def MAJ_ZERO: DPUReg<24, "ZERO", [], [ZERO]>;
 def ONE: DPUReg<25, "one">;
-def MAJ_ONE: DPUReg<25, "ONE", [], [ONE]>;
+// def MAJ_ONE: DPUReg<25, "ONE", [], [ONE]>;
 def LNEG: DPUReg<26, "lneg">;
-def MAJ_LNEG: DPUReg<26, "LNEG", [], [LNEG]>;
+// def MAJ_LNEG: DPUReg<26, "LNEG", [], [LNEG]>;
 def MNEG:  DPUReg<27, "mneg">;
-def MAJ_MNEG: DPUReg<27, "MNEG", [], [MNEG]>;
+// def MAJ_MNEG: DPUReg<27, "MNEG", [], [MNEG]>;
 // Thread id registers. Return the thread identification for the
 // current thread, times 1, 2, 4, 8.
 def ID:    DPUReg<28, "id">;
 def ID2:   DPUReg<29, "id2">;
 def ID4:   DPUReg<30, "id4">;
 def ID8:   DPUReg<31, "id8">;
-def MAJ_ID:    DPUReg<28, "ID", [], [ID]>;
-def MAJ_ID2:   DPUReg<29, "ID2", [], [ID2]>;
-def MAJ_ID4:   DPUReg<30, "ID4", [], [ID4]>;
-def MAJ_ID8:   DPUReg<31, "ID8", [], [ID8]>;
+// def MAJ_ID:    DPUReg<28, "ID", [], [ID]>;
+// def MAJ_ID2:   DPUReg<29, "ID2", [], [ID2]>;
+// def MAJ_ID4:   DPUReg<30, "ID4", [], [ID4]>;
+// def MAJ_ID8:   DPUReg<31, "ID8", [], [ID8]>;
 
 // Define the register class representing this bank of general
 // purpose registers used by ONE thread.
@@ -139,18 +139,34 @@ def MAJ_ID8:   DPUReg<31, "ID8", [], [ID8]>;
 // that can be used as an instruction operand.
 // Hide the reserved registers, so that we are very sure that the compiler will
 // not do anything with them.
-def GP_REG    : RegisterClass<"DPU", [i32], 32, (add (sequence "R%u", 0, 23), (sequence "MAJ_R%u", 0, 23), ZERO, ONE,
-// LNEG,    <-- there is an issue with this one:        lsr_add r2, lneg, r2, 3  seems to be understood as sats r2, r2 ... encoding problem???
- MNEG)>;
-def CONST_REG : RegisterClass<"DPU", [i32], 32, (add ZERO, ONE, LNEG, MNEG, MAJ_ZERO, MAJ_ONE, MAJ_LNEG, MAJ_MNEG)>;
-def ID_REG    : RegisterClass<"DPU", [i32], 32, (add ID, ID2, ID4, ID8, MAJ_ID, MAJ_ID2, MAJ_ID4, MAJ_ID8)>;
-def ZERO_REG  : RegisterClass<"DPU", [i32], 32, (add ZERO, MAJ_ZERO)>;
+def GP_REG    : RegisterClass<"DPU", [i32], 32, (add (sequence "R%u", 0, 23)
+// , (sequence "MAJ_R%u", 0, 23)
+, ZERO
+, ONE
+// ,LNEG <-- there is an issue with this one:        lsr_add r2, lneg, r2, 3  seems to be understood as sats r2, r2 ... encoding problem???
+// ,MNEG <-- this one as well
+)>;
+
+def CONST_REG : RegisterClass<"DPU", [i32], 32, (add ZERO, ONE, LNEG, MNEG
+// , MAJ_ZERO, MAJ_ONE, MAJ_LNEG, MAJ_MNEG
+)>;
+
+def ID_REG    : RegisterClass<"DPU", [i32], 32, (add ID, ID2, ID4, ID8
+//, MAJ_ID, MAJ_ID2, MAJ_ID4, MAJ_ID8
+)>;
+
+def ZERO_REG  : RegisterClass<"DPU", [i32], 32, (add ZERO
+// , MAJ_ZERO
+)>;
+
 def OP_REG    : RegisterClass<"DPU", [i32], 32, (add GP_REG, CONST_REG, ID_REG)>;
 def GPZ_REG   : RegisterClass<"DPU", [i32], 32, (add GP_REG, ZERO_REG)>;
 
 // 64 bits registers are the combinations of 2 consecutive registers.
 def GP64_REG  : RegisterClass<"DPU", [i64], 64,
-                          (add D0, D2, D4, D6, D8, D10, D12, D14, D16, D18, D20, D22, MAJ_D0, MAJ_D2, MAJ_D4, MAJ_D6, MAJ_D8, MAJ_D10, MAJ_D12, MAJ_D14, MAJ_D16, MAJ_D18, MAJ_D20, MAJ_D22)>;
+                          (add D0, D2, D4, D6, D8, D10, D12, D14, D16, D18, D20, D22
+			  // , MAJ_D0, MAJ_D2, MAJ_D4, MAJ_D6, MAJ_D8, MAJ_D10, MAJ_D12, MAJ_D14, MAJ_D16, MAJ_D18, MAJ_D20, MAJ_D22
+			  )>;
 
 def S0:   DPUReg<0, "s0">;
 def S1:   DPUReg<1, "s1">;

From 45e4cc850dbc821b4e60a36f8b3bd6df5dffaf9a Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Tue, 16 Jul 2024 10:25:37 +0200
Subject: [PATCH 06/17] dpu: llvm: correct some def

---
 llvm/lib/Target/DPU/DPUInstrInfo.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index 810ab6e792af2..482f1eb0653c3 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -111,9 +111,9 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
         .copyImplicitOps(MI);
     break;
   case DPU::CALLr:
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr))
-        .addReg(DPU::R23)
-        .add(MI.getOperand(0));
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr), DPU::R23)
+        .add(MI.getOperand(0))
+        .copyImplicitOps(MI);
     break;
   case DPU::ADD_VAStart: { // Get the first index in stack where the first
                            // vaargs is stored
@@ -122,8 +122,7 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       StackSize = MF->getFrameInfo().getStackSize();
     }
     unsigned int ResultReg = MI.getOperand(0).getReg();
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif))
-        .addReg(ResultReg)
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif), ResultReg)
         .addReg(DPU::R22)
         .addImm(StackSize + STACK_SIZE_FOR_D22)
         .addImm(DPUAsmCondition::Condition::False);

From 81385dccc36b1807e186cd4ce14db22e62103d6d Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Tue, 16 Jul 2024 10:51:21 +0200
Subject: [PATCH 07/17] dpu: llvm: DPUResolveMacroInstrPass: correct some

---
 .../Target/DPU/DPUResolveMacroInstrPass.cpp   | 48 +++++++++++++++----
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
index bbfb4fec0d67e..4168c5af3b937 100644
--- a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
@@ -181,18 +181,31 @@ static void resolveJeq64(MachineBasicBlock *MBB,
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   MachineFunction::iterator I = ++MBB->getIterator();
   MachineFunction *F = MBB->getParent();
+
+  bool need_splice = std::next(MBBIter) != MBB->end();
+
+  MachineBasicBlock *FTMBB = MBB->getFallThrough();
+
   MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *endMBB;
+  MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB();
+
   F->insert(I, trueMBB);
-  F->insert(I, endMBB);
-  // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  if (need_splice) {
+    endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(I, endMBB);
+    // Update machine-CFG edges by transferring all successors of the current
+    // block to the new block which will contain the Phi node for the select.
+    endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
+    endMBB->transferSuccessorsAndUpdatePHIs(MBB);
+    MBB->addSuccessor(endMBB);
+  } else {
+    endMBB = FTMBB;
+    MBB->removeSuccessor(JumpMBB);
+  }
+
   // Next, add the true and fallthrough blocks as its successors.
-  auto JumpMBB = MBBIter->getOperand(3).getMBB();
   MBB->addSuccessor(trueMBB);
-  MBB->addSuccessor(endMBB);
   trueMBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(endMBB);
 
@@ -215,6 +228,9 @@ static void resolveJeq64(MachineBasicBlock *MBB,
       .addReg(MsbOp1Reg)
       .addReg(MsbOp2Reg)
       .addMBB(JumpMBB);
+
+  trueMBB->addLiveIn(MsbOp1Reg);
+  trueMBB->addLiveIn(MsbOp2Reg);
 }
 
 static void resolveJneq64(MachineBasicBlock *MBB,
@@ -227,6 +243,7 @@ static void resolveJneq64(MachineBasicBlock *MBB,
   MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
   F->insert(I, trueMBB);
   F->insert(I, endMBB);
+
   // Update machine-CFG edges by transferring all successors of the current
   // block to the new block which will contain the Phi node for the select.
   endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
@@ -257,6 +274,9 @@ static void resolveJneq64(MachineBasicBlock *MBB,
       .addReg(MsbOp1Reg)
       .addReg(MsbOp2Reg)
       .addMBB(JumpMBB);
+  trueMBB->addLiveIn(MsbOp1Reg);
+  trueMBB->addLiveIn(MsbOp2Reg);
+  endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
 }
 
 static void resolveJcc64AsSub64(MachineBasicBlock *MBB,
@@ -496,8 +516,18 @@ bool DPUResolveMacroInstrPass::runOnMachineFunction(MachineFunction &MF) {
 
   for (auto &MFI : MF) {
     MachineBasicBlock *MBB = &MFI;
-    changeMade |= resolveMacroInstructionsInMBB(MBB, InstrInfo);
+    LLVM_DEBUG({MBB->dump();});
+    bool local_change = resolveMacroInstructionsInMBB(MBB, InstrInfo);
+    if (local_change) {
+      LLVM_DEBUG({
+	  dbgs() << "change to:\n";
+	  MBB->dump();
+	});
+      changeMade = true;
+    }
   }
 
+  LLVM_DEBUG(dbgs() << "********** DPU/ResolveMacroInstrPass: " << MF.getName()
+	     << " done: changeMade = " << changeMade << " **********\n\n");
   return changeMade;
 }

From 0cdb611e58f6c176275ae7c6e933aca83f3a9bf5 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Sat, 20 Jul 2024 12:14:02 +0200
Subject: [PATCH 08/17] wip: stay in correct SSA form

The idea is to generate correct SSA form in EmitInstrWithCustomInserter.
Currently, we introduce arith+cmp+branch instructions, which seems to be
wrong for SSA a la LLVM. Many passes during reg_alloc/phi_elim morph defs
into uses in those instructions; they also put some stuff after the
terminator for stack frame management and potentially put a COPY before
the def.

So this is a WIP to actually keep correct SSA form at this stage.
To do so, we do the actual arith+cmp+branch with simple instructions,
and add enough information to keep those instructions together/adjacent,
as we know they will be merged back together at a later stage.
Try to use metadata and implement the tweaker for LLVM internal passes.
---
 llvm/lib/Target/DPU/DPUInstrInfo.cpp          |  93 ++++-
 llvm/lib/Target/DPU/DPUInstrInfo.h            |   2 +
 llvm/lib/Target/DPU/DPUMCInstLower.cpp        |   1 +
 llvm/lib/Target/DPU/DPUMacroFusion.cpp        |  49 ++-
 .../lib/Target/DPU/DPUMergeComboInstrPass.cpp | 105 ++++-
 llvm/lib/Target/DPU/DPURegisterInfo.td        |  15 +-
 .../Target/DPU/DPUResolveMacroInstrPass.cpp   | 367 +++++++++++-----
 llvm/lib/Target/DPU/DPUTargetLowering.cpp     | 395 ++++++++++++------
 8 files changed, 785 insertions(+), 242 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index 482f1eb0653c3..cab73d689b44b 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -53,7 +53,9 @@ void DPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
   DebugLoc DL = (I != MBB.end()) ? I->getDebugLoc() : DebugLoc();
-  unsigned Opcode = (RC == &DPU::GP_REGRegClass) ? DPU::SWrir : DPU::SDrir;
+  unsigned Opcode = (RC == &DPU::GP_REGRegClass
+		     // || RC == &DPU::GPZ_REGRegClass
+		     ) ? DPU::SWrir : DPU::SDrir;
 
   LLVM_DEBUG({
     dbgs() << "DPU/Instr - storeRegToStackSlot DestReg="
@@ -82,7 +84,9 @@ void DPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   DebugLoc DL;
   if (I != MBB.end())
     DL = I->getDebugLoc();
-  unsigned Opcode = (RC == &DPU::GP_REGRegClass) ? DPU::LWrri : DPU::LDrri;
+  unsigned Opcode = (RC == &DPU::GP_REGRegClass
+		     // || RC == &DPU::GPZ_REGRegClass
+		     ) ? DPU::LWrri : DPU::LDrri;
   LLVM_DEBUG({
     dbgs() << "DPU/Instr - loadRegFromStackSlot DestReg="
            << std::to_string(DestReg) << " Opcode= " << std::to_string(Opcode)
@@ -99,8 +103,18 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
 
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to expand: "; MI.dump();
+      dbgs() << "** MBB: "; MBB.dump();
+      dbgs() << "****** \n";
+    });
   switch (MI.getDesc().getOpcode()) {
   default:
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "Don't know how to expand.\n";
+      });
     return false;
   case DPU::RETi:
     BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::JUMPr)).addReg(DPU::R23);
@@ -128,9 +142,22 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
         .addImm(DPUAsmCondition::Condition::False);
     break;
   }
+
+  // case DPU::Jcci:
+  // case DPU::TmpJcci:
+  // case DPU::Jcc: {
+  //   // don't expand yet as they are used for late optimization
+  //   // these late optimization should be reworked and placed earlier in the pipeline
+  //   // so we could treat more cases of optim
+  //   break;
+  // }
   }
 
   MBB.erase(MI);
+
+  LLVM_DEBUG({
+      dbgs() << "** MBB: "; MBB.dump();
+    });
   return true;
 }
 
@@ -443,6 +470,22 @@ unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
 void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock *TBB, DebugLoc DL,
                                           ArrayRef<MachineOperand> Cond) const {
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "DPU::sub_32bit " << DPU::sub_32bit << "\n";
+  //     dbgs() << "DPU::sub_32bit_hi " << DPU::sub_32bit_hi << "\n";
+  //     for (unsigned i = 0; i < Cond.size(); ++i) {
+  // 	dbgs() << "Cond[" << i << "] = "; Cond[i].dump();
+  // 	if (Cond[i].isReg()) {
+  // 	  dbgs() << "is Reg\n";
+  // 	  dbgs() << Cond[i].getReg() << "\n";
+  // 	  dbgs() << Cond[i].getSubReg() << "\n";
+
+  // 	  dbgs() << "contains " << DPU::GP64_REGRegClass.contains(Cond[i].getReg()) << "\n";
+  // 	}
+  //     }
+  //   });
+
   MachineInstrBuilder MIB;
 
   unsigned Opc = Cond[0].getImm();
@@ -450,12 +493,19 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
   MIB = BuildMI(&MBB, DL, get(Opc));
 
   for (unsigned i = 1; i < Cond.size(); ++i) {
-    if (Cond[i].isReg())
-      MIB.addReg(Cond[i].getReg());
-    else if (Cond[i].isImm())
+    if (Cond[i].isReg()) {
+      // The register in question could potentially be a
+      // subreg hi/lo of a 64-bit vreg
+      if (unsigned SubReg = Cond[i].getSubReg()) {
+	MIB.addReg(Cond[i].getReg(), 0, SubReg);
+      } else {
+	MIB.addReg(Cond[i].getReg());
+      }
+    } else if (Cond[i].isImm()) {
       MIB.addImm(Cond[i].getImm());
-    else
+    } else {
       assert(false && "Cannot copy operand");
+    }
   }
 
   MIB.addMBB(TBB);
@@ -493,3 +543,34 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
     *BytesAdded = nrOfInsertedMachineInstr;
   return nrOfInsertedMachineInstr;
 }
+
+// Keeps instructions tagged with "MySpecialMetadata" (attached e.g. by
+// EmitLsl64RegisterWithCustomInserter) in place; others use the generic policy.
+bool DPUInstrInfo::shouldSink(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case DPU::CLZ_Urr:
+  case DPU::LSLXrrr:
+  case DPU::LSRXrrr:
+  case DPU::ANDrri:
+  case DPU::JEQrii:
+  case DPU::JNEQrii: {
+    // MDString::get uniques per context: look the marker up once and compare
+    // pointers. Operand-less nodes are skipped, getOperand(0) would assert.
+    const Metadata *Marker = MDString::get(
+        MI.getMF()->getFunction().getContext(), "MySpecialMetadata");
+    for (const MachineOperand &Op : MI.operands()) {
+      if (!Op.isMetadata() || Op.getMetadata()->getNumOperands() == 0)
+        continue;
+      if (Op.getMetadata()->getOperand(0).get() == Marker) {
+        LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Don't sink because I have MySpecialMetadata.\n");
+        return false; // Do not sink this instruction
+      }
+    }
+    LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " I'm potentially something used in arith+cond+jump from EmitInstrWithCustomInserter but I allow sink because I don't have MySpecialMetadata.\n");
+    break;
+  }
+  }
+  return TargetInstrInfo::shouldSink(MI);
+}
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.h b/llvm/lib/Target/DPU/DPUInstrInfo.h
index e9c2a3b920a05..14c199c9160e8 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.h
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.h
@@ -65,6 +65,8 @@ class DPUInstrInfo : public DPUGenInstrInfo {
 
   void buildConditionalBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                               DebugLoc DL, ArrayRef<MachineOperand> Cond) const;
+
+  bool shouldSink(const MachineInstr &MI) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/DPU/DPUMCInstLower.cpp b/llvm/lib/Target/DPU/DPUMCInstLower.cpp
index 311c64f86b142..954f3834cc138 100644
--- a/llvm/lib/Target/DPU/DPUMCInstLower.cpp
+++ b/llvm/lib/Target/DPU/DPUMCInstLower.cpp
@@ -102,6 +102,7 @@ void DPUMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
       break;
 
     case MachineOperand::MO_RegisterMask:
+    case MachineOperand::MO_Metadata:
       continue;
 
     case MachineOperand::MO_GlobalAddress:
diff --git a/llvm/lib/Target/DPU/DPUMacroFusion.cpp b/llvm/lib/Target/DPU/DPUMacroFusion.cpp
index a606c017d7cfb..43655fc012e50 100644
--- a/llvm/lib/Target/DPU/DPUMacroFusion.cpp
+++ b/llvm/lib/Target/DPU/DPUMacroFusion.cpp
@@ -28,14 +28,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   // We are mainly interested in merging a simple operation with a simple
   // conditional/unconditional branch
   LLVM_DEBUG({
-    dbgs() << "DPU/Merge: checking macro fusion:\n\t";
-    if (!FirstMI)
-      dbgs() << "<NONE>";
-    else
-      FirstMI->dump();
-    dbgs() << "\n\t";
-    SecondMI.dump();
-    dbgs() << "\n";
+    dbgs() << "DPU/Merge: checking macro fusion:\n";
+    if (!FirstMI) {
+      dbgs() << "\t<NONE>\n";
+    } else {
+      dbgs() << "\t"; FirstMI->dump();
+    }
+    dbgs() << "\t"; SecondMI.dump();
   });
 
   if (!FirstMI) {
@@ -51,14 +50,38 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   switch (secondOpc) {
   default:
     // todo probably more opportunities (Conditional branches...)
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "DPU/Merge: the two instructions cannot be fused\n";
+      });
     return false;
   case DPU::JUMPi:
   case DPU::TmpJcci:
     break;
+  case DPU::JNEQrii:
+  case DPU::JEQrii:
+    // Both instructions must carry the same register as operand 0.
+    if (!(FirstMI->getOperand(0).isReg() && SecondMI.getOperand(0).isReg() &&
+          (FirstMI->getOperand(0).getReg() ==
+           SecondMI.getOperand(0).getReg()))) {
+      LLVM_DEBUG({
+        dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+        dbgs() << "DPU/Merge: the two instructions cannot be fused\n";
+        // Print the operands themselves: getReg() asserts on non-registers.
+        dbgs() << "first reg " << FirstMI->getOperand(0) << "\n";
+        dbgs() << "second reg " << SecondMI.getOperand(0) << "\n";
+      });
+      return false;
+    }
+    break;
   case DPU::Jcci:
     if (!(FirstMI->getOperand(0).isReg() && SecondMI.getOperand(1).isReg() &&
-          (FirstMI->getOperand(0).getReg() ==
-           SecondMI.getOperand(1).getReg()))) {
+	  (FirstMI->getOperand(0).getReg() ==
+	   SecondMI.getOperand(1).getReg()))) {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "DPU/Merge: the two instructions cannot be fused\n";
+      });
       return false;
     }
     break;
@@ -68,7 +91,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   default:
     // todo probably more opportunities (Operations with specific immediate
     // operands, call...)
-    LLVM_DEBUG(dbgs() << "DPU/Merge: the two instructions cannot be fused\n");
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "DPU/Merge: the two instructions cannot be fused\n";
+      });
     return false;
   case DPU::ADDrri:
   case DPU::ADDrrr:
@@ -92,6 +118,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   case DPU::RORrrr:
   case DPU::RORrri:
   case DPU::CLZrr:
+  case DPU::CLZ_Urr:
   case DPU::CAOrr:
   case DPU::MUL_UL_ULrrr:
   case DPU::MUL_SL_ULrrr:
diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index 135fd3b7c5c40..4274304345aa0 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -6,6 +6,9 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+
+// TODO: expand to more situations of arith+comp+branch
+
 #include "DPUTargetMachine.h"
 #include <llvm/CodeGen/MachineInstrBuilder.h>
 #include <set>
@@ -202,6 +205,91 @@ getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I,
   return &*I;
 }
 
+
+// Fuses a block ending with
+//     CLZ_Urr rd, rs ; JNEQrii rd', 32, target      (rd' aliasing rd)
+// into a single CLZ_Urrci rd, rs, NotMaximum, target appended to the block.
+// Only the last two non-debug instructions of MBB are considered.
+// Returns true when the block was modified.
+static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
+                                       const DPUInstrInfo &InstrInfo) {
+  MachineBasicBlock::reverse_iterator I = MBB->rbegin(), REnd = MBB->rend();
+
+  MachineInstr *LastInst = getLastNonDebugInstrFrom(I, REnd);
+  if (LastInst == nullptr) {
+    LLVM_DEBUG(dbgs() << "KO: I == REnd\n");
+    return false;
+  }
+  I++;
+  MachineInstr *SecondLastInst = getLastNonDebugInstrFrom(I, REnd);
+  if (SecondLastInst == nullptr) {
+    LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n");
+    return false;
+  }
+
+  if (SecondLastInst->getOpcode() != DPU::CLZ_Urr) {
+    LLVM_DEBUG(dbgs() << "KO: Unknown SecondLastOpc\n");
+    return false;
+  }
+
+  LLVM_DEBUG({
+    dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    dbgs() << "study CLZ_Urr to CLZ_Urrci\n";
+    SecondLastInst->dump();
+    LastInst->dump();
+  });
+
+  // Only inspect the jump operands once the opcode is known: calling getReg()
+  // or getImm() on an operand of another kind asserts.
+  if (LastInst->getOpcode() != DPU::JNEQrii ||
+      LastInst->getOperand(1).getImm() != 32) {
+    LLVM_DEBUG({dbgs() << "can't optimize\n";});
+    return false;
+  }
+
+  // The jump must test the register written by the CLZ, or an alias of it.
+  bool do_def_reg_alias = false;
+  const TargetRegisterInfo *TRI =
+      MBB->getParent()->getSubtarget().getRegisterInfo();
+  for (MCRegAliasIterator Alias(SecondLastInst->getOperand(0).getReg(), TRI,
+                                true);
+       Alias.isValid(); ++Alias) {
+    Register AliasReg = *Alias;
+    if (LastInst->getOperand(0).getReg() == AliasReg) {
+      do_def_reg_alias = true;
+      break;
+    }
+  }
+  if (!do_def_reg_alias) {
+    LLVM_DEBUG({dbgs() << "can't optimize\n";});
+    return false;
+  }
+
+  LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+
+  // BuildMI on the block pointer appends at the end of MBB; the two
+  // instructions being replaced are erased afterwards.
+  MachineInstrBuilder ComboInst =
+      BuildMI(MBB, SecondLastInst->getDebugLoc(),
+              InstrInfo.get(DPU::CLZ_Urrci),
+              SecondLastInst->getOperand(0).getReg())
+          .add(SecondLastInst->getOperand(1))
+          .addImm(DPUAsmCondition::Condition::NotMaximum)
+          .addMBB(LastInst->getOperand(2).getMBB());
+
+  LLVM_DEBUG({
+    dbgs() << "OK\n";
+    dbgs() << "del "; SecondLastInst->dump();
+    dbgs() << "del "; LastInst->dump();
+    dbgs() << "fused to ";
+    dbgs() << "add "; ComboInst->dump();
+  });
+  LastInst->eraseFromParent();
+  SecondLastInst->eraseFromParent();
+  LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+  return true;
+}
+
 static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
                                         const DPUInstrInfo &InstrInfo) {
   MachineBasicBlock::reverse_iterator I = MBB->rbegin(), REnd = MBB->rend();
@@ -653,6 +741,13 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
     LLVM_DEBUG(dbgs() << "KO: Unknown LastOpc\n");
     return false;
   case DPU::JUMPi: {
+    // FIXME: this case is currently wrong.
+    // Morphing the branch from unconditional to conditional artificially
+    // creates a fall-through edge that is not declared in the CFG,
+    // so the resulting code is bugged.
+    // return false;
+    // 
+    
     if (!ImmCanBeEncodedOn8Bits) {
       LLVM_DEBUG(
           dbgs() << "KO: LastOpc == DPU::JUMPi && !ImmCanBeEncodedOn8Bits\n");
@@ -822,12 +917,14 @@ bool DPUMergeComboInstrPass::runOnMachineFunction(MachineFunction &MF) {
     MachineBasicBlock *MBB = &MFI;
 
     LLVM_DEBUG(MBB->dump());
-    bool local_change = mergeComboInstructionsInMBB(MBB, InstrInfo);
+
+    bool local_change = mergeBranchArithmeticInMBB(MBB, InstrInfo);
+    local_change |= mergeComboInstructionsInMBB(MBB, InstrInfo);
     if (local_change) {
       LLVM_DEBUG({
-        dbgs() << "\nchanged to:\n";
-        MBB->dump();
-      });
+	  dbgs() << "\nchanged to:\n";
+	  MBB->dump();
+	});
       changeMade = true;
     }
   }
diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.td b/llvm/lib/Target/DPU/DPURegisterInfo.td
index cbd6215ca8d88..06c44a9aaeac2 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.td
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.td
@@ -141,10 +141,17 @@ def ID8:   DPUReg<31, "id8">;
 // not do anything with them.
 def GP_REG    : RegisterClass<"DPU", [i32], 32, (add (sequence "R%u", 0, 23)
 // , (sequence "MAJ_R%u", 0, 23)
-, ZERO
-, ONE
-// ,LNEG <-- there is an issue with this one:        lsr_add r2, lneg, r2, 3  seems to be understood as sats r2, r2 ... encoding problem???
-// ,MNEG <-- this one as well
+// , ZERO
+// , ONE
+// ,LNEG //<-- there is an issue with this one:        lsr_add r2, lneg, r2, 3  seems to be understood as sats r2, r2 ... encoding problem???
+// ,MNEG //<-- this one as well
+// In fact, they cause more trouble now.
+// They are probably not well specified elsewhere,
+// or encoding/decoding is not properly tested with register constraints.
+// This needs to be checked,
+// because register coalescing could be really interesting:
+// move $d/r 0/1/-1 instructions could potentially be removed.
+// Will check that later; first: correctness.
 )>;
 
 def CONST_REG : RegisterClass<"DPU", [i32], 32, (add ZERO, ONE, LNEG, MNEG
diff --git a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
index 4168c5af3b937..4e5313f12050c 100644
--- a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
@@ -7,6 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+// Possibly move this earlier in the pipeline:
+//   all simple arithmetic could be moved into EmitInstrWithCustomInserter, i.e. before regalloc and the other optimizations
+
+// TODO: expand test cases for splicing
+//       need_splice = 0/1  x  canFallThrough = 0/1
+
 #include "DPU.h"
 #include "DPUInstrInfo.h"
 #include "DPUSubtarget.h"
@@ -119,6 +125,13 @@ static void resolve64BitImmediateAluInstruction(
     MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter,
     const DPUInstrInfo &InstrInfo, unsigned int LsbOpcode,
     unsigned int MsbOpcode) {
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+    });
+
   MachineFunction *MF = MBB->getParent();
   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
 
@@ -143,12 +156,23 @@ static void resolve64BitImmediateAluInstruction(
           MSBDestReg)
       .addReg(MSBDOp1Reg)
       .addImm(MSBOp2Imm);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
 }
 
 static void resolve64BitRegisterAluInstruction(
     MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter,
     const DPUInstrInfo &InstrInfo, unsigned int LsbOpcode,
     unsigned int MsbOpcode) {
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+    });
   MachineFunction *MF = MBB->getParent();
   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
 
@@ -173,6 +197,11 @@ static void resolve64BitRegisterAluInstruction(
           MSBDestReg)
       .addReg(MSBDOp1Reg)
       .addReg(MSBOp2Reg);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
 }
 
 static void resolveJeq64(MachineBasicBlock *MBB,
@@ -185,10 +214,23 @@ static void resolveJeq64(MachineBasicBlock *MBB,
   bool need_splice = std::next(MBBIter) != MBB->end();
 
   MachineBasicBlock *FTMBB = MBB->getFallThrough();
+  MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB();
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** canFallThrough: " << MBB->canFallThrough() << "\n";
+      if (MBB->canFallThrough()) {
+	dbgs() << "** FTMBB: "; FTMBB->dump();
+      }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
 
   MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *endMBB;
-  MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB();
 
   F->insert(I, trueMBB);
   if (need_splice) {
@@ -199,16 +241,17 @@ static void resolveJeq64(MachineBasicBlock *MBB,
     endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
     endMBB->transferSuccessorsAndUpdatePHIs(MBB);
     MBB->addSuccessor(endMBB);
+    endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
   } else {
     endMBB = FTMBB;
-    MBB->removeSuccessor(JumpMBB);
+    MBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
   }
 
   // Next, add the true and fallthrough blocks as its successors.
   MBB->addSuccessor(trueMBB);
   trueMBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(endMBB);
-
+  
   unsigned int Op1Reg = MBBIter->getOperand(1).getReg();
   unsigned int Op2Reg = MBBIter->getOperand(2).getReg();
 
@@ -231,6 +274,17 @@ static void resolveJeq64(MachineBasicBlock *MBB,
 
   trueMBB->addLiveIn(MsbOp1Reg);
   trueMBB->addLiveIn(MsbOp2Reg);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** trueMBB: "; trueMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      // getFallThrough() yields nullptr when MBB cannot fall through
+      if (FTMBB) { dbgs() << "** FTMBB: "; FTMBB->dump(); }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump(); dbgs() << "****** \n";
+    });
 }
 
 static void resolveJneq64(MachineBasicBlock *MBB,
@@ -239,19 +293,44 @@ static void resolveJneq64(MachineBasicBlock *MBB,
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   MachineFunction::iterator I = ++MBB->getIterator();
   MachineFunction *F = MBB->getParent();
+
+  bool need_splice = std::next(MBBIter) != MBB->end();
+  bool canFallThrough = MBB->canFallThrough();
+  MachineBasicBlock *FTMBB = MBB->getFallThrough();
+  MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB();
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** canFallThrough: " << canFallThrough << "\n";
+      if (canFallThrough) {
+	dbgs() << "** FTMBB: "; FTMBB->dump();
+      }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
+
   MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *endMBB;
   F->insert(I, trueMBB);
-  F->insert(I, endMBB);
 
-  // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(MBB);
-  // Next, add the true and fallthrough blocks as its successors.
-  auto JumpMBB = MBBIter->getOperand(3).getMBB();
+  if (need_splice) {
+    endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(I, endMBB);
+    // Update machine-CFG edges by transferring all successors of the current
+    // block to the new block which will contain the Phi node for the select.
+    endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
+    endMBB->transferSuccessorsAndUpdatePHIs(MBB);
+    MBB->addSuccessor(JumpMBB);
+    endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
+  } else {
+    endMBB = FTMBB;
+    MBB->removeSuccessor(endMBB, /* NormalizeSuccProbs = */ true);
+  }
+
   MBB->addSuccessor(trueMBB);
-  MBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(endMBB);
 
@@ -274,15 +353,35 @@ static void resolveJneq64(MachineBasicBlock *MBB,
       .addReg(MsbOp1Reg)
       .addReg(MsbOp2Reg)
       .addMBB(JumpMBB);
+  
   trueMBB->addLiveIn(MsbOp1Reg);
   trueMBB->addLiveIn(MsbOp2Reg);
-  endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** trueMBB: "; trueMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      if (canFallThrough) {
+	dbgs() << "** FTMBB: "; FTMBB->dump();
+      }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
 }
 
 static void resolveJcc64AsSub64(MachineBasicBlock *MBB,
                                 MachineBasicBlock::iterator MBBIter,
                                 const DPUInstrInfo &InstrInfo,
                                 DPUAsmCondition::Condition Cond) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+  
   unsigned int Op1Reg = MBBIter->getOperand(1).getReg();
   unsigned int Op2Reg = MBBIter->getOperand(2).getReg();
   auto JumpMBB = MBBIter->getOperand(3).getMBB();
@@ -304,11 +403,20 @@ static void resolveJcc64AsSub64(MachineBasicBlock *MBB,
       .addReg(MsbOp2Reg)
       .addImm(Cond)
       .addMBB(JumpMBB);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
 }
 
 static void resolveJcc64(MachineBasicBlock *MBB,
                          MachineBasicBlock::iterator MBBIter,
                          const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+
   switch (MBBIter->getOperand(0).getImm()) {
   default:
     llvm_unreachable("invalid condition");
@@ -363,6 +471,138 @@ static void resolveJcc64(MachineBasicBlock *MBB,
   }
 }
 
+// Expands a 64-bit immediate move (used for DPU::MOVE64ri) into two MOVEri.
+static void resolveMOVE64rr(MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+    dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    dbgs() << "instruction to replace: "; MBBIter->dump();
+    dbgs() << "** MBB: "; MBB->dump();
+    dbgs() << "****** \n";
+  });
+
+  const TargetRegisterInfo *RegInfo =
+      MBB->getParent()->getSubtarget().getRegisterInfo();
+  const unsigned int Dest64 = MBBIter->getOperand(0).getReg();
+  const int64_t Imm64 = MBBIter->getOperand(1).getImm();
+
+  // Low half: sub_32bit receives bits [31:0] of the immediate.
+  const unsigned int LowDest = RegInfo->getSubReg(Dest64, DPU::sub_32bit);
+  const int64_t LowImm = Imm64 & 0xFFFFFFFFl;
+  BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri),
+          LowDest)
+      .addImm(LowImm);
+  // High half: sub_32bit_hi receives bits [63:32] of the immediate.
+  const unsigned int HighDest = RegInfo->getSubReg(Dest64, DPU::sub_32bit_hi);
+  const int64_t HighImm = (Imm64 >> 32) & 0xFFFFFFFFl;
+  BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri),
+          HighDest)
+      .addImm(HighImm);
+
+  LLVM_DEBUG({
+    dbgs() << "** instruction replaced, but still need removal\n";
+    dbgs() << "** MBB: "; MBB->dump();
+  });
+}
+
+// Expands SET64cc into SUBzrr on the low halves, then SUBCrrrc on the high ones.
+static void resolveSET64cc(MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+    dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    dbgs() << "instruction to replace: "; MBBIter->dump();
+    dbgs() << "** MBB: "; MBB->dump();
+    dbgs() << "****** \n";
+  });
+
+  const TargetRegisterInfo *RegInfo =
+      MBB->getParent()->getSubtarget().getRegisterInfo();
+  // Operands read below: 0 = destination, 1 = condition, 2 and 3 = sources.
+  const unsigned int Dest = MBBIter->getOperand(0).getReg();
+  const auto RequestedCond = static_cast<DPUAsmCondition::Condition>(
+      MBBIter->getOperand(1).getImm());
+  const unsigned int Lhs64 = MBBIter->getOperand(2).getReg();
+  const unsigned int Rhs64 = MBBIter->getOperand(3).getReg();
+  const DPUAsmCondition::Condition SetCondition =
+      findSelect64SetConditionFor(RequestedCond);
+
+  const unsigned int LhsLow = RegInfo->getSubReg(Lhs64, DPU::sub_32bit);
+  const unsigned int LhsHigh = RegInfo->getSubReg(Lhs64, DPU::sub_32bit_hi);
+  const unsigned int RhsLow = RegInfo->getSubReg(Rhs64, DPU::sub_32bit);
+  const unsigned int RhsHigh = RegInfo->getSubReg(Rhs64, DPU::sub_32bit_hi);
+
+  // Low halves; the first operand is the ZERO register.
+  BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::SUBzrr))
+      .addReg(DPU::ZERO)
+      .addReg(LhsLow)
+      .addReg(RhsLow);
+  // High halves through SUBCrrrc, defining Dest, SetCondition as immediate.
+  BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(),
+          InstrInfo.get(DPU::SUBCrrrc), Dest)
+      .addReg(LhsHigh)
+      .addReg(RhsHigh)
+      .addImm(SetCondition);
+
+  LLVM_DEBUG({
+    dbgs() << "** instruction replaced, but still need removal\n";
+    dbgs() << "** MBB: "; MBB->dump();
+  });
+}
+
+// Replaces Jcc by the opcode findJumpOpcodeForCondition returns (flag false).
+static void resolveJcc(MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+    dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    dbgs() << "instruction to replace: "; MBBIter->dump();
+    dbgs() << "** MBB: "; MBB->dump();
+    dbgs() << "****** \n";
+  });
+
+  // Operand 0 is the condition; operands 1..3 are copied over unchanged.
+  const unsigned int JumpOpc =
+      findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), false);
+  MachineInstrBuilder Jump =
+      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(JumpOpc));
+  for (unsigned int Idx = 1; Idx <= 3; ++Idx)
+    Jump.add(MBBIter->getOperand(Idx));
+  LLVM_DEBUG({
+    dbgs() << "** instruction replaced, but still need removal\n";
+    dbgs() << "** MBB: "; MBB->dump();
+  });
+}
+
+// Replaces Jcci/TmpJcci by the opcode findJumpOpcodeForCondition returns (true).
+static void resolveJcci(MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+    dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    dbgs() << "instruction to replace: "; MBBIter->dump();
+    dbgs() << "** MBB: "; MBB->dump();
+    dbgs() << "****** \n";
+  });
+
+  const unsigned int JumpOpc =
+      findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), true);
+  const MachineInstrBuilder &Jump =
+      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(JumpOpc));
+  Jump.add(MBBIter->getOperand(1));
+  Jump.add(MBBIter->getOperand(2));
+
+  // Walk operands 3.. from the end and copy the first basic-block one met.
+  for (unsigned int Idx = MBBIter->getNumOperands(); Idx-- > 3;) {
+    const MachineOperand &Candidate = MBBIter->getOperand(Idx);
+    if (!Candidate.isMBB())
+      continue;
+    Jump.add(Candidate);
+    break;
+  }
+  LLVM_DEBUG({
+    dbgs() << "** instruction replaced, but still need removal\n";
+    dbgs() << "** MBB: "; MBB->dump();
+  });
+}
+
 static bool resolveMacroInstructionsInMBB(MachineBasicBlock *MBB,
                                           const DPUInstrInfo &InstrInfo) {
   bool Modified = false;
@@ -375,88 +615,28 @@ static bool resolveMacroInstructionsInMBB(MachineBasicBlock *MBB,
     default:
       InstrModified = false;
       break;
-    case DPU::Jcc: {
-      unsigned int OpCode =
-          findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), false);
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode))
-          .add(MBBIter->getOperand(1))
-          .add(MBBIter->getOperand(2))
-          .add(MBBIter->getOperand(3));
+
+    case DPU::Jcc:
+      resolveJcc(MBB, MBBIter, InstrInfo);
       break;
-    }
-    case DPU::TmpJcci:
-    case DPU::Jcci: {
-      unsigned int OpCode =
-          findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), true);
-      const MachineInstrBuilder &MIB =
-          BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode));
-      MIB.add(MBBIter->getOperand(1)).add(MBBIter->getOperand(2));
-
-      for (unsigned int i = MBBIter->getNumOperands() - 1; i >= 3; --i) {
-        MachineOperand &Operand = MBBIter->getOperand(i);
-
-        if (Operand.isMBB()) {
-          MIB.add(Operand);
-          break;
-        }
-      }
 
+    case DPU::TmpJcci:
+    case DPU::Jcci:
+      resolveJcci(MBB, MBBIter, InstrInfo);
       break;
-    }
+
     case DPU::Jcc64:
       resolveJcc64(MBB, MBBIter, InstrInfo);
       break;
-    case DPU::SET64cc: {
-      MachineFunction *MF = MBB->getParent();
-      const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
-
-      unsigned int DestReg = MBBIter->getOperand(0).getReg();
-      auto ImmCond = static_cast<DPUAsmCondition::Condition>(
-          MBBIter->getOperand(1).getImm());
-      unsigned int Op1Reg = MBBIter->getOperand(2).getReg();
-      unsigned int Op2Reg = MBBIter->getOperand(3).getReg();
-
-      DPUAsmCondition::Condition SetCondition =
-          findSelect64SetConditionFor(ImmCond);
-
-      unsigned int LSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit);
-      unsigned int MSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit_hi);
-
-      unsigned int LSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit);
-      unsigned int MSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit_hi);
-
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::SUBzrr))
-          .addReg(DPU::ZERO)
-          .addReg(LSBDOp1Reg)
-          .addReg(LSBOp2Reg);
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(),
-              InstrInfo.get(DPU::SUBCrrrc), DestReg)
-          .addReg(MSBDOp1Reg)
-          .addReg(MSBOp2Reg)
-          .addImm(SetCondition);
 
+    case DPU::SET64cc:
+      resolveSET64cc(MBB, MBBIter, InstrInfo);
       break;
-    }
-    case DPU::MOVE64ri: {
-      MachineFunction *MF = MBB->getParent();
-      const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
-
-      unsigned int DestReg = MBBIter->getOperand(0).getReg();
-      int64_t Op1Imm = MBBIter->getOperand(1).getImm();
-
-      int64_t LSBOp1Imm = Op1Imm & 0xFFFFFFFFl;
-      int64_t MSBOp1Imm = (Op1Imm >> 32) & 0xFFFFFFFFl;
-      unsigned int LSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit);
-      unsigned int MSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit_hi);
-
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri),
-              LSBDestReg)
-          .addImm(LSBOp1Imm);
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri),
-              MSBDestReg)
-          .addImm(MSBOp1Imm);
+
+    case DPU::MOVE64ri:
+      resolveMOVE64rr(MBB, MBBIter, InstrInfo);
       break;
-    }
+
     case DPU::ADD64rr:
       resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrrr,
                                          DPU::ADDCrrr);
@@ -497,8 +677,9 @@ static bool resolveMacroInstructionsInMBB(MachineBasicBlock *MBB,
 
     if (InstrModified) {
       MBB->erase(MBBIter++);
-      Modified = true;
-    } else {
+      Modified |= true;
+    }
+    else {
       ++MBBIter;
     }
   }
@@ -516,15 +697,7 @@ bool DPUResolveMacroInstrPass::runOnMachineFunction(MachineFunction &MF) {
 
   for (auto &MFI : MF) {
     MachineBasicBlock *MBB = &MFI;
-    LLVM_DEBUG({MBB->dump();});
-    bool local_change = resolveMacroInstructionsInMBB(MBB, InstrInfo);
-    if (local_change) {
-      LLVM_DEBUG({
-	  dbgs() << "change to:\n";
-	  MBB->dump();
-	});
-      changeMade = true;
-    }
+    changeMade |= resolveMacroInstructionsInMBB(MBB, InstrInfo);
   }
 
   LLVM_DEBUG(dbgs() << "********** DPU/ResolveMacroInstrPass: " << MF.getName()
diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
index 701d050338ba3..32e68ef5cd488 100644
--- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
@@ -96,6 +96,7 @@ DPUTargetLowering::DPUTargetLowering(const TargetMachine &TM, DPUSubtarget &STI)
 
   // Set up the register classes.
   addRegisterClass(MVT::i32, &DPU::GP_REGRegClass);
+  // addRegisterClass(MVT::i32, &DPU::CONST_REGRegClass);
   addRegisterClass(MVT::i64, &DPU::GP64_REGRegClass);
 
   // Compute derived properties from the register classes
@@ -2060,6 +2061,7 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   unsigned int LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned int LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
+  // should be checked
   BuildMI(BB, dl, TII.get(MulLL), LLDest)
       .addReg(Op1)
       .addReg(Op2)
@@ -2372,6 +2374,13 @@ EmitMramLoadDoubleWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
 
 static MachineBasicBlock *
 EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
+
   /*
       What we want to generate (with dc.h != rb in that example):
       lslx       __R0, da.l, rb, ?sh32 @+4
@@ -2405,9 +2414,10 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   MachineRegisterInfo &RI = F->getRegInfo();
   unsigned LsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned MsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-
+  // unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  // unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned ShiftReg_check = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
   unsigned BigShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned BigShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
@@ -2425,20 +2435,49 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned Undef2Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
-  BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg)
-      .addReg(Op1Reg, 0, DPU::sub_32bit);
-
-  BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
-      .addReg(LsbOp1Reg)
-      .addReg(ShiftReg)
-      .addImm(DPUAsmCondition::Condition::Shift32)
-      .addMBB(bigShiftMBB);
-
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg)
-      .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
+  // BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg)
+  //     .addReg(Op1Reg, 0, DPU::sub_32bit);
+
+  // unsigned DummyReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
+  /// faulty
+  // BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
+  //     .addReg(LsbOp1Reg)
+  //     .addReg(ShiftReg)
+  //     .addImm(DPUAsmCondition::Condition::Shift32)
+  //     .addMBB(bigShiftMBB);
+
+  /// Works, but
+  // it could noticeably increase the code size,
+  //   because MachineSinking may sink the LSLXrrr to other places
+  //   and we would then be unable to merge these three instructions.
+  //   With shouldSink returning false for it,
+  //   they stay adjacent on the few examples tried,
+  //   but that may inhibit other optimizations in other code
+  //   that uses it genuinely.
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  BuildMI(BB, dl, TII.get(DPU::LSLXrrr), LsbToMsbPartReg)
+    // .addReg(LsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit)
+    .addReg(ShiftReg)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
+    .addReg(ShiftReg)
+    .addImm(0x20)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JEQrii))
+    .addReg(ShiftReg_check)
+    .addImm(0x20)
+    .addMBB(bigShiftMBB)
+    .addMetadata(N);
+  
+  // BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg)
+      // .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), MsbToMsbPartReg)
-      .addReg(MsbOp1Reg)
+      // .addReg(MsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
       .addReg(ShiftReg);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftMsbReg)
@@ -2446,7 +2485,8 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
       .addReg(LsbToMsbPartReg);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), SmallShiftLsbReg)
-      .addReg(LsbOp1Reg)
+      // .addReg(LsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit)
       .addReg(ShiftReg);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), Undef2Reg);
@@ -2465,7 +2505,8 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   BuildMI(smallShiftMBB, dl, TII.get(DPU::JUMPi)).addMBB(endMBB);
 
   BuildMI(bigShiftMBB, dl, TII.get(DPU::LSLrrr), BigShiftMsbReg)
-      .addReg(LsbOp1Reg)
+      // .addReg(LsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit)
       .addReg(ShiftReg);
 
   BuildMI(bigShiftMBB, dl, TII.get(DPU::MOVEri), BigShiftLsbReg).addImm(0);
@@ -2494,6 +2535,16 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
       .addMBB(smallShiftMBB);
 
   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** smallShiftMBB: "; smallShiftMBB->dump();
+      dbgs() << "** bigShiftMBB: "; bigShiftMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      dbgs() << "****** \n";
+    });
   return endMBB;
 }
 
@@ -2617,6 +2668,13 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
 static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB, unsigned int shiftRight,
     unsigned int shiftRightExtended) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
+
   /*
       What we want to generate (with dc.l != rb in that example):
       lsrx    __R0, da.h, rb, ?sh32 @+4
@@ -2651,6 +2709,7 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
   unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned MsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned LsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned ShiftReg_check = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned SmallShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned SmallShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
@@ -2663,11 +2722,28 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
   BuildMI(BB, dl, TII.get(DPU::COPY), MsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
 
-  BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
-      .addReg(MsbOp1Reg)
-      .addReg(ShiftReg)
-      .addImm(DPUAsmCondition::Condition::Shift32)
-      .addMBB(bigShiftMBB);
+  // BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
+  //     .addReg(MsbOp1Reg)
+  //     .addReg(ShiftReg)
+  //     .addImm(DPUAsmCondition::Condition::Shift32)
+  //     .addMBB(bigShiftMBB);
+
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  BuildMI(BB, dl, TII.get(DPU::LSRXrrr), MsbToLsbPartReg)
+    .addReg(MsbOp1Reg)
+    .addReg(ShiftReg)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
+    .addReg(ShiftReg)
+    .addImm(0x20)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JEQrii))
+    .addReg(ShiftReg_check)
+    .addImm(0x20)
+    .addMBB(bigShiftMBB)
+    .addMetadata(N);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), LsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
@@ -2715,6 +2791,17 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
       .addMBB(smallShiftMBB);
 
   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** smallShiftMBB: "; smallShiftMBB->dump();
+      dbgs() << "** bigShiftMBB: "; bigShiftMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      dbgs() << "****** \n";
+    });
+ 
   return endMBB;
 }
 
@@ -2876,6 +2963,7 @@ EmitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   BuildMI(*BB, MI, dl, TII.get(lsN), Op1MsbShift)
       .addReg(Op1Msb)
       .addReg(ShiftReg);
+  // should be checked
   BuildMI(*BB, MI, dl, TII.get(lsNJump), Op1LsbShift)
       .addReg(Op1Lsb)
       .addReg(ShiftReg)
@@ -3062,6 +3150,12 @@ EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
 
 static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI,
                                                       MachineBasicBlock *BB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
   /*
       What we want to generate (with dc != da in that example):
       clz.u dc, da.h ?nmax @+3
@@ -3093,132 +3187,193 @@ static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI,
   MachineRegisterInfo &RI = F->getRegInfo();
   unsigned FastResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned SlowResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SlowResultPart1Reg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SlowResultPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
+  // unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  // unsigned SlowResultPart1Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  // unsigned SlowResultPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
-      .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
-      .addImm(DPUAsmCondition::Condition::NotMaximum)
-      .addMBB(endMBB);
+  unsigned SlowResultReg_step = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+
+  unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned LsbAddReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
+  // BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
+  //     .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
+  //     .addImm(DPUAsmCondition::Condition::NotMaximum)
+  //     .addMBB(endMBB);
+
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  BuildMI(BB, dl, TII.get(DPU::CLZ_Urr), FastResultReg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JNEQrii))
+    .addReg(FastResultReg, 0, DPU::sub_32bit)
+    .addImm(32)
+    .addMBB(endMBB)
+    .addMetadata(N);
 
   BuildMI(msbAreZerosMBB, dl, TII.get(DPU::CLZrr), LsbClzReg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), SlowResultPartReg)
-      .addReg(LsbClzReg)
-      .addImm(32);
+  // This
+  // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), SlowResultPartReg)
+  //     .addReg(LsbClzReg)
+  //     .addImm(32);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg)
-      .addReg(UndefReg)
-      .addReg(SlowResultPartReg)
-      .addImm(DPU::sub_32bit);
+  // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg)
+  //     .addReg(UndefReg)
+  //     .addReg(SlowResultPartReg)
+  //     .addImm(DPU::sub_32bit);
+
+  // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg)
+  //     .addReg(SlowResultPart1Reg)
+  //     .addReg(FastResultReg, 0, DPU::sub_32bit_hi)
+  //     .addImm(DPU::sub_32bit_hi);
 
+  // or
+  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), LsbAddReg)
+      .addReg(LsbClzReg)
+      .addImm(32);
+
+  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg_step)
+    .addReg(SlowResultReg_step, RegState::Undef)
+    .addReg(LsbAddReg)
+    .addImm(DPU::sub_32bit);
+  
   BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg)
-      .addReg(SlowResultPart1Reg)
+      .addReg(SlowResultReg_step)
       .addReg(FastResultReg, 0, DPU::sub_32bit_hi)
       .addImm(DPU::sub_32bit_hi);
-
+  
   BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
       .addReg(FastResultReg)
       .addMBB(BB)
       .addReg(SlowResultReg)
       .addMBB(msbAreZerosMBB);
 
+  
   MI.eraseFromParent(); // The pseudo instruction is gone now.
-  return endMBB;
-}
 
-static MachineBasicBlock *EmitSeqreadGet(MachineInstr &MI,
-                                         MachineBasicBlock *BB, bool IsIncCst) {
-  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
-  DebugLoc dl = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = ++BB->getIterator();
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *slowMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *fastMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, slowMBB);
-  F->insert(I, fastMBB);
-  // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  fastMBB->splice(fastMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  fastMBB->transferSuccessorsAndUpdatePHIs(BB);
-  // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(slowMBB);
-  BB->addSuccessor(fastMBB);
-  slowMBB->addSuccessor(fastMBB);
-
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int PtrInit = MI.getOperand(1).getReg();
-  unsigned int Reader = MI.getOperand(3).getReg();
-  unsigned int Cond = MI.getOperand(4).getImm();
-  unsigned int PageSize = MI.getOperand(5).getImm();
-
-  MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned int PtrIncremented = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-
-  if (IsIncCst) {
-    BuildMI(BB, dl, TII.get(DPU::ADDrrici), PtrIncremented)
-        .addReg(PtrInit)
-        .addImm(MI.getOperand(2).getImm())
-        .addImm(Cond)
-        .addMBB(fastMBB);
-  } else {
-    BuildMI(BB, dl, TII.get(DPU::ADDrrrci), PtrIncremented)
-        .addReg(PtrInit)
-        .addReg(MI.getOperand(2).getReg())
-        .addImm(Cond)
-        .addMBB(fastMBB);
-  }
-
-  unsigned int WramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int MramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int MramCacheUpdated =
-      RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int PtrUpdated = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  BuildMI(slowMBB, dl, TII.get(DPU::LWrri), MramCache).addReg(Reader).addImm(4);
-  BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), MramCacheUpdated)
-      .addReg(MramCache)
-      .addImm(PageSize);
-  BuildMI(slowMBB, dl, TII.get(DPU::SWrir))
-      .addReg(Reader)
-      .addImm(4)
-      .addReg(MramCacheUpdated);
-  BuildMI(slowMBB, dl, TII.get(DPU::LWrri), WramCache).addReg(Reader).addImm(0);
-  BuildMI(slowMBB, dl, TII.get(DPU::LDMArri))
-      .addReg(WramCache)
-      .addReg(MramCacheUpdated)
-      .addImm(FormatDMASize(PageSize * 2));
-  BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), PtrUpdated)
-      .addReg(PtrIncremented)
-      .addImm(-PageSize);
-
-  BuildMI(*fastMBB, fastMBB->begin(), dl, TII.get(TargetOpcode::PHI), Dest)
-      .addReg(PtrIncremented)
-      .addMBB(BB)
-      .addReg(PtrUpdated)
-      .addMBB(slowMBB);
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** msbAreZerosMBB: "; msbAreZerosMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      dbgs() << "****** \n";
+    });
 
-  MI.eraseFromParent(); // The pseudo instruction is gone now.
-  return fastMBB;
+  return endMBB;
 }
 
+// static MachineBasicBlock *EmitSeqreadGet(MachineInstr &MI,
+//                                          MachineBasicBlock *BB, bool IsIncCst) {
+//   LLVM_DEBUG({
+//       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+//       dbgs() << "instruction to replace: "; MI.dump();
+//       dbgs() << "IsIncCst: " << IsIncCst << "\n";
+//       dbgs() << "** BB: "; BB->dump();
+//       dbgs() << "****** \n";
+//     });
+    
+//   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+//   DebugLoc dl = MI.getDebugLoc();
+//   const BasicBlock *LLVM_BB = BB->getBasicBlock();
+//   MachineFunction::iterator I = ++BB->getIterator();
+//   MachineFunction *F = BB->getParent();
+//   MachineBasicBlock *slowMBB = F->CreateMachineBasicBlock(LLVM_BB);
+//   MachineBasicBlock *fastMBB = F->CreateMachineBasicBlock(LLVM_BB);
+//   F->insert(I, slowMBB);
+//   F->insert(I, fastMBB);
+//   // Update machine-CFG edges by transferring all successors of the current
+//   // block to the new block which will contain the Phi node for the select.
+//   fastMBB->splice(fastMBB->begin(), BB,
+//                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
+//   fastMBB->transferSuccessorsAndUpdatePHIs(BB);
+//   // Next, add the true and fallthrough blocks as its successors.
+//   BB->addSuccessor(slowMBB);
+//   BB->addSuccessor(fastMBB);
+//   slowMBB->addSuccessor(fastMBB);
+
+//   unsigned int Dest = MI.getOperand(0).getReg();
+//   unsigned int PtrInit = MI.getOperand(1).getReg();
+//   unsigned int Reader = MI.getOperand(3).getReg();
+//   unsigned int Cond = MI.getOperand(4).getImm();
+//   unsigned int PageSize = MI.getOperand(5).getImm();
+
+//   MachineRegisterInfo &RI = F->getRegInfo();
+//   unsigned int PtrIncremented = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+
+//   if (IsIncCst) {
+//     BuildMI(BB, dl, TII.get(DPU::ADDrrici), PtrIncremented)
+//         .addReg(PtrInit)
+//         .addImm(MI.getOperand(2).getImm())
+//         .addImm(Cond)
+//         .addMBB(fastMBB);
+//   } else {
+//     BuildMI(BB, dl, TII.get(DPU::ADDrrrci), PtrIncremented)
+//         .addReg(PtrInit)
+//         .addReg(MI.getOperand(2).getReg())
+//         .addImm(Cond)
+//         .addMBB(fastMBB);
+//   }
+
+//   unsigned int WramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+//   unsigned int MramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+//   unsigned int MramCacheUpdated =
+//       RI.createVirtualRegister(&DPU::GP_REGRegClass);
+//   unsigned int PtrUpdated = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+//   BuildMI(slowMBB, dl, TII.get(DPU::LWrri), MramCache).addReg(Reader).addImm(4);
+//   BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), MramCacheUpdated)
+//       .addReg(MramCache)
+//       .addImm(PageSize);
+//   BuildMI(slowMBB, dl, TII.get(DPU::SWrir))
+//       .addReg(Reader)
+//       .addImm(4)
+//       .addReg(MramCacheUpdated);
+//   BuildMI(slowMBB, dl, TII.get(DPU::LWrri), WramCache).addReg(Reader).addImm(0);
+//   BuildMI(slowMBB, dl, TII.get(DPU::LDMArri))
+//       .addReg(WramCache)
+//       .addReg(MramCacheUpdated)
+//       .addImm(FormatDMASize(PageSize * 2));
+//   BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), PtrUpdated)
+//       .addReg(PtrIncremented)
+//       .addImm(-PageSize);
+
+//   BuildMI(*fastMBB, fastMBB->begin(), dl, TII.get(TargetOpcode::PHI), Dest)
+//       .addReg(PtrIncremented)
+//       .addMBB(BB)
+//       .addReg(PtrUpdated)
+//       .addMBB(slowMBB);
+
+//   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+//   LLVM_DEBUG({
+//       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+//       dbgs() << "instruction replaced\n";
+//       dbgs() << "** BB: "; BB->dump();
+//       dbgs() << "** slowMBB: "; slowMBB->dump();
+//       dbgs() << "** fastMBB: "; fastMBB->dump();
+//       dbgs() << "****** \n";
+//     });
+  
+//   return fastMBB;
+// }
+
 MachineBasicBlock *
 DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
   switch (MI.getOpcode()) {
   default:
+    MI.print(errs());
     llvm_unreachable("Unexpected instr type to insert");
-  case DPU::SEQREAD_GET:
-    return EmitSeqreadGet(MI, BB, false);
-  case DPU::SEQREAD_GET_CST:
-    return EmitSeqreadGet(MI, BB, true);
+  // case DPU::SEQREAD_GET:
+  //   return EmitSeqreadGet(MI, BB, false);
+  // case DPU::SEQREAD_GET_CST:
+  //   return EmitSeqreadGet(MI, BB, true);
   case DPU::Mul16UUrr:
     return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci,
                                        DPU::MUL_UH_ULrrr, DPU::MUL_UH_ULrrr,

From 310f79012b28b47eb20964d39ebeea3dc7f91e08 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Sat, 20 Jul 2024 14:03:06 +0200
Subject: [PATCH 09/17] wip: optimize ls{l,r}x + and + jeq

---
 .../lib/Target/DPU/DPUMergeComboInstrPass.cpp | 56 ++++++++++++++++++-
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index 4274304345aa0..3372eb7f70b80 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -205,6 +205,15 @@ getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I,
   return &*I;
 }
 
+// True when MI has an MDNode operand whose first operand is the MDString "MySpecialMetadata".
+static bool do_have_special_metadata(MachineInstr *MI) {
+  for (const MachineOperand &Op : MI->operands()) {
+    if (!Op.isMetadata() || Op.getMetadata()->getNumOperands() == 0) continue; // no operand 0 to read
+    const auto *Tag = dyn_cast_or_null<MDString>(Op.getMetadata()->getOperand(0).get());
+    if (Tag && Tag->getString() == "MySpecialMetadata") return true; // compare text, no context lookup
+  }
+  return false;
+}
 
 static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
 				       const DPUInstrInfo &InstrInfo) {
@@ -227,6 +236,49 @@ static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
   LastOpc = LastInst->getOpcode();
   SecondLastOpc = SecondLastInst->getOpcode();
 
+  // attempt to merge lslx/lsrx and XX 32 jeq XX 32 instructions
+  // TODO: check if it's shift32 as well?
+  //       or maybe use other metadata?
+  //         but this is to be extra careful, or the next player in the game ... :)
+  if (LastOpc == DPU::JEQrii && do_have_special_metadata(LastInst)
+      && SecondLastOpc == DPU::ANDrri && do_have_special_metadata(SecondLastInst)) {
+    I++;
+    MachineInstr *ThirdLastInst = getLastNonDebugInstrFrom(I, REnd);
+    if (ThirdLastInst == NULL) {
+      LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n");
+      return false;
+    }
+    unsigned int ThirdLastOpc = ThirdLastInst->getOpcode();
+    if ((ThirdLastOpc == DPU::LSLXrrr || ThirdLastOpc == DPU::LSRXrrr)
+	&& do_have_special_metadata(ThirdLastInst)) {
+      LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+      unsigned int new_opcode = (ThirdLastOpc == DPU::LSLXrrr ?
+				 DPU::LSLXrrrci : DPU::LSRXrrrci);
+      MachineInstrBuilder ComboInst = BuildMI(MBB, ThirdLastInst->getDebugLoc(),
+					      InstrInfo.get(new_opcode),
+					      ThirdLastInst->getOperand(0).getReg());
+      ComboInst.add(ThirdLastInst->getOperand(1));
+      ComboInst.add(ThirdLastInst->getOperand(2));
+      ComboInst.addImm(DPUAsmCondition::Condition::Shift32);
+      ComboInst.addMBB(LastInst->getOperand(2).getMBB());
+
+      LLVM_DEBUG({
+	  dbgs() << "OK\n";
+	  dbgs() << "del "; ThirdLastInst->dump();
+	  dbgs() << "del "; SecondLastInst->dump();
+	  dbgs() << "del "; LastInst->dump();
+	  dbgs() << "fused to\n";
+	  dbgs() << "add "; ComboInst->dump();
+	});
+
+      LastInst->eraseFromParent();
+      SecondLastInst->eraseFromParent();
+      ThirdLastInst->eraseFromParent();
+      LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+      return true;
+    }
+  }
+
   switch (SecondLastOpc) {
   default:
     LLVM_DEBUG(dbgs() << "KO: Unknown SecondLastOpc\n");
@@ -271,7 +323,7 @@ static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
 	  dbgs() << "OK\n";
 	  dbgs() << "del "; SecondLastInst->dump();
 	  dbgs() << "del "; LastInst->dump();
-	  dbgs() << "fused to ";
+	  dbgs() << "fused to\n";
 	  dbgs() << "add "; ComboInst->dump();
 	});
       LastInst->eraseFromParent();
@@ -745,7 +797,7 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
     // we morph the branch from unconditional to conditional
     // by this, we modify the CFG by creating artificially a fall through which is not declared
     // so, it's bugged
-    // return false;
+    return false;
     // 
     
     if (!ImmCanBeEncodedOn8Bits) {

From df78969b78f54583b4d47a4b2e9140f4dc88cd51 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Sat, 20 Jul 2024 16:24:59 +0200
Subject: [PATCH 10/17] wip: don't lose metadata during analyzeBranch

---
 llvm/lib/Target/DPU/DPUInstrInfo.cpp          | 83 ++++++++++++++++++-
 .../lib/Target/DPU/DPUMergeComboInstrPass.cpp | 20 ++---
 2 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index cab73d689b44b..4a39b2551d181 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -301,6 +301,11 @@ bool DPUInstrInfo::reverseBranchCondition(
 static void
 fetchUnconditionalBranchInfo(MachineInstr *Inst,
                              unsigned &targetBasicBlockOperandIndex) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "Inst "; Inst->dump();
+    });
+
   switch (Inst->getOpcode()) {
   case DPU::JUMPi:
     targetBasicBlockOperandIndex = 0;
@@ -313,6 +318,14 @@ fetchUnconditionalBranchInfo(MachineInstr *Inst,
 static void fetchConditionalBranchInfo(MachineInstr *Inst,
                                        unsigned &targetBasicBlockOperandIndex,
                                        SmallVectorImpl<MachineOperand> &Cond) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "Inst "; Inst->dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
+  
   unsigned Opc = Inst->getOpcode();
   Cond.push_back(MachineOperand::CreateImm(Opc));
 
@@ -327,6 +340,20 @@ static void fetchConditionalBranchInfo(MachineInstr *Inst,
       Cond.push_back(operand);
     }
   }
+
+  for (const MachineOperand &Op : Inst->operands()) {
+    if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(Inst->getMF()->getFunction().getContext(), "MySpecialMetadata")) {
+      Cond.push_back(Op);
+    }
+  }
+  
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "Inst "; Inst->dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
 }
 
 static inline bool isAnalyzableBranch(MachineInstr *Inst) {
@@ -338,6 +365,15 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                  MachineBasicBlock *&FBB,
                                  SmallVectorImpl<MachineOperand> &Cond,
                                  bool AllowModify) const {
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
+  
   MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
 
   // Skip all the debug instructions.
@@ -393,6 +429,13 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     if (LastInst->isConditionalBranch()) {
       unsigned int TBBOpIdx;
       fetchConditionalBranchInfo(LastInst, TBBOpIdx, Cond);
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	  dbgs() << "MBB "; MBB.dump();
+	  for (unsigned i = 0; i < Cond.size(); ++i) {
+	    dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+	  }
+	});
       TBB = LastInst->getOperand(TBBOpIdx).getMBB();
       return false;
     }
@@ -435,7 +478,13 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     fetchConditionalBranchInfo(SecondLastInst, TBBOpIdx, Cond);
     TBB = SecondLastInst->getOperand(TBBOpIdx).getMBB();
     FBB = LastInst->getOperand(FTBBOpIdx).getMBB();
-
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "MBB "; MBB.dump();
+	for (unsigned i = 0; i < Cond.size(); ++i) {
+	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+	}
+      });
     return false;
   }
 
@@ -445,6 +494,10 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
 
 unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                     int *BytesRemoved) const {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+    });
   MachineBasicBlock::iterator I = MBB.end();
   unsigned Count = 0;
 
@@ -470,6 +523,14 @@ unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
 void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock *TBB, DebugLoc DL,
                                           ArrayRef<MachineOperand> Cond) const {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
+
   // LLVM_DEBUG({
   //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
   //     dbgs() << "DPU::sub_32bit " << DPU::sub_32bit << "\n";
@@ -503,12 +564,25 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
       }
     } else if (Cond[i].isImm()) {
       MIB.addImm(Cond[i].getImm());
+    } else if (Cond[i].isMetadata()) {
+      // MIB.addMetadata(Cond[i].getMetadata());
     } else {
       assert(false && "Cannot copy operand");
     }
   }
 
   MIB.addMBB(TBB);
+
+  // add back remaining metadata
+  for (unsigned i = 0; i < Cond.size(); ++i) {
+     if (Cond[i].isMetadata()) {
+      MIB.addMetadata(Cond[i].getMetadata());
+     }
+  }
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MIB "; MIB->dump();
+    });
 }
 
 unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
@@ -516,6 +590,13 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *FBB,
                                     ArrayRef<MachineOperand> Cond,
                                     const DebugLoc &DL, int *BytesAdded) const {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
   unsigned nrOfInsertedMachineInstr = 0;
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index 3372eb7f70b80..fecb6aa79e018 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -239,7 +239,12 @@ static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
   // attempt to merge lslx/lsrx and XX 32 jeq XX 32 instructions
   // TODO: check if it's shift32 as well?
   //       or maybe use other metadata?
-  //         but this is to be extra careful, or the next player in the game ... :)
+  //         but this is to be extra careful, or for the next player in the game ... :)
+  // though, here I apply only when with my metadata
+  //   but if I actually not test my metadata, maybe
+  //     and add JNEQrii, I could pop both
+  //     and why not tackle other possible optim that may have introduce this code
+  //        event from user maybe
   if (LastOpc == DPU::JEQrii && do_have_special_metadata(LastInst)
       && SecondLastOpc == DPU::ANDrri && do_have_special_metadata(SecondLastInst)) {
     I++;
@@ -304,14 +309,6 @@ static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
 	&& LastInst->getOperand(1).getImm() == 32
 	&& do_def_reg_alias
 	) {
-      // dbgs() << "yep we may optimize to \n";
-      // SecondLastInst->getOperand(0).dump();
-      // dbgs() << " = CLZ_Urrci\n";
-      // SecondLastInst->getOperand(1).dump();
-      // dbgs() << DPUAsmCondition::Condition::NotMaximum << "\n";
-      // dbgs() << LastInst->getOperand(2).getMBB()->getFullName() << "\n";
-      // LastInst->getOperand(2).getMBB()->dump();
-
       LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
       
       MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(DPU::CLZ_Urrci), SecondLastInst->getOperand(0).getReg())
@@ -330,12 +327,7 @@ static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
       SecondLastInst->eraseFromParent();
       LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
       return true;
-    } else {
-      LLVM_DEBUG({dbgs() << "can't optimize\n";});
-      return false;
     }
-
-    return false;
   }
   }
 

From 0af56bb1d21477eb9bbd37f8c12a2466376cf92a Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Sat, 20 Jul 2024 19:43:52 +0200
Subject: [PATCH 11/17] wip: do mul16 and tailored logic to metadata, will
 rework later to be generic and handle naturally all cases

---
 .../lib/Target/DPU/DPUMergeComboInstrPass.cpp | 137 ++++++++++++------
 llvm/lib/Target/DPU/DPUTargetLowering.cpp     |  44 +++++-
 2 files changed, 135 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index fecb6aa79e018..007bb5ea4094f 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -245,7 +245,8 @@ static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
   //     and add JNEQrii, I could pop both
   //     and why not tackle other possible optim that may have introduce this code
   //        event from user maybe
-  if (LastOpc == DPU::JEQrii && do_have_special_metadata(LastInst)
+  // original code is JEQrii, but JNEQrii could be introduce by analyzeBranch
+  if ((LastOpc == DPU::JEQrii || LastOpc == DPU::JNEQrii) && do_have_special_metadata(LastInst)
       && SecondLastOpc == DPU::ANDrri && do_have_special_metadata(SecondLastInst)) {
     I++;
     MachineInstr *ThirdLastInst = getLastNonDebugInstrFrom(I, REnd);
@@ -284,53 +285,105 @@ static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
     }
   }
 
-  switch (SecondLastOpc) {
-  default:
-    LLVM_DEBUG(dbgs() << "KO: Unknown SecondLastOpc\n");
-    return false;
-  case DPU::CLZ_Urr: {
+  // attempt to optimize MUL_UL_ULrrr + comp res 256 + branch
+  // original code is JLTUrii, but JGEUrii could be introduce by analyzeBranch
+  if ((LastOpc == DPU::JLTUrii || LastOpc == DPU::JGEUrii) && do_have_special_metadata(LastInst)
+      && SecondLastOpc == DPU::MUL_UL_ULrrr && do_have_special_metadata(SecondLastInst)) {
+    LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+      
+    MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
+					    InstrInfo.get(DPU::MUL_UL_ULrrrci),
+					    SecondLastInst->getOperand(0).getReg())
+      .add(SecondLastInst->getOperand(1))
+      .add(SecondLastInst->getOperand(1))
+      .addImm(DPUAsmCondition::Small)
+      .addMBB(LastInst->getOperand(2).getMBB());
+
     LLVM_DEBUG({
-	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-	dbgs() << "study CLZ_Urr to CLZ_Urrci\n";
-	SecondLastInst->dump();
-	LastInst->dump();
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
       });
-    
-    bool do_def_reg_alias = false;
-    const TargetRegisterInfo *TRI = MBB->getParent()->getSubtarget().getRegisterInfo();
-    for (MCRegAliasIterator Alias(SecondLastInst->getOperand(0).getReg(), TRI, true); Alias.isValid(); ++Alias) {
-      Register AliasReg = *Alias;
-      if (LastInst->getOperand(0).getReg() == AliasReg) {
-	// dbgs() << "yep it's alias\n";
-	do_def_reg_alias = true;
-      }
-    }
-    if (LastInst->getOpcode() == DPU::JNEQrii
-	&& LastInst->getOperand(1).getImm() == 32
-	&& do_def_reg_alias
-	) {
-      LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+    LastInst->eraseFromParent();
+    SecondLastInst->eraseFromParent();
+    LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+    return true;
+  }
+
+  // original code is JNEQrii, but JEQrii could be introduce by analyzeBranch
+  if ((LastOpc == DPU::JNEQrii || LastOpc == DPU::JEQrii) && do_have_special_metadata(LastInst)
+      && SecondLastOpc == DPU::CLZ_Urr && do_have_special_metadata(SecondLastInst)) {
+    LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
       
-      MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(DPU::CLZ_Urrci), SecondLastInst->getOperand(0).getReg())
-	.add(SecondLastInst->getOperand(1))
-	.addImm(DPUAsmCondition::Condition::NotMaximum)
-	.addMBB(LastInst->getOperand(2).getMBB());
+    MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
+					    InstrInfo.get(DPU::CLZ_Urrci),
+					    SecondLastInst->getOperand(0).getReg())
+      .add(SecondLastInst->getOperand(1))
+      .addImm(DPUAsmCondition::Condition::NotMaximum)
+      .addMBB(LastInst->getOperand(2).getMBB());
 
-      LLVM_DEBUG({
-	  dbgs() << "OK\n";
-	  dbgs() << "del "; SecondLastInst->dump();
-	  dbgs() << "del "; LastInst->dump();
-	  dbgs() << "fused to\n";
-	  dbgs() << "add "; ComboInst->dump();
-	});
-      LastInst->eraseFromParent();
-      SecondLastInst->eraseFromParent();
-      LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-      return true;
-    }
-  }
+    LLVM_DEBUG({
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
+      });
+    LastInst->eraseFromParent();
+    SecondLastInst->eraseFromParent();
+    LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+    return true;
   }
 
+  // switch (SecondLastOpc) {
+  // default:
+  //   LLVM_DEBUG(dbgs() << "KO: Unknown SecondLastOpc\n");
+  //   return false;
+  // case DPU::CLZ_Urr: {
+  //   LLVM_DEBUG({
+  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  // 	dbgs() << "study CLZ_Urr to CLZ_Urrci\n";
+  // 	SecondLastInst->dump();
+  // 	LastInst->dump();
+  //     });
+    
+  //   bool do_def_reg_alias = false;
+  //   const TargetRegisterInfo *TRI = MBB->getParent()->getSubtarget().getRegisterInfo();
+  //   for (MCRegAliasIterator Alias(SecondLastInst->getOperand(0).getReg(), TRI, true); Alias.isValid(); ++Alias) {
+  //     Register AliasReg = *Alias;
+  //     if (LastInst->getOperand(0).getReg() == AliasReg) {
+  // 	// dbgs() << "yep it's alias\n";
+  // 	do_def_reg_alias = true;
+  //     }
+  //   }
+  //   if (LastInst->getOpcode() == DPU::JNEQrii
+  // 	&& LastInst->getOperand(1).getImm() == 32
+  // 	&& do_def_reg_alias
+  // 	) {
+  //     LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+      
+  //     MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(DPU::CLZ_Urrci), SecondLastInst->getOperand(0).getReg())
+  // 	.add(SecondLastInst->getOperand(1))
+  // 	.addImm(DPUAsmCondition::Condition::NotMaximum)
+  // 	.addMBB(LastInst->getOperand(2).getMBB());
+
+  //     LLVM_DEBUG({
+  // 	  dbgs() << "OK\n";
+  // 	  dbgs() << "del "; SecondLastInst->dump();
+  // 	  dbgs() << "del "; LastInst->dump();
+  // 	  dbgs() << "fused to\n";
+  // 	  dbgs() << "add "; ComboInst->dump();
+  // 	});
+  //     LastInst->eraseFromParent();
+  //     SecondLastInst->eraseFromParent();
+  //     LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+  //     return true;
+  //   }
+  // }
+  // }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
index 32e68ef5cd488..cef0c02bc7623 100644
--- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
@@ -2029,6 +2029,12 @@ static MachineBasicBlock *
 EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
                             unsigned MulLL, unsigned MulHL, unsigned MulHL2,
                             unsigned MulHH) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -2062,12 +2068,23 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   unsigned int LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
   // should be checked
-  BuildMI(BB, dl, TII.get(MulLL), LLDest)
+  // BuildMI(BB, dl, TII.get(MulLL), LLDest)
+  //     .addReg(Op1)
+  //     .addReg(Op2)
+  //     .addImm(DPUAsmCondition::Small)
+  //     .addMBB(fastMBB);
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  BuildMI(BB, dl, TII.get(DPU::MUL_UL_ULrrr), LLDest)
       .addReg(Op1)
       .addReg(Op2)
-      .addImm(DPUAsmCondition::Small)
-      .addMBB(fastMBB);
-
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JLTUrii))
+    .addReg(LLDest)
+    .addImm(0x100)
+    .addMBB(fastMBB)
+    .addMetadata(N);
+  
   BuildMI(slowMBB, dl, TII.get(MulHL), HLDest).addReg(Op1).addReg(Op2);
   BuildMI(slowMBB, dl, TII.get(DPU::LSL_ADDrrri), LSL1Dest)
       .addReg(LLDest)
@@ -2093,6 +2110,16 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
       .addMBB(slowMBB);
 
   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** slowMBB: "; slowMBB->dump();
+      dbgs() << "** fastMBB: "; fastMBB->dump();
+      dbgs() << "****** \n";
+    });
+ 
   return fastMBB;
 }
 
@@ -3375,14 +3402,23 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   // case DPU::SEQREAD_GET_CST:
   //   return EmitSeqreadGet(MI, BB, true);
   case DPU::Mul16UUrr:
+    LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Mul16UUrr\n";
+      });
     return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci,
                                        DPU::MUL_UH_ULrrr, DPU::MUL_UH_ULrrr,
                                        DPU::MUL_UH_UHrrr);
   case DPU::Mul16SUrr:
+    LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Mul16SUrr\n";
+      });
     return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci,
                                        DPU::MUL_SH_ULrrr, DPU::MUL_UH_ULrrr,
                                        DPU::MUL_SH_UHrrr);
   case DPU::Mul16SSrr:
+    LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Mul16SSrr\n";
+      });
     return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci,
                                        DPU::MUL_SH_ULrrr, DPU::MUL_SH_ULrrr,
                                        DPU::MUL_SH_SHrrr);

From 06aa1f68fbbef38ccb7c0844d79e8fc580adf199 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Sun, 21 Jul 2024 11:47:24 +0200
Subject: [PATCH 12/17] wip: got the issue ... Always a padawan before evolving
 to Sith Lord

I first moved the fusion trick from DPUMergeComboInstrPass to a new DPUPostRAFusion pass
because I wanted the optimizations running between RA and PreEmit::MergeComboInstr to be effective.

Along the way, I discovered that we were losing the consistency of our def/use
during the analyzeBranch process for those arith+comp+jump instructions:
insertBranch and buildConditionalBranch were changing the instruction in a bad way.

So, in fact, we may not even need to tweak shouldSink, and may not need DPUPostRAFusion either.
And now I believe we don't really kill the SSA form; we just weren't doing it the right way.

It was probably just bad branch reconstruction ... We always learn :)
---
 llvm/lib/Target/DPU/CMakeLists.txt            |   2 +-
 llvm/lib/Target/DPU/DPU.h                     |   1 +
 llvm/lib/Target/DPU/DPUInstrInfo.cpp          |  94 ++++--
 llvm/lib/Target/DPU/DPUInstrInfo.h            |   2 +-
 .../lib/Target/DPU/DPUMergeComboInstrPass.cpp | 185 +----------
 llvm/lib/Target/DPU/DPUPostRAFusion.cpp       | 296 ++++++++++++++++++
 llvm/lib/Target/DPU/DPUTargetLowering.cpp     | 142 +++++----
 llvm/lib/Target/DPU/DPUTargetMachine.cpp      |   6 +
 8 files changed, 444 insertions(+), 284 deletions(-)
 create mode 100644 llvm/lib/Target/DPU/DPUPostRAFusion.cpp

diff --git a/llvm/lib/Target/DPU/CMakeLists.txt b/llvm/lib/Target/DPU/CMakeLists.txt
index 7a887b71ee3aa..9e216ef08cb39 100644
--- a/llvm/lib/Target/DPU/CMakeLists.txt
+++ b/llvm/lib/Target/DPU/CMakeLists.txt
@@ -28,7 +28,7 @@ add_llvm_target(DPUCodeGen
         DPUResolveMacroInstrPass.cpp
         DPUMacroFusion.cpp
         DPUSelectionDAGInfo.cpp
-
+	DPUPostRAFusion.cpp
         DEPENDS
         intrinsics_gen
 
diff --git a/llvm/lib/Target/DPU/DPU.h b/llvm/lib/Target/DPU/DPU.h
index 2ef567d9bc868..7f84823cb9ae0 100644
--- a/llvm/lib/Target/DPU/DPU.h
+++ b/llvm/lib/Target/DPU/DPU.h
@@ -19,6 +19,7 @@ namespace llvm {
 class FunctionPass;
 class DPUTargetMachine;
 
+FunctionPass *createDPUPostRAFusionPass(DPUTargetMachine &tm);
 FunctionPass *createDPUMergeComboInstrPass(DPUTargetMachine &tm);
 FunctionPass *createDPUResolveMacroInstrPass(DPUTargetMachine &tm);
 
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index 4a39b2551d181..98f08fd8cab4f 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -551,9 +551,38 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
 
   unsigned Opc = Cond[0].getImm();
 
-  MIB = BuildMI(&MBB, DL, get(Opc));
+  // treat special cases
+  // those where not well handled with LLVM SSA stuff
+  bool have_metadata = false;
+  // TODO: find a better way to discover if it's an arithmetic+comp+jump
+  //       or simply rely solely on metadata?
+  switch (Opc) {
+  default:
+    break;
+  case DPU::CLZ_Urrci:
+  case DPU::MUL_UL_ULrrrci:
+  case DPU::LSLXrrrci:
+  case DPU::LSRXrrrci:
+    {
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	if (Cond[i].isMetadata()
+	    && Cond[i].getMetadata()->getOperand(0).get() == MDString::get(MBB.getParent()->getFunction().getContext(), "MySpecialMetadata")) {
+	  have_metadata = true;
+	}
+      }
+      break;
+    }
+  }
 
-  for (unsigned i = 1; i < Cond.size(); ++i) {
+  unsigned start = 1;
+  if (have_metadata) {
+    MIB = BuildMI(&MBB, DL, get(Opc), Cond[start].getReg());
+    start++;
+  } else {
+    MIB = BuildMI(&MBB, DL, get(Opc));
+  }
+
+  for (unsigned i = start; i < Cond.size(); ++i) {
     if (Cond[i].isReg()) {
       // The register in question could potentially be a
       // subreg hi/lo of a 64-bit vreg
@@ -579,6 +608,7 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
       MIB.addMetadata(Cond[i].getMetadata());
      }
   }
+
   LLVM_DEBUG({
       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
       dbgs() << "MIB "; MIB->dump();
@@ -625,33 +655,33 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
   return nrOfInsertedMachineInstr;
 }
 
-bool DPUInstrInfo::shouldSink(const MachineInstr &MI) const {
-  switch (MI.getDesc().getOpcode()) {
-  default:
-    break;
-  case DPU::CLZ_Urr:
-  case DPU::LSLXrrr:
-  case DPU::LSRXrrr:
-  case DPU::ANDrri:
-  case DPU::JEQrii:
-  case DPU::JNEQrii:
-    {
-      //   return false;
-      for (const MachineOperand &Op : MI.operands()) {
-	if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(MI.getMF()->getFunction().getContext(), "MySpecialMetadata")) {
-	  LLVM_DEBUG({
-	      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Don't sink because I have MySpecialMetadata.\n";
-	    });
-	  return false; // Do not sink this instruction
-	}
-      }
-      LLVM_DEBUG({
-	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " I'm potentially something used in arith+cond+jump from EmitInstrWithCustomInserter but I allow sink because I don't have MySpecialMetadata.\n";
-	});
-      break;
-    }
-  }
-
-  // return true;
-  return TargetInstrInfo::shouldSink(MI);
-}
+// bool DPUInstrInfo::shouldSink(const MachineInstr &MI) const {
+//   switch (MI.getDesc().getOpcode()) {
+//   default:
+//     break;
+//   case DPU::CLZ_Urr:
+//   case DPU::LSLXrrr:
+//   case DPU::LSRXrrr:
+//   case DPU::ANDrri:
+//   case DPU::JEQrii:
+//   case DPU::JNEQrii:
+//     {
+//       //   return false;
+//       for (const MachineOperand &Op : MI.operands()) {
+// 	if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(MI.getMF()->getFunction().getContext(), "MySpecialMetadata")) {
+// 	  LLVM_DEBUG({
+// 	      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Don't sink because I have MySpecialMetadata.\n";
+// 	    });
+// 	  return false; // Do not sink this instruction
+// 	}
+//       }
+//       LLVM_DEBUG({
+// 	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " I'm potentially something used in arith+cond+jump from EmitInstrWithCustomInserter but I allow sink because I don't have MySpecialMetadata.\n";
+// 	});
+//       break;
+//     }
+//   }
+
+//   // return true;
+//   return TargetInstrInfo::shouldSink(MI);
+// }
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.h b/llvm/lib/Target/DPU/DPUInstrInfo.h
index 14c199c9160e8..2d08d67f4f721 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.h
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.h
@@ -66,7 +66,7 @@ class DPUInstrInfo : public DPUGenInstrInfo {
   void buildConditionalBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                               DebugLoc DL, ArrayRef<MachineOperand> Cond) const;
 
-  bool shouldSink(const MachineInstr &MI) const override;
+  // bool shouldSink(const MachineInstr &MI) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index 007bb5ea4094f..b126c7d1e52e8 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -205,188 +205,6 @@ getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I,
   return &*I;
 }
 
-static bool do_have_special_metadata(MachineInstr *MI) {
-  for (const MachineOperand &Op : MI->operands()) {
-    if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(MI->getMF()->getFunction().getContext(), "MySpecialMetadata")) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-static bool mergeBranchArithmeticInMBB(MachineBasicBlock *MBB,
-				       const DPUInstrInfo &InstrInfo) {
-  MachineBasicBlock::reverse_iterator I = MBB->rbegin(), REnd = MBB->rend();
-  MachineInstr *LastInst, *SecondLastInst;
-  unsigned int LastOpc, SecondLastOpc;
-
-  LastInst = getLastNonDebugInstrFrom(I, REnd);
-  if (LastInst == NULL) {
-    LLVM_DEBUG(dbgs() << "KO: I == REnd\n");
-    return false;
-  }
-  I++;
-  SecondLastInst = getLastNonDebugInstrFrom(I, REnd);
-  if (SecondLastInst == NULL) {
-    LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n");
-    return false;
-  }
-
-  LastOpc = LastInst->getOpcode();
-  SecondLastOpc = SecondLastInst->getOpcode();
-
-  // attempt to merge lslx/lsrx and XX 32 jeq XX 32 instructions
-  // TODO: check if it's shift32 as well?
-  //       or maybe use other metadata?
-  //         but this is to be extra careful, or for the next player in the game ... :)
-  // though, here I apply only when with my metadata
-  //   but if I actually not test my metadata, maybe
-  //     and add JNEQrii, I could pop both
-  //     and why not tackle other possible optim that may have introduce this code
-  //        event from user maybe
-  // original code is JEQrii, but JNEQrii could be introduce by analyzeBranch
-  if ((LastOpc == DPU::JEQrii || LastOpc == DPU::JNEQrii) && do_have_special_metadata(LastInst)
-      && SecondLastOpc == DPU::ANDrri && do_have_special_metadata(SecondLastInst)) {
-    I++;
-    MachineInstr *ThirdLastInst = getLastNonDebugInstrFrom(I, REnd);
-    if (ThirdLastInst == NULL) {
-      LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n");
-      return false;
-    }
-    unsigned int ThirdLastOpc = ThirdLastInst->getOpcode();
-    if ((ThirdLastOpc == DPU::LSLXrrr || ThirdLastOpc == DPU::LSRXrrr)
-	&& do_have_special_metadata(ThirdLastInst)) {
-      LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-      unsigned int new_opcode = (ThirdLastOpc == DPU::LSLXrrr ?
-				 DPU::LSLXrrrci : DPU::LSRXrrrci);
-      MachineInstrBuilder ComboInst = BuildMI(MBB, ThirdLastInst->getDebugLoc(),
-					      InstrInfo.get(new_opcode),
-					      ThirdLastInst->getOperand(0).getReg());
-      ComboInst.add(ThirdLastInst->getOperand(1));
-      ComboInst.add(ThirdLastInst->getOperand(2));
-      ComboInst.addImm(DPUAsmCondition::Condition::Shift32);
-      ComboInst.addMBB(LastInst->getOperand(2).getMBB());
-
-      LLVM_DEBUG({
-	  dbgs() << "OK\n";
-	  dbgs() << "del "; ThirdLastInst->dump();
-	  dbgs() << "del "; SecondLastInst->dump();
-	  dbgs() << "del "; LastInst->dump();
-	  dbgs() << "fused to\n";
-	  dbgs() << "add "; ComboInst->dump();
-	});
-
-      LastInst->eraseFromParent();
-      SecondLastInst->eraseFromParent();
-      ThirdLastInst->eraseFromParent();
-      LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-      return true;
-    }
-  }
-
-  // attempt to optimize MUL_UL_ULrrr + comp res 256 + branch
-  // original code is JLTUrii, but JGEUrii could be introduce by analyzeBranch
-  if ((LastOpc == DPU::JLTUrii || LastOpc == DPU::JGEUrii) && do_have_special_metadata(LastInst)
-      && SecondLastOpc == DPU::MUL_UL_ULrrr && do_have_special_metadata(SecondLastInst)) {
-    LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-      
-    MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
-					    InstrInfo.get(DPU::MUL_UL_ULrrrci),
-					    SecondLastInst->getOperand(0).getReg())
-      .add(SecondLastInst->getOperand(1))
-      .add(SecondLastInst->getOperand(1))
-      .addImm(DPUAsmCondition::Small)
-      .addMBB(LastInst->getOperand(2).getMBB());
-
-    LLVM_DEBUG({
-	dbgs() << "OK\n";
-	dbgs() << "del "; SecondLastInst->dump();
-	dbgs() << "del "; LastInst->dump();
-	dbgs() << "fused to\n";
-	dbgs() << "add "; ComboInst->dump();
-      });
-    LastInst->eraseFromParent();
-    SecondLastInst->eraseFromParent();
-    LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-    return true;
-  }
-
-  // original code is JNEQrii, but JEQrii could be introduce by analyzeBranch
-  if ((LastOpc == DPU::JNEQrii || LastOpc == DPU::JEQrii) && do_have_special_metadata(LastInst)
-      && SecondLastOpc == DPU::CLZ_Urr && do_have_special_metadata(SecondLastInst)) {
-    LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-      
-    MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
-					    InstrInfo.get(DPU::CLZ_Urrci),
-					    SecondLastInst->getOperand(0).getReg())
-      .add(SecondLastInst->getOperand(1))
-      .addImm(DPUAsmCondition::Condition::NotMaximum)
-      .addMBB(LastInst->getOperand(2).getMBB());
-
-    LLVM_DEBUG({
-	dbgs() << "OK\n";
-	dbgs() << "del "; SecondLastInst->dump();
-	dbgs() << "del "; LastInst->dump();
-	dbgs() << "fused to\n";
-	dbgs() << "add "; ComboInst->dump();
-      });
-    LastInst->eraseFromParent();
-    SecondLastInst->eraseFromParent();
-    LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-    return true;
-  }
-
-  // switch (SecondLastOpc) {
-  // default:
-  //   LLVM_DEBUG(dbgs() << "KO: Unknown SecondLastOpc\n");
-  //   return false;
-  // case DPU::CLZ_Urr: {
-  //   LLVM_DEBUG({
-  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  // 	dbgs() << "study CLZ_Urr to CLZ_Urrci\n";
-  // 	SecondLastInst->dump();
-  // 	LastInst->dump();
-  //     });
-    
-  //   bool do_def_reg_alias = false;
-  //   const TargetRegisterInfo *TRI = MBB->getParent()->getSubtarget().getRegisterInfo();
-  //   for (MCRegAliasIterator Alias(SecondLastInst->getOperand(0).getReg(), TRI, true); Alias.isValid(); ++Alias) {
-  //     Register AliasReg = *Alias;
-  //     if (LastInst->getOperand(0).getReg() == AliasReg) {
-  // 	// dbgs() << "yep it's alias\n";
-  // 	do_def_reg_alias = true;
-  //     }
-  //   }
-  //   if (LastInst->getOpcode() == DPU::JNEQrii
-  // 	&& LastInst->getOperand(1).getImm() == 32
-  // 	&& do_def_reg_alias
-  // 	) {
-  //     LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-      
-  //     MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(DPU::CLZ_Urrci), SecondLastInst->getOperand(0).getReg())
-  // 	.add(SecondLastInst->getOperand(1))
-  // 	.addImm(DPUAsmCondition::Condition::NotMaximum)
-  // 	.addMBB(LastInst->getOperand(2).getMBB());
-
-  //     LLVM_DEBUG({
-  // 	  dbgs() << "OK\n";
-  // 	  dbgs() << "del "; SecondLastInst->dump();
-  // 	  dbgs() << "del "; LastInst->dump();
-  // 	  dbgs() << "fused to\n";
-  // 	  dbgs() << "add "; ComboInst->dump();
-  // 	});
-  //     LastInst->eraseFromParent();
-  //     SecondLastInst->eraseFromParent();
-  //     LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-  //     return true;
-  //   }
-  // }
-  // }
-
-  return false;
-}
-
 static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
                                         const DPUInstrInfo &InstrInfo) {
   MachineBasicBlock::reverse_iterator I = MBB->rbegin(), REnd = MBB->rend();
@@ -1015,8 +833,7 @@ bool DPUMergeComboInstrPass::runOnMachineFunction(MachineFunction &MF) {
 
     LLVM_DEBUG(MBB->dump());
 
-    bool local_change = mergeBranchArithmeticInMBB(MBB, InstrInfo);
-    local_change |= mergeComboInstructionsInMBB(MBB, InstrInfo);
+    bool local_change = mergeComboInstructionsInMBB(MBB, InstrInfo);
     if (local_change) {
       LLVM_DEBUG({
 	  dbgs() << "\nchanged to:\n";
diff --git a/llvm/lib/Target/DPU/DPUPostRAFusion.cpp b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp
new file mode 100644
index 0000000000000..cae1aedaf03ef
--- /dev/null
+++ b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp
@@ -0,0 +1,296 @@
+#include "DPUTargetMachine.h"
+#include "DPU.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include <llvm/CodeGen/MachineInstrBuilder.h>
+
+#define GET_INSTRINFO_ENUM
+
+#include "DPUCondCodes.h"
+#include "DPUGenInstrInfo.inc"
+#include "DPUISelLowering.h"
+#include "MCTargetDesc/DPUAsmCondition.h"
+
+#define GET_REGINFO_ENUM
+#include "DPUGenRegisterInfo.inc"
+
+#define DEBUG_TYPE "dpu-postra-fusion"
+
+using namespace llvm;
+
+namespace {
+class DPUPostRAFusionPass : public MachineFunctionPass {
+public:
+  static char ID;
+
+  explicit DPUPostRAFusionPass(DPUTargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  llvm::StringRef getPassName() const override {
+    return "DPU PostRA Fussion";
+  }
+
+private:
+  const DPUTargetMachine &TM;
+};
+
+char DPUPostRAFusionPass::ID = 0;
+} // namespace
+
+FunctionPass *llvm::createDPUPostRAFusionPass(DPUTargetMachine &tm) {
+  return new DPUPostRAFusionPass(tm);
+}
+
+static MachineInstr *
+getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I,
+                         MachineBasicBlock::reverse_iterator REnd) {
+  // Skip all the debug instructions.
+  while (I != REnd &&
+         (I->isDebugValue() || I->getOpcode() == TargetOpcode::DBG_VALUE)) {
+    ++I;
+  }
+  if (I == REnd) {
+    return NULL;
+  }
+  return &*I;
+}
+
+static bool do_have_special_metadata(MachineInstr *MI) {
+  for (const MachineOperand &Op : MI->operands()) {
+    if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(MI->getMF()->getFunction().getContext(), "MySpecialMetadata")) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static bool runOnMachineBB(MachineBasicBlock *MBB,
+			   const DPUInstrInfo &InstrInfo) {
+  MachineBasicBlock::reverse_iterator I = MBB->rbegin(), REnd = MBB->rend();
+  MachineInstr *LastInst, *SecondLastInst;
+  unsigned int LastOpc, SecondLastOpc;
+
+  LLVMContext &Context = MBB->getParent()->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  LastInst = getLastNonDebugInstrFrom(I, REnd);
+  if (LastInst == NULL) {
+    // LLVM_DEBUG(dbgs() << "KO: I == REnd\n");
+    return false;
+  }
+  I++;
+  SecondLastInst = getLastNonDebugInstrFrom(I, REnd);
+  if (SecondLastInst == NULL) {
+    // LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n");
+    return false;
+  }
+
+  LastOpc = LastInst->getOpcode();
+  SecondLastOpc = SecondLastInst->getOpcode();
+
+  // attempt to merge lslx/lsrx and XX 32 jeq XX 32 instructions
+  // TODO: check if it's shift32 as well?
+  //       or maybe use other metadata?
+  //         but this is to be extra careful, or for the next player in the game ... :)
+  // though, here I apply only when with my metadata
+  //   but if I actually not test my metadata, maybe
+  //     and add JNEQrii, I could pop both
+  //     and why not tackle other possible optim that may have introduce this code
+  //        event from user maybe
+  // original code is JEQrii, but JNEQrii could be introduce by analyzeBranch
+  if ((LastOpc == DPU::JEQrii || LastOpc == DPU::JNEQrii) && do_have_special_metadata(LastInst)
+      && SecondLastOpc == DPU::ANDrri && do_have_special_metadata(SecondLastInst)) {
+    I++;
+    MachineInstr *ThirdLastInst = getLastNonDebugInstrFrom(I, REnd);
+    if (ThirdLastInst == NULL) {
+      // LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n");
+      return false;
+    }
+    unsigned int ThirdLastOpc = ThirdLastInst->getOpcode();
+    if ((ThirdLastOpc == DPU::LSLXrrr || ThirdLastOpc == DPU::LSRXrrr)
+	&& do_have_special_metadata(ThirdLastInst)) {
+
+      LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
+      unsigned int new_opcode = (ThirdLastOpc == DPU::LSLXrrr ?
+				 DPU::LSLXrrrci : DPU::LSRXrrrci);
+      MachineInstrBuilder ComboInst = BuildMI(MBB, ThirdLastInst->getDebugLoc(),
+					      InstrInfo.get(new_opcode),
+					      ThirdLastInst->getOperand(0).getReg());
+      ComboInst.add(ThirdLastInst->getOperand(1));
+      ComboInst.add(ThirdLastInst->getOperand(2));
+      ComboInst.addImm(DPUAsmCondition::Condition::Shift32);
+      ComboInst.addMBB(LastInst->getOperand(2).getMBB());
+      ComboInst.addMetadata(N);
+      
+      LLVM_DEBUG({
+	  dbgs() << "OK\n";
+	  dbgs() << "del "; ThirdLastInst->dump();
+	  dbgs() << "del "; SecondLastInst->dump();
+	  dbgs() << "del "; LastInst->dump();
+	  dbgs() << "fused to\n";
+	  dbgs() << "add "; ComboInst->dump();
+	});
+
+      LastInst->eraseFromParent();
+      SecondLastInst->eraseFromParent();
+      ThirdLastInst->eraseFromParent();
+      LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+      return true;
+    }
+  }
+
+  // Fuse MUL_UL_ULrrr + compare-with-256 + branch into a single MUL_UL_ULrrrci.
+  // original code is JLTUrii, but JGEUrii could be introduced by analyzeBranch
+  if ((LastOpc == DPU::JLTUrii || LastOpc == DPU::JGEUrii) && do_have_special_metadata(LastInst)
+      && SecondLastOpc == DPU::MUL_UL_ULrrr && do_have_special_metadata(SecondLastInst)) {
+    // NOTE(review): JLTUrii and JGEUrii both emit Small + LastInst's target; looks inverted for JGEUrii - confirm.
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+    // Operand 0 is the result register; operands 1 and 2 are the two multiplicands.
+    MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
+					    InstrInfo.get(DPU::MUL_UL_ULrrrci),
+					    SecondLastInst->getOperand(0).getReg());
+    ComboInst.add(SecondLastInst->getOperand(1));
+    ComboInst.add(SecondLastInst->getOperand(2));
+    ComboInst.addImm(DPUAsmCondition::Small);
+    ComboInst.addMBB(LastInst->getOperand(2).getMBB());
+    ComboInst.addMetadata(N);
+    
+    LLVM_DEBUG({
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
+      });
+    LastInst->eraseFromParent();
+    SecondLastInst->eraseFromParent();
+
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
+    return true;
+  }
+
+  // original code is JNEQrii, but JEQrii could be introduced by analyzeBranch
+  if ((LastOpc == DPU::JNEQrii || LastOpc == DPU::JEQrii) && do_have_special_metadata(LastInst)
+      && SecondLastOpc == DPU::CLZ_Urr && do_have_special_metadata(SecondLastInst)) {
+
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
+    MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
+					    InstrInfo.get(DPU::CLZ_Urrci),
+					    SecondLastInst->getOperand(0).getReg());
+    ComboInst.add(SecondLastInst->getOperand(1));
+    ComboInst.addImm(DPUAsmCondition::Condition::NotMaximum);
+    ComboInst.addMBB(LastInst->getOperand(2).getMBB());
+    ComboInst.addMetadata(N);
+
+    LLVM_DEBUG({
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
+      });
+    
+    LastInst->eraseFromParent();
+    SecondLastInst->eraseFromParent();
+
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
+    return true;
+  }
+
+  // switch (SecondLastOpc) {
+  // default:
+  //   LLVM_DEBUG(dbgs() << "KO: Unknown SecondLastOpc\n");
+  //   return false;
+  // case DPU::CLZ_Urr: {
+  //   LLVM_DEBUG({
+  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  // 	dbgs() << "study CLZ_Urr to CLZ_Urrci\n";
+  // 	SecondLastInst->dump();
+  // 	LastInst->dump();
+  //     });
+    
+  //   bool do_def_reg_alias = false;
+  //   const TargetRegisterInfo *TRI = MBB->getParent()->getSubtarget().getRegisterInfo();
+  //   for (MCRegAliasIterator Alias(SecondLastInst->getOperand(0).getReg(), TRI, true); Alias.isValid(); ++Alias) {
+  //     Register AliasReg = *Alias;
+  //     if (LastInst->getOperand(0).getReg() == AliasReg) {
+  // 	// dbgs() << "yep it's alias\n";
+  // 	do_def_reg_alias = true;
+  //     }
+  //   }
+  //   if (LastInst->getOpcode() == DPU::JNEQrii
+  // 	&& LastInst->getOperand(1).getImm() == 32
+  // 	&& do_def_reg_alias
+  // 	) {
+  //     LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+      
+  //     MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(DPU::CLZ_Urrci), SecondLastInst->getOperand(0).getReg())
+  // 	.add(SecondLastInst->getOperand(1))
+  // 	.addImm(DPUAsmCondition::Condition::NotMaximum)
+  // 	.addMBB(LastInst->getOperand(2).getMBB());
+
+  //     LLVM_DEBUG({
+  // 	  dbgs() << "OK\n";
+  // 	  dbgs() << "del "; SecondLastInst->dump();
+  // 	  dbgs() << "del "; LastInst->dump();
+  // 	  dbgs() << "fused to\n";
+  // 	  dbgs() << "add "; ComboInst->dump();
+  // 	});
+  //     LastInst->eraseFromParent();
+  //     SecondLastInst->eraseFromParent();
+  //     LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
+  //     return true;
+  //   }
+  // }
+  // }
+  
+  return false;
+}
+
+bool DPUPostRAFusionPass::runOnMachineFunction(MachineFunction &MF) {
+  // Runs the per-block fusion over every block of MF; returns true if any changed.
+  LLVM_DEBUG(dbgs() << "********** DPU/DPUPostRAFusionPass: " << MF.getName()
+                    << " **********\n\n");
+
+  const auto &DPUST = static_cast<const DPUSubtarget &>(MF.getSubtarget());
+  auto &InstrInfo = *DPUST.getInstrInfo();
+
+  bool changeMade = false;
+  // `|=` rather than `||`: each block is still visited after an earlier change.
+  for (MachineBasicBlock &Block : MF)
+    changeMade |= runOnMachineBB(&Block, InstrInfo);
+
+  LLVM_DEBUG(dbgs() << "********** DPU/DPUPostRAFusionPass: " << MF.getName()
+                    << " done: changeMade = " << changeMade << " **********\n\n");
+  return changeMade;
+}
diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
index cef0c02bc7623..8d55bca676900 100644
--- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
@@ -2067,23 +2067,24 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   unsigned int LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned int LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  // should be checked
-  // BuildMI(BB, dl, TII.get(MulLL), LLDest)
-  //     .addReg(Op1)
-  //     .addReg(Op2)
-  //     .addImm(DPUAsmCondition::Small)
-  //     .addMBB(fastMBB);
   LLVMContext &Context = F->getFunction().getContext();
   MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
-  BuildMI(BB, dl, TII.get(DPU::MUL_UL_ULrrr), LLDest)
+  BuildMI(BB, dl, TII.get(MulLL), LLDest)
       .addReg(Op1)
       .addReg(Op2)
+      .addImm(DPUAsmCondition::Small)
+      .addMBB(fastMBB)
     .addMetadata(N);
-  BuildMI(BB, dl, TII.get(DPU::JLTUrii))
-    .addReg(LLDest)
-    .addImm(0x100)
-    .addMBB(fastMBB)
-    .addMetadata(N);
+
+  // BuildMI(BB, dl, TII.get(DPU::MUL_UL_ULrrr), LLDest)
+  //     .addReg(Op1)
+  //     .addReg(Op2)
+  //   .addMetadata(N);
+  // BuildMI(BB, dl, TII.get(DPU::JLTUrii))
+  //   .addReg(LLDest)
+  //   .addImm(0x100)
+  //   .addMBB(fastMBB)
+  //   .addMetadata(N);
   
   BuildMI(slowMBB, dl, TII.get(MulHL), HLDest).addReg(Op1).addReg(Op2);
   BuildMI(slowMBB, dl, TII.get(DPU::LSL_ADDrrri), LSL1Dest)
@@ -2462,17 +2463,22 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned Undef2Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
   // BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg)
   //     .addReg(Op1Reg, 0, DPU::sub_32bit);
 
   // unsigned DummyReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   
   /// faulty
-  // BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
-  //     .addReg(LsbOp1Reg)
-  //     .addReg(ShiftReg)
-  //     .addImm(DPUAsmCondition::Condition::Shift32)
-  //     .addMBB(bigShiftMBB);
+  BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
+      // .addReg(LsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit)
+      .addReg(ShiftReg)
+      .addImm(DPUAsmCondition::Condition::Shift32)
+      .addMBB(bigShiftMBB)
+    .addMetadata(N);
 
   /// good, but
   // could increase quite a bit the code size
@@ -2482,22 +2488,20 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   //   on a few example, I can keep them adjacent
   //  but I may kill other optimization stuff in other code
   //   that use it genuinelly
-  LLVMContext &Context = F->getFunction().getContext();
-  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
-  BuildMI(BB, dl, TII.get(DPU::LSLXrrr), LsbToMsbPartReg)
-    // .addReg(LsbOp1Reg)
-    .addReg(Op1Reg, 0, DPU::sub_32bit)
-    .addReg(ShiftReg)
-    .addMetadata(N);
-  BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
-    .addReg(ShiftReg)
-    .addImm(0x20)
-    .addMetadata(N);
-  BuildMI(BB, dl, TII.get(DPU::JEQrii))
-    .addReg(ShiftReg_check)
-    .addImm(0x20)
-    .addMBB(bigShiftMBB)
-    .addMetadata(N);
+  // BuildMI(BB, dl, TII.get(DPU::LSLXrrr), LsbToMsbPartReg)
+  //   // .addReg(LsbOp1Reg)
+  //   .addReg(Op1Reg, 0, DPU::sub_32bit)
+  //   .addReg(ShiftReg)
+  //   .addMetadata(N);
+  // BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
+  //   .addReg(ShiftReg)
+  //   .addImm(0x20)
+  //   .addMetadata(N);
+  // BuildMI(BB, dl, TII.get(DPU::JEQrii))
+  //   .addReg(ShiftReg_check)
+  //   .addImm(0x20)
+  //   .addMBB(bigShiftMBB)
+  //   .addMetadata(N);
   
   // BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg)
       // .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
@@ -2746,32 +2750,36 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
       RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
-  BuildMI(BB, dl, TII.get(DPU::COPY), MsbOp1Reg)
-      .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
-
-  // BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
-  //     .addReg(MsbOp1Reg)
-  //     .addReg(ShiftReg)
-  //     .addImm(DPUAsmCondition::Condition::Shift32)
-  //     .addMBB(bigShiftMBB);
-
   LLVMContext &Context = F->getFunction().getContext();
   MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
 
-  BuildMI(BB, dl, TII.get(DPU::LSRXrrr), MsbToLsbPartReg)
-    .addReg(MsbOp1Reg)
-    .addReg(ShiftReg)
-    .addMetadata(N);
-  BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
-    .addReg(ShiftReg)
-    .addImm(0x20)
-    .addMetadata(N);
-  BuildMI(BB, dl, TII.get(DPU::JEQrii))
-    .addReg(ShiftReg_check)
-    .addImm(0x20)
-    .addMBB(bigShiftMBB)
+  BuildMI(BB, dl, TII.get(DPU::COPY), MsbOp1Reg)
+      .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
+
+  BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
+      .addReg(MsbOp1Reg)
+      .addReg(ShiftReg)
+      .addImm(DPUAsmCondition::Condition::Shift32)
+      .addMBB(bigShiftMBB)
     .addMetadata(N);
 
+  // LLVMContext &Context = F->getFunction().getContext();
+  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  // BuildMI(BB, dl, TII.get(DPU::LSRXrrr), MsbToLsbPartReg)
+  //   .addReg(MsbOp1Reg)
+  //   .addReg(ShiftReg)
+  //   .addMetadata(N);
+  // BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
+  //   .addReg(ShiftReg)
+  //   .addImm(0x20)
+  //   .addMetadata(N);
+  // BuildMI(BB, dl, TII.get(DPU::JEQrii))
+  //   .addReg(ShiftReg_check)
+  //   .addImm(0x20)
+  //   .addMBB(bigShiftMBB)
+  //   .addMetadata(N);
+
   BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), LsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
 
@@ -3223,23 +3231,25 @@ static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI,
 
   unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned LsbAddReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  
-  // BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
-  //     .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
-  //     .addImm(DPUAsmCondition::Condition::NotMaximum)
-  //     .addMBB(endMBB);
 
   LLVMContext &Context = F->getFunction().getContext();
   MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
-  BuildMI(BB, dl, TII.get(DPU::CLZ_Urr), FastResultReg)
-    .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
-    .addMetadata(N);
-  BuildMI(BB, dl, TII.get(DPU::JNEQrii))
-    .addReg(FastResultReg, 0, DPU::sub_32bit)
-    .addImm(32)
-    .addMBB(endMBB)
+
+  BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
+      .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
+      .addImm(DPUAsmCondition::Condition::NotMaximum)
+      .addMBB(endMBB)
     .addMetadata(N);
 
+  // BuildMI(BB, dl, TII.get(DPU::CLZ_Urr), FastResultReg)
+  //   .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
+  //   .addMetadata(N);
+  // BuildMI(BB, dl, TII.get(DPU::JNEQrii))
+  //   .addReg(FastResultReg, 0, DPU::sub_32bit)
+  //   .addImm(32)
+  //   .addMBB(endMBB)
+  //   .addMetadata(N);
+
   BuildMI(msbAreZerosMBB, dl, TII.get(DPU::CLZrr), LsbClzReg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
 
diff --git a/llvm/lib/Target/DPU/DPUTargetMachine.cpp b/llvm/lib/Target/DPU/DPUTargetMachine.cpp
index 5815b161c6ce9..a292eb41821d0 100644
--- a/llvm/lib/Target/DPU/DPUTargetMachine.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetMachine.cpp
@@ -84,6 +84,7 @@ class DPUPassConfig : public TargetPassConfig {
 
   bool addInstSelector() override;
 
+  // void addPostRegAlloc() override;
   void addPreEmitPass() override;
   void addPreEmitPass2() override;
 };
@@ -103,6 +104,11 @@ bool DPUPassConfig::addInstSelector() {
   return false;
 }
 
+// void DPUPassConfig::addPostRegAlloc() {
+//   DPUTargetMachine &TM = getDPUTargetMachine();
+//   addPass(createDPUPostRAFusionPass(TM));
+// }
+
 void DPUPassConfig::addPreEmitPass() {
   DPUTargetMachine &TM = getDPUTargetMachine();
   addPass(createDPUMergeComboInstrPass(TM));

From f4b565dc26be12c2becbc9cfbac36c3abd559db4 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Mon, 22 Jul 2024 07:47:56 +0200
Subject: [PATCH 13/17] wip: have a go without the metadata now, and keep
 it simple in buildConditionalBranch

---
 llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp       |   4 +
 llvm/lib/Target/DPU/DPUInstrInfo.cpp          | 149 +++++++++++++-----
 .../lib/Target/DPU/DPUMergeComboInstrPass.cpp |  83 +++++++---
 llvm/lib/Target/DPU/DPUTargetLowering.cpp     |  28 ++--
 4 files changed, 189 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
index 00adb6c2b9f6e..d2365966a09a0 100644
--- a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
@@ -149,6 +149,10 @@ bool DPUDAGToDAGISel::replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
                                                  const DPUInstrInfo *DII,
                                                  const TargetRegisterInfo *TRI,
                                                  const MachineInstr &MI) {
+  // This function seems to do manual coalescing
+  //    probably we should use the proper one that probably knows better
+  //    maybe prob with MI operand constraint ... ?
+  //    probably better to educate the coalescer, or better define register class
   unsigned DstReg = 0, CstReg = 0;
 
   if (MI.getOpcode() == DPU::COPY) {
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index 98f08fd8cab4f..5f8591a042380 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -321,6 +321,7 @@ static void fetchConditionalBranchInfo(MachineInstr *Inst,
   LLVM_DEBUG({
       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
       dbgs() << "Inst "; Inst->dump();
+      dbgs() << "Cond.size() " << Cond.size() << "\n";
       for (unsigned i = 0; i < Cond.size(); ++i) {
 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
       }
@@ -329,11 +330,36 @@ static void fetchConditionalBranchInfo(MachineInstr *Inst,
   unsigned Opc = Inst->getOpcode();
   Cond.push_back(MachineOperand::CreateImm(Opc));
 
+  // for (unsigned int eachOperandIndex = 0; eachOperandIndex < Inst->getNumOperands();
+  //      eachOperandIndex++) {
+  //   MachineOperand &operand = Inst->getOperand(eachOperandIndex);
+  //   LLVM_DEBUG({
+  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  // 	dbgs() << "operand " << eachOperandIndex << ": "; operand.dump();
+  //     });
+  //   if (operand.isMBB()) {
+  //     targetBasicBlockOperandIndex = eachOperandIndex;
+  //   } else {
+  //     Cond.push_back(operand);
+  //   }
+  // }
   unsigned int NumOp = Inst->getNumExplicitOperands();
+  // unsigned int NumOp = Inst->getNumOperands();
 
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "NumOp " << NumOp << "\n";
+    });
   for (unsigned int eachOperandIndex = 0; eachOperandIndex < NumOp;
        eachOperandIndex++) {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "operand " << eachOperandIndex << ": ";
+      });
     MachineOperand &operand = Inst->getOperand(eachOperandIndex);
+    LLVM_DEBUG({
+	operand.dump();
+      });
     if (operand.isMBB()) {
       targetBasicBlockOperandIndex = eachOperandIndex;
     } else {
@@ -341,15 +367,16 @@ static void fetchConditionalBranchInfo(MachineInstr *Inst,
     }
   }
 
-  for (const MachineOperand &Op : Inst->operands()) {
-    if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(Inst->getMF()->getFunction().getContext(), "MySpecialMetadata")) {
-      Cond.push_back(Op);
-    }
-  }
+  // for (const MachineOperand &Op : Inst->operands()) {
+  //   if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(Inst->getMF()->getFunction().getContext(), "MySpecialMetadata")) {
+  //     Cond.push_back(Op);
+  //   }
+  // }
   
   LLVM_DEBUG({
       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
       dbgs() << "Inst "; Inst->dump();
+      dbgs() << "Cond.size() " << Cond.size() << "\n";
       for (unsigned i = 0; i < Cond.size(); ++i) {
 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
       }
@@ -432,6 +459,8 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
       LLVM_DEBUG({
 	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
 	  dbgs() << "MBB "; MBB.dump();
+	  dbgs() << "LastInst "; LastInst->dump();
+	  dbgs() << "TBBOpIdx " << TBBOpIdx << "\n";
 	  for (unsigned i = 0; i < Cond.size(); ++i) {
 	    dbgs() << "Cond[" << i << "] "; Cond[i].dump();
 	  }
@@ -481,6 +510,10 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     LLVM_DEBUG({
 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
 	dbgs() << "MBB "; MBB.dump();
+	dbgs() << "LastInst "; LastInst->dump();
+	dbgs() << "SecondLastInst "; SecondLastInst->dump();
+	dbgs() << "TBBOpIdx " << TBBOpIdx << "\n";
+
 	for (unsigned i = 0; i < Cond.size(); ++i) {
 	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
 	}
@@ -528,6 +561,10 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
       dbgs() << "MBB "; MBB.dump();
       for (unsigned i = 0; i < Cond.size(); ++i) {
 	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+	  if (Cond[i].isReg()) {
+	    dbgs() << "Cond[" << i << "] isUse " << Cond[i].isUse() << "\n";
+	    dbgs() << "Cond[" << i << "] isDef " << Cond[i].isDef() << "\n";
+	  }
       }
     });
 
@@ -553,44 +590,72 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
 
   // treat special cases
   // those where not well handled with LLVM SSA stuff
-  bool have_metadata = false;
+  // bool have_metadata = false;
   // TODO: find a better way to discover if it's an arithmetic+comp+jump
   //       or simply rely solely on metadata?
-  switch (Opc) {
-  default:
-    break;
-  case DPU::CLZ_Urrci:
-  case DPU::MUL_UL_ULrrrci:
-  case DPU::LSLXrrrci:
-  case DPU::LSRXrrrci:
-    {
-      for (unsigned i = 0; i < Cond.size(); ++i) {
-	if (Cond[i].isMetadata()
-	    && Cond[i].getMetadata()->getOperand(0).get() == MDString::get(MBB.getParent()->getFunction().getContext(), "MySpecialMetadata")) {
-	  have_metadata = true;
-	}
-      }
-      break;
-    }
-  }
+  // switch (Opc) {
+  // default:
+  //   break;
+  // case DPU::CLZ_Urrci:
+  // case DPU::MUL_UL_ULrrrci:
+  // case DPU::LSLXrrrci:
+  // case DPU::LSRXrrrci:
+  //   {
+  //     for (unsigned i = 0; i < Cond.size(); ++i) {
+  // 	if (Cond[i].isMetadata()
+  // 	    && Cond[i].getMetadata()->getOperand(0).get() == MDString::get(MBB.getParent()->getFunction().getContext(), "MySpecialMetadata")) {
+  // 	  have_metadata = true;
+  // 	}
+  //     }
+  //     break;
+  //   }
+  // }
 
+  MIB = BuildMI(&MBB, DL, get(Opc));
+  // for (unsigned i = 1; i < Cond.size(); ++i) {
+  //   MIB->addOperand(Cond[i]);
+  // }
+  
+  
   unsigned start = 1;
-  if (have_metadata) {
-    MIB = BuildMI(&MBB, DL, get(Opc), Cond[start].getReg());
-    start++;
-  } else {
-    MIB = BuildMI(&MBB, DL, get(Opc));
-  }
+  // if (have_metadata) {
+  //   MIB = BuildMI(&MBB, DL, get(Opc), Cond[start].getReg());
+  //   start++;
+  // } else {
+  //   MIB = BuildMI(&MBB, DL, get(Opc));
+  // }
 
   for (unsigned i = start; i < Cond.size(); ++i) {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << " working on " << i << "\n";
+      });
     if (Cond[i].isReg()) {
-      // The register in question could potentially be a
-      // subreg hi/lo of a 64-bit vreg
-      if (unsigned SubReg = Cond[i].getSubReg()) {
-	MIB.addReg(Cond[i].getReg(), 0, SubReg);
-      } else {
-	MIB.addReg(Cond[i].getReg());
-      }
+      // LLVM_DEBUG({
+      // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      // });
+      // MIB.addReg(Cond[i].getReg());
+      // LLVM_DEBUG({
+      // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      // });
+      MIB->addOperand(Cond[i]);
+      // if (Cond[i].isDef()) {
+      // 	// The register in question could potentially be a
+      // 	// subreg hi/lo of a 64-bit vreg
+      // 	if (unsigned SubReg = Cond[i].getSubReg()) {
+      // 	  MIB.addDef(Cond[i].getReg(), 0, SubReg);
+      // 	} else {
+      // 	  MIB.addDef(Cond[i].getReg());
+      // 	}
+      // } else {
+      // 	// The register in question could potentially be a
+      // 	// subreg hi/lo of a 64-bit vreg
+      // 	if (unsigned SubReg = Cond[i].getSubReg()) {
+      // 	  MIB.addReg(Cond[i].getReg(), 0, SubReg);
+      // 	} else {
+      // 	  MIB.addReg(Cond[i].getReg());
+      // 	}
+      // }
     } else if (Cond[i].isImm()) {
       MIB.addImm(Cond[i].getImm());
     } else if (Cond[i].isMetadata()) {
@@ -602,12 +667,12 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
 
   MIB.addMBB(TBB);
 
-  // add back remaining metadata
-  for (unsigned i = 0; i < Cond.size(); ++i) {
-     if (Cond[i].isMetadata()) {
-      MIB.addMetadata(Cond[i].getMetadata());
-     }
-  }
+  // // add back remaining metadata
+  // for (unsigned i = 0; i < Cond.size(); ++i) {
+  //    if (Cond[i].isMetadata()) {
+  //     MIB.addMetadata(Cond[i].getMetadata());
+  //    }
+  // }
 
   LLVM_DEBUG({
       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index b126c7d1e52e8..90c98a527082d 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -247,17 +247,17 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
   default:
     LLVM_DEBUG(dbgs() << "KO: Unknown SecondLastOpc\n");
     return false;
-  case DPU::MOVEri:
-    OpPrototype = OpriLimited;
-    OpJumpOpc = DPU::MOVErici;
-    OpNullJumpOpc = DPU::MOVErici; // should not be used
-    usableConditions = normalConditionsSet;
-    break;
-  case DPU::MOVErr:
-    OpPrototype = Oprr;
-    OpJumpOpc = DPU::MOVErrci;
-    OpNullJumpOpc = DPU::MOVErrci; // should not be used
-    usableConditions = normalConditionsSet;
+  // case DPU::MOVEri:
+  //   OpPrototype = OpriLimited;
+  //   OpJumpOpc = DPU::MOVErici;
+  //   OpNullJumpOpc = DPU::MOVErici; // should not be used
+  //   usableConditions = normalConditionsSet;
+  //   break;
+  // case DPU::MOVErr:
+  //   OpPrototype = Oprr;
+  //   OpJumpOpc = DPU::MOVErrci;
+  //   OpNullJumpOpc = DPU::MOVErrci; // should not be used
+  //   usableConditions = normalConditionsSet;
     break;
   case DPU::SUBrrr:
     OpPrototype = Oprrr;
@@ -669,6 +669,12 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
       return false;
     }
 
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
     int64_t actualCondition = ISD::SETTRUE2;
     MachineInstrBuilder ComboInst =
         BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(OpJumpOpc))
@@ -697,14 +703,33 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
     auto actualConditionOperand = MachineOperand::CreateImm(actualCondition);
     ComboInst.add(actualConditionOperand).add(LastInst->getOperand(0));
 
-    LLVM_DEBUG(dbgs() << "OK\n"; LastInst->dump(); SecondLastInst->dump(););
+    LLVM_DEBUG({
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
+      });
+    
     LastInst->eraseFromParent();
     SecondLastInst->eraseFromParent();
 
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+    
     return true;
   }
   case DPU::TmpJcci:
   case DPU::Jcci: {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: (if any)\n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
     bool isSourceCondition = false;
 
     if (SecondLastInst->getOperand(0).getReg() !=
@@ -771,7 +796,7 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
       // now. This can become an issue (unnecessary spilling)
       ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
                           InstrInfo.get(OpNullJumpOpc))
-                      .addReg(DPU::ZERO);
+	.addReg(DPU::ZERO);
     } else {
       if (!ImmCanBeEncodedOn8Bits) {
         LLVM_DEBUG(
@@ -804,14 +829,30 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
       break;
     }
 
-    LastInst->getOperand(0).setImm(actualCondition);
-    ComboInst.add(LastInst->getOperand(0))
+    // why modify the original instruction ???
+    // LastInst->getOperand(0).setImm(actualCondition);
+    // ComboInst.add(LastInst->getOperand(0))
+    //     .add(LastInst->getOperand(LastInst->getNumOperands() - 1));
+    ComboInst.addImm(actualCondition)
         .add(LastInst->getOperand(LastInst->getNumOperands() - 1));
 
-    LLVM_DEBUG(dbgs() << "OK\n"; LastInst->dump(); SecondLastInst->dump(););
+    LLVM_DEBUG({
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
+      });
+    
     LastInst->eraseFromParent();
     SecondLastInst->eraseFromParent();
 
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+    
     return true;
   }
   case DPU::Jcc:
@@ -831,14 +872,14 @@ bool DPUMergeComboInstrPass::runOnMachineFunction(MachineFunction &MF) {
   for (auto &MFI : MF) {
     MachineBasicBlock *MBB = &MFI;
 
-    LLVM_DEBUG(MBB->dump());
+    // LLVM_DEBUG(MBB->dump());
 
     bool local_change = mergeComboInstructionsInMBB(MBB, InstrInfo);
     if (local_change) {
-      LLVM_DEBUG({
-	  dbgs() << "\nchanged to:\n";
-	  MBB->dump();
-	});
+      // LLVM_DEBUG({
+      // 	  dbgs() << "\nchanged to:\n";
+      // 	  MBB->dump();
+      // 	});
       changeMade = true;
     }
   }
diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
index 8d55bca676900..ffdd77b035c2b 100644
--- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
@@ -2067,14 +2067,15 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   unsigned int LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned int LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  LLVMContext &Context = F->getFunction().getContext();
-  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  // LLVMContext &Context = F->getFunction().getContext();
+  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
   BuildMI(BB, dl, TII.get(MulLL), LLDest)
       .addReg(Op1)
       .addReg(Op2)
       .addImm(DPUAsmCondition::Small)
       .addMBB(fastMBB)
-    .addMetadata(N);
+    // .addMetadata(N)
+    ;
 
   // BuildMI(BB, dl, TII.get(DPU::MUL_UL_ULrrr), LLDest)
   //     .addReg(Op1)
@@ -2463,8 +2464,8 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned Undef2Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
-  LLVMContext &Context = F->getFunction().getContext();
-  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  // LLVMContext &Context = F->getFunction().getContext();
+  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
 
   // BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg)
   //     .addReg(Op1Reg, 0, DPU::sub_32bit);
@@ -2478,7 +2479,8 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
       .addReg(ShiftReg)
       .addImm(DPUAsmCondition::Condition::Shift32)
       .addMBB(bigShiftMBB)
-    .addMetadata(N);
+    // .addMetadata(N)
+    ;
 
   /// good, but
   // could increase quite a bit the code size
@@ -2750,8 +2752,8 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
       RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
-  LLVMContext &Context = F->getFunction().getContext();
-  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  // LLVMContext &Context = F->getFunction().getContext();
+  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
 
   BuildMI(BB, dl, TII.get(DPU::COPY), MsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
@@ -2761,7 +2763,8 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
       .addReg(ShiftReg)
       .addImm(DPUAsmCondition::Condition::Shift32)
       .addMBB(bigShiftMBB)
-    .addMetadata(N);
+    // .addMetadata(N)
+    ;
 
   // LLVMContext &Context = F->getFunction().getContext();
   // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
@@ -3232,14 +3235,15 @@ static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI,
   unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned LsbAddReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  LLVMContext &Context = F->getFunction().getContext();
-  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  // LLVMContext &Context = F->getFunction().getContext();
+  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
 
   BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
       .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
       .addImm(DPUAsmCondition::Condition::NotMaximum)
       .addMBB(endMBB)
-    .addMetadata(N);
+    // .addMetadata(N)
+    ;
 
   // BuildMI(BB, dl, TII.get(DPU::CLZ_Urr), FastResultReg)
   //   .addReg(Op1Reg, 0, DPU::sub_32bit_hi)

From 2d24393a5dc8e3bb86ace870c778db8abf72d534 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Mon, 22 Jul 2024 07:56:33 +0200
Subject: [PATCH 14/17] wip: allow this buggy one because CI/CD tests it ...

---
 llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index 90c98a527082d..d5575207c6234 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -660,7 +660,7 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
     // we morph the branch from unconditional to conditional
     // by this, we modify the CFG by creating artificially a fall through which is not declared
     // so, it's bugged
-    return false;
+    // return false; // FIXME(wip): known-buggy path deliberately re-enabled because CI/CD tests it
     // 
     
     if (!ImmCanBeEncodedOn8Bits) {

From bdbdeecca6a3cc953960b7b067710f37ad2934c9 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Mon, 22 Jul 2024 14:30:26 +0200
Subject: [PATCH 15/17] wip: have a go with renamable physical registers and
 the normal coalescer/copy propagation and other optimizers

---
 llvm/lib/Target/DPU/DPU.td              |   1 +
 llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp | 268 ++++++++++++------------
 llvm/lib/Target/DPU/DPUInstrInfo.cpp    | 176 ++++++++--------
 llvm/lib/Target/DPU/DPURegisterInfo.cpp |  43 ++--
 4 files changed, 251 insertions(+), 237 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPU.td b/llvm/lib/Target/DPU/DPU.td
index 65f22ee7312f9..e262860b24780 100644
--- a/llvm/lib/Target/DPU/DPU.td
+++ b/llvm/lib/Target/DPU/DPU.td
@@ -71,4 +71,5 @@ def DPU : Target {
   let AssemblyParsers = [DPUAsmParser];
   let AssemblyParserVariants = [DPUAsmParserVariant];
   let AssemblyWriters = [DPUInstPrinter];
+  let AllowRegisterRenaming = 1;
 }
diff --git a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
index d2365966a09a0..b08a71adb52a4 100644
--- a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
@@ -86,12 +86,12 @@ class DPUDAGToDAGISel : public SelectionDAGISel {
 
   bool IsGlobalAddrInImmediateSection(SDNode *Node) const;
 
-  void processFunctionAfterISel(MachineFunction &MF);
+  // void processFunctionAfterISel(MachineFunction &MF);
 
-  bool replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
-                                  const DPUInstrInfo *DII,
-                                  const TargetRegisterInfo *TRI,
-                                  const MachineInstr &MI);
+  // bool replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
+  //                                 const DPUInstrInfo *DII,
+  //                                 const TargetRegisterInfo *TRI,
+  //                                 const MachineInstr &MI);
 
   bool SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out);
 };
@@ -102,139 +102,139 @@ StringRef DPUDAGToDAGISel::getPassName() const { return "DPUDAGToDAGISel"; }
 bool DPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
 
-  processFunctionAfterISel(MF);
+  // processFunctionAfterISel(MF);
 
   return Ret;
 }
 
-void DPUDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
-  MachineRegisterInfo *MRI = &MF.getRegInfo();
-
-  auto &SubTarget = static_cast<const DPUSubtarget &>(MF.getSubtarget());
-  auto InstrInfo = SubTarget.getInstrInfo();
-  auto RegInfo = SubTarget.getRegisterInfo();
-
-  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
-       ++MFI)
-    for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
-      replaceUsesWithConstantReg(MRI, InstrInfo, RegInfo, *I);
-    }
-}
-
-static inline bool canCommuteOperation(MachineInstr *MI, unsigned opNo,
-                                       unsigned &newOpNo) {
-  switch (MI->getOpcode()) {
-  case DPU::ADDrrr:
-  case DPU::ANDrrr:
-  case DPU::ORrrr:
-  case DPU::XORrrr:
-    switch (opNo) {
-    case 1:
-      newOpNo = 2;
-      break;
-    case 2:
-      newOpNo = 1;
-      break;
-    default:
-      return false;
-    }
-
-    return true;
-  default:
-    return false;
-  }
-}
-
-bool DPUDAGToDAGISel::replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
-                                                 const DPUInstrInfo *DII,
-                                                 const TargetRegisterInfo *TRI,
-                                                 const MachineInstr &MI) {
-  // This function seems to do manual coalescing
-  //    probably we should use the proper one that probably knows better
-  //    maybe prob with MI operand constraint ... ?
-  //    probably better to educate the coalescer, or better define register class
-  unsigned DstReg = 0, CstReg = 0;
-
-  if (MI.getOpcode() == DPU::COPY) {
-    unsigned reg = MI.getOperand(1).getReg();
-
-    DstReg = MI.getOperand(0).getReg();
-    switch (reg) {
-    case DPU::ID:
-    case DPU::ID2:
-    case DPU::ID4:
-    case DPU::ID8:
-      CstReg = reg;
-      break;
-    default:
-      break;
-    }
-  } else if (((MI.getOpcode() == DPU::MOVErr) &&
-              (MI.getOperand(1).getReg() == DPU::ZERO)) ||
-             ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
-              (MI.getOperand(1).getImm() == 0))) {
-    DstReg = MI.getOperand(0).getReg();
-    CstReg = DPU::ZERO;
-  } else if (((MI.getOpcode() == DPU::MOVErr) &&
-              (MI.getOperand(1).getReg() == DPU::ONE)) ||
-             ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
-              (MI.getOperand(1).getImm() == 1))) {
-    DstReg = MI.getOperand(0).getReg();
-    CstReg = DPU::ONE;
-  } else if (((MI.getOpcode() == DPU::MOVErr) &&
-              (MI.getOperand(1).getReg() == DPU::LNEG)) ||
-             ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
-              (MI.getOperand(1).getImm() == -1))) {
-    DstReg = MI.getOperand(0).getReg();
-    CstReg = DPU::LNEG;
-  } else if (((MI.getOpcode() == DPU::MOVErr) &&
-              (MI.getOperand(1).getReg() == DPU::MNEG)) ||
-             ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
-              (MI.getOperand(1).getImm() == 0x8000000))) {
-    DstReg = MI.getOperand(0).getReg();
-    CstReg = DPU::MNEG;
-  }
-
-  if (!CstReg)
-    return false;
-
-  // Replace uses with CstReg.
-  for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
-                                         E = MRI->use_end();
-       U != E;) {
-    MachineOperand &MO = *U;
-    unsigned OpNo = U.getOperandNo();
-    MachineInstr *UMI = MO.getParent();
-    ++U;
-
-    // Do not replace if it is a phi's operand or is tied to def operand.
-    if (UMI->isPHI() || UMI->isRegTiedToDefOperand(OpNo) || UMI->isPseudo())
-      continue;
-
-    // Also, we have to check that the register class of the operand
-    // contains the constant register.
-    if (!UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(CstReg)) {
-      unsigned newOpNo;
-
-      if (canCommuteOperation(UMI, OpNo, newOpNo)) {
-        auto OtherReg = UMI->getOperand(newOpNo).getReg();
-
-        if (UMI->getRegClassConstraint(newOpNo, DII, TRI)->contains(CstReg) &&
-            (!Register::isPhysicalRegister(OtherReg) ||
-             UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(OtherReg))) {
-          UMI->getOperand(newOpNo).setReg(CstReg);
-          UMI->getOperand(OpNo).setReg(OtherReg);
-        }
-      }
-
-      continue;
-    }
-
-    MO.setReg(CstReg);
-  }
-
-  return true;
-}
+// void DPUDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+//   MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+//   auto &SubTarget = static_cast<const DPUSubtarget &>(MF.getSubtarget());
+//   auto InstrInfo = SubTarget.getInstrInfo();
+//   auto RegInfo = SubTarget.getRegisterInfo();
+
+//   for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
+//        ++MFI)
+//     for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+//       replaceUsesWithConstantReg(MRI, InstrInfo, RegInfo, *I);
+//     }
+// }
+
+// static inline bool canCommuteOperation(MachineInstr *MI, unsigned opNo,
+//                                        unsigned &newOpNo) {
+//   switch (MI->getOpcode()) {
+//   case DPU::ADDrrr:
+//   case DPU::ANDrrr:
+//   case DPU::ORrrr:
+//   case DPU::XORrrr:
+//     switch (opNo) {
+//     case 1:
+//       newOpNo = 2;
+//       break;
+//     case 2:
+//       newOpNo = 1;
+//       break;
+//     default:
+//       return false;
+//     }
+
+//     return true;
+//   default:
+//     return false;
+//   }
+// }
+
+// bool DPUDAGToDAGISel::replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
+//                                                  const DPUInstrInfo *DII,
+//                                                  const TargetRegisterInfo *TRI,
+//                                                  const MachineInstr &MI) {
+//   // This function appears to perform register coalescing by hand.
+//   //    We should probably use the generic coalescer instead, which likely knows better;
+//   //    possibly there is a problem with MI operand constraints?
+//   //    It is probably better to teach the coalescer, or to define the register classes better.
+//   unsigned DstReg = 0, CstReg = 0;
+
+//   if (MI.getOpcode() == DPU::COPY) {
+//     unsigned reg = MI.getOperand(1).getReg();
+
+//     DstReg = MI.getOperand(0).getReg();
+//     switch (reg) {
+//     case DPU::ID:
+//     case DPU::ID2:
+//     case DPU::ID4:
+//     case DPU::ID8:
+//       CstReg = reg;
+//       break;
+//     default:
+//       break;
+//     }
+//   } else if (((MI.getOpcode() == DPU::MOVErr) &&
+//               (MI.getOperand(1).getReg() == DPU::ZERO)) ||
+//              ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
+//               (MI.getOperand(1).getImm() == 0))) {
+//     DstReg = MI.getOperand(0).getReg();
+//     CstReg = DPU::ZERO;
+//   } else if (((MI.getOpcode() == DPU::MOVErr) &&
+//               (MI.getOperand(1).getReg() == DPU::ONE)) ||
+//              ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
+//               (MI.getOperand(1).getImm() == 1))) {
+//     DstReg = MI.getOperand(0).getReg();
+//     CstReg = DPU::ONE;
+//   } else if (((MI.getOpcode() == DPU::MOVErr) &&
+//               (MI.getOperand(1).getReg() == DPU::LNEG)) ||
+//              ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
+//               (MI.getOperand(1).getImm() == -1))) {
+//     DstReg = MI.getOperand(0).getReg();
+//     CstReg = DPU::LNEG;
+//   } else if (((MI.getOpcode() == DPU::MOVErr) &&
+//               (MI.getOperand(1).getReg() == DPU::MNEG)) ||
+//              ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
+//               (MI.getOperand(1).getImm() == 0x8000000))) {
+//     DstReg = MI.getOperand(0).getReg();
+//     CstReg = DPU::MNEG;
+//   }
+
+//   if (!CstReg)
+//     return false;
+
+//   // Replace uses with CstReg.
+//   for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
+//                                          E = MRI->use_end();
+//        U != E;) {
+//     MachineOperand &MO = *U;
+//     unsigned OpNo = U.getOperandNo();
+//     MachineInstr *UMI = MO.getParent();
+//     ++U;
+
+//     // Do not replace if it is a phi's operand or is tied to def operand.
+//     if (UMI->isPHI() || UMI->isRegTiedToDefOperand(OpNo) || UMI->isPseudo())
+//       continue;
+
+//     // Also, we have to check that the register class of the operand
+//     // contains the constant register.
+//     if (!UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(CstReg)) {
+//       unsigned newOpNo;
+
+//       if (canCommuteOperation(UMI, OpNo, newOpNo)) {
+//         auto OtherReg = UMI->getOperand(newOpNo).getReg();
+
+//         if (UMI->getRegClassConstraint(newOpNo, DII, TRI)->contains(CstReg) &&
+//             (!Register::isPhysicalRegister(OtherReg) ||
+//              UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(OtherReg))) {
+//           UMI->getOperand(newOpNo).setReg(CstReg);
+//           UMI->getOperand(OpNo).setReg(OtherReg);
+//         }
+//       }
+
+//       continue;
+//     }
+
+//     MO.setReg(CstReg);
+//   }
+
+//   return true;
+// }
 
 bool DPUDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
   if (auto FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index 5f8591a042380..d3e071f03ff7e 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -301,10 +301,10 @@ bool DPUInstrInfo::reverseBranchCondition(
 static void
 fetchUnconditionalBranchInfo(MachineInstr *Inst,
                              unsigned &targetBasicBlockOperandIndex) {
-  LLVM_DEBUG({
-      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-      dbgs() << "Inst "; Inst->dump();
-    });
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "Inst "; Inst->dump();
+  //   });
 
   switch (Inst->getOpcode()) {
   case DPU::JUMPi:
@@ -318,14 +318,14 @@ fetchUnconditionalBranchInfo(MachineInstr *Inst,
 static void fetchConditionalBranchInfo(MachineInstr *Inst,
                                        unsigned &targetBasicBlockOperandIndex,
                                        SmallVectorImpl<MachineOperand> &Cond) {
-  LLVM_DEBUG({
-      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-      dbgs() << "Inst "; Inst->dump();
-      dbgs() << "Cond.size() " << Cond.size() << "\n";
-      for (unsigned i = 0; i < Cond.size(); ++i) {
-	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-      }
-    });
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "Inst "; Inst->dump();
+  //     dbgs() << "Cond.size() " << Cond.size() << "\n";
+  //     for (unsigned i = 0; i < Cond.size(); ++i) {
+  // 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+  //     }
+  //   });
   
   unsigned Opc = Inst->getOpcode();
   Cond.push_back(MachineOperand::CreateImm(Opc));
@@ -346,20 +346,20 @@ static void fetchConditionalBranchInfo(MachineInstr *Inst,
   unsigned int NumOp = Inst->getNumExplicitOperands();
   // unsigned int NumOp = Inst->getNumOperands();
 
-  LLVM_DEBUG({
-      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-      dbgs() << "NumOp " << NumOp << "\n";
-    });
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "NumOp " << NumOp << "\n";
+  //   });
   for (unsigned int eachOperandIndex = 0; eachOperandIndex < NumOp;
        eachOperandIndex++) {
-    LLVM_DEBUG({
-	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-	dbgs() << "operand " << eachOperandIndex << ": ";
-      });
+    // LLVM_DEBUG({
+    // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    // 	dbgs() << "operand " << eachOperandIndex << ": ";
+    //   });
     MachineOperand &operand = Inst->getOperand(eachOperandIndex);
-    LLVM_DEBUG({
-	operand.dump();
-      });
+    // LLVM_DEBUG({
+    // 	operand.dump();
+    //   });
     if (operand.isMBB()) {
       targetBasicBlockOperandIndex = eachOperandIndex;
     } else {
@@ -373,14 +373,14 @@ static void fetchConditionalBranchInfo(MachineInstr *Inst,
   //   }
   // }
   
-  LLVM_DEBUG({
-      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-      dbgs() << "Inst "; Inst->dump();
-      dbgs() << "Cond.size() " << Cond.size() << "\n";
-      for (unsigned i = 0; i < Cond.size(); ++i) {
-	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-      }
-    });
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "Inst "; Inst->dump();
+  //     dbgs() << "Cond.size() " << Cond.size() << "\n";
+  //     for (unsigned i = 0; i < Cond.size(); ++i) {
+  // 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+  //     }
+  //   });
 }
 
 static inline bool isAnalyzableBranch(MachineInstr *Inst) {
@@ -393,13 +393,13 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                  SmallVectorImpl<MachineOperand> &Cond,
                                  bool AllowModify) const {
 
-  LLVM_DEBUG({
-      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-      dbgs() << "MBB "; MBB.dump();
-      for (unsigned i = 0; i < Cond.size(); ++i) {
-	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-      }
-    });
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "MBB "; MBB.dump();
+  //     for (unsigned i = 0; i < Cond.size(); ++i) {
+  // 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+  //     }
+  //   });
   
   MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
 
@@ -456,15 +456,15 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     if (LastInst->isConditionalBranch()) {
       unsigned int TBBOpIdx;
       fetchConditionalBranchInfo(LastInst, TBBOpIdx, Cond);
-      LLVM_DEBUG({
-	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-	  dbgs() << "MBB "; MBB.dump();
-	  dbgs() << "LastInst "; LastInst->dump();
-	  dbgs() << "TBBOpIdx " << TBBOpIdx << "\n";
-	  for (unsigned i = 0; i < Cond.size(); ++i) {
-	    dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-	  }
-	});
+      // LLVM_DEBUG({
+      // 	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      // 	  dbgs() << "MBB "; MBB.dump();
+      // 	  dbgs() << "LastInst "; LastInst->dump();
+      // 	  dbgs() << "TBBOpIdx " << TBBOpIdx << "\n";
+      // 	  for (unsigned i = 0; i < Cond.size(); ++i) {
+      // 	    dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      // 	  }
+      // 	});
       TBB = LastInst->getOperand(TBBOpIdx).getMBB();
       return false;
     }
@@ -507,17 +507,17 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     fetchConditionalBranchInfo(SecondLastInst, TBBOpIdx, Cond);
     TBB = SecondLastInst->getOperand(TBBOpIdx).getMBB();
     FBB = LastInst->getOperand(FTBBOpIdx).getMBB();
-    LLVM_DEBUG({
-	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-	dbgs() << "MBB "; MBB.dump();
-	dbgs() << "LastInst "; LastInst->dump();
-	dbgs() << "SecondLastInst "; SecondLastInst->dump();
-	dbgs() << "TBBOpIdx " << TBBOpIdx << "\n";
-
-	for (unsigned i = 0; i < Cond.size(); ++i) {
-	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-	}
-      });
+    // LLVM_DEBUG({
+    // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    // 	dbgs() << "MBB "; MBB.dump();
+    // 	dbgs() << "LastInst "; LastInst->dump();
+    // 	dbgs() << "SecondLastInst "; SecondLastInst->dump();
+    // 	dbgs() << "TBBOpIdx " << TBBOpIdx << "\n";
+
+    // 	for (unsigned i = 0; i < Cond.size(); ++i) {
+    // 	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+    // 	}
+    //   });
     return false;
   }
 
@@ -527,10 +527,10 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
 
 unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                     int *BytesRemoved) const {
-  LLVM_DEBUG({
-      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-      dbgs() << "MBB "; MBB.dump();
-    });
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "MBB "; MBB.dump();
+  //   });
   MachineBasicBlock::iterator I = MBB.end();
   unsigned Count = 0;
 
@@ -556,17 +556,17 @@ unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
 void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock *TBB, DebugLoc DL,
                                           ArrayRef<MachineOperand> Cond) const {
-  LLVM_DEBUG({
-      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-      dbgs() << "MBB "; MBB.dump();
-      for (unsigned i = 0; i < Cond.size(); ++i) {
-	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-	  if (Cond[i].isReg()) {
-	    dbgs() << "Cond[" << i << "] isUse " << Cond[i].isUse() << "\n";
-	    dbgs() << "Cond[" << i << "] isDef " << Cond[i].isDef() << "\n";
-	  }
-      }
-    });
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "MBB "; MBB.dump();
+  //     for (unsigned i = 0; i < Cond.size(); ++i) {
+  // 	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+  // 	  if (Cond[i].isReg()) {
+  // 	    dbgs() << "Cond[" << i << "] isUse " << Cond[i].isUse() << "\n";
+  // 	    dbgs() << "Cond[" << i << "] isDef " << Cond[i].isDef() << "\n";
+  // 	  }
+  //     }
+  //   });
 
   // LLVM_DEBUG({
   //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
@@ -626,10 +626,10 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
   // }
 
   for (unsigned i = start; i < Cond.size(); ++i) {
-    LLVM_DEBUG({
-	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-	dbgs() << " working on " << i << "\n";
-      });
+    // LLVM_DEBUG({
+    // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    // 	dbgs() << " working on " << i << "\n";
+    //   });
     if (Cond[i].isReg()) {
       // LLVM_DEBUG({
       // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
@@ -674,10 +674,10 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
   //    }
   // }
 
-  LLVM_DEBUG({
-      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-      dbgs() << "MIB "; MIB->dump();
-    });
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "MIB "; MIB->dump();
+  //   });
 }
 
 unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
@@ -685,13 +685,13 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *FBB,
                                     ArrayRef<MachineOperand> Cond,
                                     const DebugLoc &DL, int *BytesAdded) const {
-  LLVM_DEBUG({
-      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-      dbgs() << "MBB "; MBB.dump();
-      for (unsigned i = 0; i < Cond.size(); ++i) {
-	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-      }
-    });
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "MBB "; MBB.dump();
+  //     for (unsigned i = 0; i < Cond.size(); ++i) {
+  // 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+  //     }
+  //   });
   unsigned nrOfInsertedMachineInstr = 0;
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.cpp b/llvm/lib/Target/DPU/DPURegisterInfo.cpp
index ad7db1f538ada..705b05ca0e746 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.cpp
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.cpp
@@ -50,17 +50,30 @@ DPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
 BitVector DPURegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector reserved = BitVector(getNumRegs());
-  reserved.set(DPU::D22);
-  reserved.set(DPU::R22);
-  reserved.set(DPU::R23);
-  reserved.set(DPU::ZERO);
-  reserved.set(DPU::ONE);
-  reserved.set(DPU::LNEG);
-  reserved.set(DPU::MNEG);
-  reserved.set(DPU::ID);
-  reserved.set(DPU::ID2);
-  reserved.set(DPU::ID4);
-  reserved.set(DPU::ID8);
+  
+  markSuperRegs(reserved, DPU::D22);
+  markSuperRegs(reserved, DPU::R22);
+  markSuperRegs(reserved, DPU::R23);
+  markSuperRegs(reserved, DPU::ZERO);
+  markSuperRegs(reserved, DPU::ONE);
+  markSuperRegs(reserved, DPU::LNEG);
+  markSuperRegs(reserved, DPU::MNEG);
+  markSuperRegs(reserved, DPU::ID);
+  markSuperRegs(reserved, DPU::ID2);
+  markSuperRegs(reserved, DPU::ID4);
+  markSuperRegs(reserved, DPU::ID8);
+  assert(checkAllSuperRegsMarked(reserved));
+  // reserved.set(DPU::D22);
+  // reserved.set(DPU::R22);
+  // reserved.set(DPU::R23);
+  // reserved.set(DPU::ZERO);
+  // reserved.set(DPU::ONE);
+  // reserved.set(DPU::LNEG);
+  // reserved.set(DPU::MNEG);
+  // reserved.set(DPU::ID);
+  // reserved.set(DPU::ID2);
+  // reserved.set(DPU::ID4);
+  // reserved.set(DPU::ID8);
   // reserved.set(DPU::MAJ_D22);
   // reserved.set(DPU::MAJ_R22);
   // reserved.set(DPU::MAJ_R23);
@@ -176,10 +189,10 @@ bool DPURegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
   case DPU::ONE:
   case DPU::LNEG:
   case DPU::MNEG:
-  // case DPU::ID:
-  // case DPU::ID2:
-  // case DPU::ID4:
-  // case DPU::ID8:
+  case DPU::ID:
+  case DPU::ID2:
+  case DPU::ID4:
+  case DPU::ID8:
     return true;
   }
 }

From bbb5895151ea891aefabff940a2baefdf3cc7cfd Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Mon, 29 Jul 2024 08:22:05 +0200
Subject: [PATCH 16/17] dpu: compiler-rt: add support

---
 compiler-rt/dpu/CMakeLists.txt                | 306 ++++++++++++++++++
 compiler-rt/dpu/Toolchain.cmake               |  12 +
 compiler-rt/dpu/compiler_rt_tests.sh          | 265 +++++++++++++++
 compiler-rt/dpu/lldb_python.py                |  42 +++
 compiler-rt/lib/builtins/dpu/div32.c          |  97 ++++++
 compiler-rt/lib/builtins/dpu/divdi3.c         |  31 ++
 compiler-rt/lib/builtins/dpu/divsi3.c         |  23 ++
 compiler-rt/lib/builtins/dpu/moddi3.c         |  31 ++
 compiler-rt/lib/builtins/dpu/modsi3.c         |  34 ++
 compiler-rt/lib/builtins/dpu/mul32.S          |  48 +++
 compiler-rt/lib/builtins/dpu/mul32.c          |  59 ++++
 compiler-rt/lib/builtins/dpu/muldi3.c         | 171 ++++++++++
 compiler-rt/lib/builtins/dpu/mulsi3.c         |   8 +
 compiler-rt/lib/builtins/dpu/udiv32.S         |  49 +++
 compiler-rt/lib/builtins/dpu/udiv32.c         |  63 ++++
 compiler-rt/lib/builtins/dpu/udiv64.c         |  59 ++++
 compiler-rt/lib/builtins/dpu/udivdi3.c        |  19 ++
 compiler-rt/lib/builtins/dpu/udivmodsi4.c     |  29 ++
 compiler-rt/lib/builtins/dpu/udivsi3.c        |  15 +
 compiler-rt/lib/builtins/dpu/umoddi3.c        |  19 ++
 compiler-rt/lib/builtins/dpu/umodsi3.c        |  27 ++
 .../test/builtins/Unit/comparedf2_test.c      |   2 +-
 .../test/builtins/Unit/comparesf2_test.c      |   2 +-
 compiler-rt/test/builtins/Unit/test.c         |  13 +
 24 files changed, 1422 insertions(+), 2 deletions(-)
 create mode 100644 compiler-rt/dpu/CMakeLists.txt
 create mode 100644 compiler-rt/dpu/Toolchain.cmake
 create mode 100644 compiler-rt/dpu/compiler_rt_tests.sh
 create mode 100644 compiler-rt/dpu/lldb_python.py
 create mode 100644 compiler-rt/lib/builtins/dpu/div32.c
 create mode 100644 compiler-rt/lib/builtins/dpu/divdi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/divsi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/moddi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/modsi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/mul32.S
 create mode 100644 compiler-rt/lib/builtins/dpu/mul32.c
 create mode 100644 compiler-rt/lib/builtins/dpu/muldi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/mulsi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udiv32.S
 create mode 100644 compiler-rt/lib/builtins/dpu/udiv32.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udiv64.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udivdi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udivmodsi4.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udivsi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/umoddi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/umodsi3.c
 create mode 100644 compiler-rt/test/builtins/Unit/test.c

diff --git a/compiler-rt/dpu/CMakeLists.txt b/compiler-rt/dpu/CMakeLists.txt
new file mode 100644
index 0000000000000..19e3c2790baf1
--- /dev/null
+++ b/compiler-rt/dpu/CMakeLists.txt
@@ -0,0 +1,306 @@
+cmake_minimum_required(VERSION 3.13)
+# Builds the DPU flavours of the compiler-rt builtins as static libraries.
+project(librt C ASM)
+# Use the LLVM binutils, referenced by bare tool name.
+set(CMAKE_AR llvm-ar)
+set(CMAKE_LINKER llvm-ld)  # NOTE(review): upstream LLVM no longer ships llvm-ld; confirm the DPU toolchain provides it
+set(CMAKE_NM llvm-nm)
+set(CMAKE_OBJDUMP llvm-objdump)
+set(CMAKE_RANLIB llvm-ranlib)
+set(OBJCOPY llvm-objcopy)
+set(CLANGFORMAT clang-format)
+# Location of the builtins sources, relative to this directory (compiler-rt/dpu).
+set(COMPILER_RT_BUILTINS_DIR ../lib/builtins)
+
+set(GENERIC_SOURCES  # the sources that end up in SOURCES, i.e. in every library variant
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/mul32.S
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/mulsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/muldi3.c
+  # DPU-specific 32-bit division and modulo
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.S
+  # ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.c is superseded by the optimized udiv32.S above
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/div32.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/divsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/modsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/umodsi3.c
+  # DPU-specific 64-bit division and modulo
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv64.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/divdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/moddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/umoddi3.c
+  # Generic compiler-rt builtins. NOTE(review): divdi3.c, divsi3.c, moddi3.c, modsi3.c, muldi3.c, udivdi3.c, udivmodsi4.c, udivsi3.c, umoddi3.c and umodsi3.c are listed both here and under dpu/ above; presumably the DPU versions are meant to win at link time - confirm.
+  ${COMPILER_RT_BUILTINS_DIR}/absvdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/absvsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/adddf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashldi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashrdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/bswapdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/bswapsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/cmpdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparedf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparesf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/divdf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmoddi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendsfdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendhfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffsdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffssi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatdidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatdisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatundidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatundisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/fp_mode.c
+  ${COMPILER_RT_BUILTINS_DIR}/int_util.c
+  ${COMPILER_RT_BUILTINS_DIR}/lshrdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/moddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/modsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulodi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulosi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/negdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/paritydi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/paritysi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powidf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powisf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/subdf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncdfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncdfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncsfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ucmpdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmoddi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umoddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umodsi3.c
+  )
+
+set(GENERIC_TF_SOURCES  # currently unused: its entry in SOURCES is commented out
+  ${COMPILER_RT_BUILTINS_DIR}/addtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/absvti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashrti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparetf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/cmpti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/divtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmodti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/extenddftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendhftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendsftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffsti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatditf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunditf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/lshrti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/modti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muloti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/multf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/multi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/negti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powitf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/subtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ucmpti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmodti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umodti3.c
+  )
+
+set(GENERIC_COMPLEX_SOURCES  # currently unused: not referenced by SOURCES below
+  ${COMPILER_RT_BUILTINS_DIR}/divdc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsc3.c
+  )
+# Currently unused as well. Repeats the four GENERIC_COMPLEX_SOURCES entries and adds divtc3.c and multc3.c.
+set(GENERIC_COMPLEX_TF_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/divdc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divtc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/multc3.c
+  )
+# Only GENERIC_SOURCES is built; the other groups are commented out.
+set(SOURCES ${GENERIC_SOURCES}
+  # ${GENERIC_TF_SOURCES}
+  # ${GENERIC_COMPLEX_SOURCES}
+  # ${GENERIC_COMPLEX_TF_SOURCES}
+  )
+
+function(add_dpu_library)  # add_dpu_library(TARGET <name> [OPT_LEVEL <flag>] [LTO <flag>] [PROFILING] SOURCES <files>...)
+  set(options PROFILING)
+  set(oneValueArgs TARGET OPT_LEVEL LTO)
+  set(multiValueArgs SOURCES)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  # Debug trace of the raw arguments and of the accepted keywords.
+  message("ARGN: ${ARGN}")
+
+  message(${options})
+  message(${oneValueArgs})
+  message(${multiValueArgs})
+  # Debug trace of the parsed values.
+  message("TARGET: ${arg_TARGET}")
+  message("OPT_LEVEL: ${arg_OPT_LEVEL}")
+  message("PROFILING: ${arg_PROFILING}")
+  message("LTO: ${arg_LTO}")
+  message("LTO_TYPE: ${arg_LTO_TYPE}")
+  # NOTE(review): LTO_TYPE is not one of the keywords declared above, so cmake_parse_arguments never sets arg_LTO_TYPE.
+  set(LOCAL_TARGET ${arg_TARGET})
+  # Warning flags shared by every variant; per-variant flags are appended below.
+  set(OTHER_FLAGS)
+  list(APPEND OTHER_FLAGS -Wall)
+  list(APPEND OTHER_FLAGS -Wextra)
+  # Each option is added to the flags and, stripped of "-", "-f" and "=", to the target name (e.g. rt_O2_ltothin).
+  if (arg_OPT_LEVEL)
+    list(APPEND OTHER_FLAGS ${arg_OPT_LEVEL})
+    string(REPLACE "-" "" arg_OPT_LEVEL ${arg_OPT_LEVEL})
+    string(APPEND LOCAL_TARGET "_${arg_OPT_LEVEL}")
+  endif()
+  if (arg_LTO)
+    list(APPEND OTHER_FLAGS ${arg_LTO})
+    string(REPLACE "-f" "" arg_LTO ${arg_LTO})
+    string(REPLACE "=" "" arg_LTO ${arg_LTO})
+    string(APPEND LOCAL_TARGET "_${arg_LTO}")
+  else()  # no LTO: the separator is still appended, so the name ends in "_" (e.g. rt_O2_)
+    string(APPEND LOCAL_TARGET "_")
+  endif()
+  if (arg_PROFILING)
+    list(APPEND OTHER_FLAGS -pg)
+    string(APPEND LOCAL_TARGET "_pg")
+  endif()
+  # Flags applied to every variant regardless of the options above.
+  list(APPEND OTHER_FLAGS -g0)
+  list(APPEND OTHER_FLAGS -mllvm -verify-machineinstrs)
+  # list(APPEND OTHER_FLAGS -mllvm -debug) --> deduped (presumably the repeated -mllvm is dropped by CMake's option de-duplication)
+
+  message("LOCAL_TARGET: ${LOCAL_TARGET}")
+  message("OTHER_FLAGS: ${OTHER_FLAGS}")
+
+  add_library(${LOCAL_TARGET} STATIC "${arg_SOURCES}")
+
+  target_include_directories(${LOCAL_TARGET} PRIVATE
+    ${COMPILER_RT_BUILTINS_DIR}
+    ${COMPILER_RT_BUILTINS_DIR}/dpu)
+  # NOSTDLIB_FLAGS, STRICT_FLAGS and COMPILER_TIMESTAMP_DEF are not set in this file; they are empty unless defined elsewhere.
+  target_compile_options(${LOCAL_TARGET}
+    PRIVATE ${NOSTDLIB_FLAGS} ${STRICT_FLAGS} ${COMPILER_TIMESTAMP_DEF} ${OTHER_FLAGS})
+
+  # set_target_properties(${LOCAL_TARGET} PROPERTIES OUTPUT_NAME "rt")
+  # Install layout: <opt level>/<lto kind, or no_lto>, using the stripped names computed above (e.g. O2/ltothin).
+  if (arg_LTO)
+    install(
+      TARGETS ${LOCAL_TARGET}
+      ARCHIVE
+      DESTINATION ${arg_OPT_LEVEL}/${arg_LTO}
+      )
+  else()
+    install(
+      TARGETS ${LOCAL_TARGET}
+      ARCHIVE
+      DESTINATION ${arg_OPT_LEVEL}/no_lto
+      )
+  endif()
+endfunction()
+
+# add_dpu_library(
+#     TARGET rt
+#     OPT_LEVEL -O3
+#     # LTO -flto
+#     # PROFILING
+#     SOURCES ${SOURCES}
+#     )
+  
+foreach(OPT_LEVEL -O0;-O1;-O2;-O3;-Os)  # for every optimization level: one library without LTO ...
+  add_dpu_library(
+    TARGET rt
+    OPT_LEVEL ${OPT_LEVEL}
+    SOURCES ${SOURCES}
+    )
+  # add_dpu_library(
+  #   TARGET rt
+  #   OPT_LEVEL ${OPT_LEVEL}
+  #   PROFILING
+  #   SOURCES ${SOURCES}
+  #   )
+  foreach(LTO -flto;-flto=thin)  # ... plus one library per LTO mode
+    add_dpu_library(
+      TARGET rt
+      OPT_LEVEL ${OPT_LEVEL}
+      LTO ${LTO}
+      SOURCES ${SOURCES}
+      )
+    # add_dpu_library(
+    #   TARGET rt
+    #   OPT_LEVEL ${OPT_LEVEL}
+    #   LTO ${LTO}
+    #   PROFILING
+    #   SOURCES ${SOURCES}
+    #   )
+  endforeach()
+endforeach()
diff --git a/compiler-rt/dpu/Toolchain.cmake b/compiler-rt/dpu/Toolchain.cmake
new file mode 100644
index 0000000000000..ae09a95e9b705
--- /dev/null
+++ b/compiler-rt/dpu/Toolchain.cmake
@@ -0,0 +1,12 @@
+include(CMakeForceCompiler)
+# NOTE(review): CMakeForceCompiler is deprecated in recent CMake releases and none of its macros are used in this file.
+# set(CMAKE_ASM_SOURCE_FILE_EXTENSIONS s;S;asm)
+# Cross build for a "Generic" system; assembly, C and C++ all go through dpu-clang.
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_CROSSCOMPILING 1)
+set(CMAKE_ASM_COMPILER dpu-clang)
+set(CMAKE_C_COMPILER dpu-clang)
+set(CMAKE_CXX_COMPILER dpu-clang)
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)  # compiler checks build a static library instead of linking an executable
+set(CMAKE_C_COMPILER_WORKS 1)  # declare the C compiler as working
+set(CMAKE_CXX_COMPILER_WORKS 1)  # declare the C++ compiler as working
diff --git a/compiler-rt/dpu/compiler_rt_tests.sh b/compiler-rt/dpu/compiler_rt_tests.sh
new file mode 100644
index 0000000000000..350c9ce6fb7e1
--- /dev/null
+++ b/compiler-rt/dpu/compiler_rt_tests.sh
@@ -0,0 +1,265 @@
+#!/bin/bash
+# Builds the compiler-rt builtins unit tests for the DPU target and runs each of them.
+set -eux
+# Source locations. Both can be overridden from the environment; the defaults are the original hard-coded paths.
+COMPILER_RT=${COMPILER_RT:-~/work/dpu_tools_fix_64bit_reg/llvm-project/compiler-rt/lib/builtins}
+COMPILER_RT_TESTS=${COMPILER_RT_TESTS:-~/work/dpu_tools_fix_64bit_reg/llvm-project/compiler-rt/test/builtins/Unit}
+
+# Tests that are not supported on the DPU target (kept for reference, never run):
+# declare -a TESTS_=(
+    # absvti2_test.c
+    # adddf3vfp_test.c
+    # addsf3vfp_test.c
+    # addtf3_test.c
+    # addvti3_test.c
+    # ashlti3_test.c
+    # ashrti3_test.c
+    # clzti2_test.c
+    # cmpti2_test.c
+    # compiler_rt_logb_test.c
+    # compiler_rt_logbf_test.c
+    # compiler_rt_logbl_test.c
+    # ctzti2_test.c
+    # divdc3_test.c
+    # divdf3vfp_test.c
+    # divmodti4_test.c
+    # divsf3vfp_test.c
+    # divsc3_test.c
+    # divtc3_test.c
+    # divtf3_test.c
+    # divti3_test.c
+    # divxc3_test.c
+    # eqdf2vfp_test.c
+    # eqsf2vfp_test.c
+    # eqtf2_test.c
+    # extenddftf2_test.c
+    # extendhftf2_test.c
+    # extendsfdf2vfp_test.c
+    # extendsftf2_test.c
+    # ffsti2_test.c
+    # fixdfsivfp_test.c
+    # fixdfti_test.c
+    # fixsfsivfp_test.c
+    # fixsfti_test.c
+    # fixtfti_test.c
+    # fixunsdfsivfp_test.c
+    # fixunsdfti_test.c
+    # fixunssfsivfp_test.c
+    # fixunssfti_test.c
+    # floatditf_test.c
+    # floatsidfvfp_test.c
+    # floatsisfvfp_test.c
+    # floatunditf_test.c
+    # floatunssidfvfp_test.c
+    # floatunssisfvfp_test.c
+    # muldc3_test.c
+    # ltdf2vfp_test.c
+    # ltsf2vfp_test.c
+    # gedf2vfp_test.c
+    # gesf2vfp_test.c
+    # gtdf2vfp_test.c
+    # gtsf2vfp_test.c
+    # ledf2vfp_test.c
+    # lesf2vfp_test.c
+    # muldf3vfp_test.c
+    # mulsf3vfp_test.c
+    # nedf2vfp_test.c
+    # negdf2vfp_test.c
+    # negsf2vfp_test.c
+    # nesf2vfp_test.c
+    # subdf3vfp_test.c
+    # subsf3vfp_test.c
+    # truncdfsf2vfp_test.c
+    # unorddf2vfp_test.c
+    # unordsf2vfp_test.c
+    # mulsc3_test.c
+    # mulxc3_test.c
+    # powixf2_test.c
+    # subvti3_test.c
+    # ucmpti2_test.c
+    # udivmodti4_test.c
+    # udivti3_test.c
+    # umodti3_test.c
+    # subtf3_test.c
+    # powitf2_test.c
+    # negvti2_test.c
+    # modti3_test.c
+    # muloti4_test.c
+    # multc3_test.c
+    # multi3_test.c
+    # mulvti3_test.c
+    # negti2_test.c
+    # netf2_test.c
+    # parityti2_test.c
+    # popcountti2_test.c
+    # fixtfdi_test.c
+    # fixtfsi_test.c
+    # fixunstfdi_test.c
+    # fixunstfsi_test.c
+    # fixunstfti_test.c
+    # fixunsxfdi_test.c
+    # fixunsxfsi_test.c
+    # fixunsxfti_test.c
+    # fixxfti_test.c
+    # floatdixf_test.c
+    # floatsitf_test.c
+    # floattidf_test.c
+    # floattisf_test.c
+    # floattitf_test.c
+    # floattixf_test.c
+    # floatundixf_test.c
+    # floatunsitf_test.c
+    # floatuntidf_test.c
+    # floatuntisf_test.c
+    # floatuntitf_test.c
+    # floatuntixf_test.c
+    # getf2_test.c
+    # gttf2_test.c
+    # letf2_test.c
+    # lshrti3_test.c
+    # lttf2_test.c
+    # multf3_test.c
+    # unordtf2_test.c
+    # trunctfdf2_test.c
+    # trunctfhf2_test.c
+    # trunctfsf2_test.c
+    # fixxfdi_test.c
+    # udivmoddi4_test.c # too big :)
+# )
+
+declare -a TESTS=(  # test sources, relative to COMPILER_RT_TESTS, that are compiled and run
+    test.c
+    absvdi2_test.c
+    absvsi2_test.c
+    addvdi3_test.c
+    addvsi3_test.c
+    ashldi3_test.c
+    ashrdi3_test.c
+    bswapdi2_test.c
+    bswapsi2_test.c
+    clzdi2_test.c
+    clzsi2_test.c
+    cmpdi2_test.c
+    comparedf2_test.c
+    comparesf2_test.c
+    ctzdi2_test.c
+    ctzsi2_test.c
+    divdf3_test.c
+    divdi3_test.c
+    divmodsi4_test.c
+    divsf3_test.c
+    divsi3_test.c
+    extendhfsf2_test.c
+    ffsdi2_test.c
+    ffssi2_test.c
+    fixdfdi_test.c
+    fixsfdi_test.c
+    fixunsdfdi_test.c
+    fixunsdfsi_test.c
+    fixunssfdi_test.c
+    fixunssfsi_test.c
+    floatdidf_test.c
+    floatdisf_test.c
+    floatundidf_test.c
+    floatundisf_test.c
+    lshrdi3_test.c
+    moddi3_test.c
+    modsi3_test.c
+    muldi3_test.c
+    mulodi4_test.c
+    mulosi4_test.c
+    mulsi3_test.c
+    mulvdi3_test.c
+    mulvsi3_test.c
+    negdi2_test.c
+    negvdi2_test.c
+    negvsi2_test.c
+    paritydi2_test.c
+    paritysi2_test.c
+    popcountdi2_test.c
+    popcountsi2_test.c
+    powidf2_test.c
+    powisf2_test.c
+    subvdi3_test.c
+    subvsi3_test.c
+    truncdfhf2_test.c
+    truncdfsf2_test.c
+    truncsfhf2_test.c
+    ucmpdi2_test.c
+    udivdi3_test.c
+    udivmodsi4_test.c
+    udivsi3_test.c
+    umoddi3_test.c
+    umodsi3_test.c
+)
+
+declare -a OPT_LEVELS=(  # optimization levels to test; each one is passed to clang as -<level>
+    O0
+    # O1
+    # O2
+    # O3
+    # Os
+)
+# Link modes to test; the loop below maps each one to a compiler flag and a library-name suffix.
+declare -a COMPILER_OPTIONS=(
+    no_lto
+    # lto
+    # ltothin
+)
+
+MYPWD=`pwd`
+# Everything is built under ./test/<link mode>/<opt level>/, next to the directory the script was started from.
+mkdir -p test
+cd test
+
+for COMPILER_OPTION in "${COMPILER_OPTIONS[@]}"
+do
+    mkdir -p ${COMPILER_OPTION}
+    cd ${COMPILER_OPTION}
+    # Compiler flag selecting this link mode.
+    case "$COMPILER_OPTION" in
+	"no_lto") COMPILER_OPTION_="";;
+	"lto") COMPILER_OPTION_="-flto";;
+	"ltothin") COMPILER_OPTION_="-flto=thin";;
+    esac
+    # Suffix of the matching runtime library, linked below as -lrt_<opt level>_<suffix>.
+    case "$COMPILER_OPTION" in
+	"no_lto") COMPILER_OPTION_LIB="";;
+	"lto") COMPILER_OPTION_LIB="lto";;
+	"ltothin") COMPILER_OPTION_LIB="ltothin";;
+    esac
+
+    for OPT_LEVEL in "${OPT_LEVELS[@]}"
+    do
+	mkdir -p ${OPT_LEVEL}
+	cd ${OPT_LEVEL}
+	# Compile every test against the library found in ${MYPWD}/install/<opt level>/<link mode>/, then run it.
+	for TEST in "${TESTS[@]}"
+	do
+	    clang --target=dpu-upmem-dpurte -mcpu=v1A \
+		  -I${COMPILER_RT} \
+		  -g0 \
+		  -v \
+		  -save-temps \
+		  -I ${MYPWD} \
+		  ${COMPILER_OPTION_} \
+		  -${OPT_LEVEL} \
+		  ${COMPILER_RT_TESTS}/${TEST} \
+		  -o $(basename "${TEST}" .c) \
+		  -L ${MYPWD}/install/${OPT_LEVEL}/${COMPILER_OPTION}/ -lrt_${OPT_LEVEL}_${COMPILER_OPTION_LIB} \
+		&> `basename ${TEST}`_compiler_log.txt
+	    # Runner script: override with LLDB_RUNNER; the default is the original hard-coded location.
+	    # dpu-lldb --batch --one-line run -- $(basename "${TEST}" .c)
+	    python3 ${LLDB_RUNNER:-~/work/simple_examples/lldb_python.py} $(basename "${TEST}" .c)
+	done
+	cd ..
+    done
+
+    cd ..
+done
+cd ..
+# Alternative options for the clang command above, kept for reference:
+		  # -L  ~/scratch/dpu_tools/share/upmem/include/built-in/v1A -lrt_v1A \
+		  # -save-temps \
+		  # -mllvm -debug -mllvm -print-after-all -mllvm -verify-machineinstrs \
+		  # --thinlto-jobs=1
diff --git a/compiler-rt/dpu/lldb_python.py b/compiler-rt/dpu/lldb_python.py
new file mode 100644
index 0000000000000..e333723af601e
--- /dev/null
+++ b/compiler-rt/dpu/lldb_python.py
@@ -0,0 +1,42 @@
+import sys
+import os
+import subprocess
+import dpu
+import lldb
+import tempfile
+
+binary = sys.argv[1]
+# Synchronous mode, so that Launch() below returns only once the process has stopped.
+debugger = lldb.SBDebugger().Create()
+debugger.SetAsync(False)
+
+target = debugger.CreateTarget(binary)
+if not target.IsValid():
+    sys.exit(f"{binary}: cannot create an LLDB target")
+
+launch_info = lldb.SBLaunchInfo(None)
+launch_info.SetWorkingDirectory(os.getcwd())
+# Redirect the program's stdout (fd 1) to a temporary file so it can be replayed below.
+with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+    stdout_path = tmp_file.name
+
+launch_info.AddOpenFileAction(1, stdout_path, False, True)
+
+# Keep the SBError so that a failed launch can be reported with its reason.
+launch_error = lldb.SBError()
+process = target.Launch(launch_info, launch_error)
+try:
+    if not process.IsValid():
+        sys.exit(f"{binary}: launch failed: {launch_error.GetCString()}")
+    with open(stdout_path, 'r') as file:
+        stdout_data = file.read()
+finally:
+    # Remove the capture file even when the launch failed.
+    os.remove(stdout_path)
+
+print(stdout_data)
+
+# The script's exit status is taken from the process' exit_state.
+# Cleanup LLDB
+# lldb.SBDebugger.Terminate()
+sys.exit(process.exit_state)
diff --git a/compiler-rt/lib/builtins/dpu/div32.c b/compiler-rt/lib/builtins/dpu/div32.c
new file mode 100644
index 0000000000000..df25bbbdaf9d4
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/div32.c
@@ -0,0 +1,97 @@
+/* Copyright 2024 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+/* Signed 32-bit division: stores dividend / divider in *p_q and the remainder in *p_rem. */
+void
+__div32(int32_t dividend, int32_t divider
+	, int32_t *p_q, int32_t *p_rem
+	)
+{
+    uint64_t res; /* __udiv32 result: quotient in bits 63..32, remainder in bits 31..0 */
+    uint32_t q;
+    uint32_t rem;
+    /* Going by the label names, "clo ..., z" presumably jumps when the tested operand is non-negative - TODO confirm. */
+    __asm__ goto("clo zero, %[dividend], z, %l[__div32_pos_dividend]\n\t"
+                 "clo zero, %[divider], z, %l[__div32_neg_dividend_pos_divider]\n\t"
+                 :
+                 : [dividend] "r"(dividend), [divider] "r"(divider)
+                 :
+                 : __div32_pos_dividend, __div32_neg_dividend_pos_divider);
+
+    /* The quotient's sign depends on the sign of the dividend and divider... After a few tries it looks */
+    /* like the quickest way to select the operators is to branch according to the cases. */
+
+    /* __div32_neg_dividend_neg_divider: */
+    /* As a result, the quotient is positive and the remainder negative */
+    /* Magnitudes are computed as 0u - (uint32_t)x: negating the int32_t itself */
+    /* would overflow (undefined behavior) when x is INT32_MIN. */
+    res = __udiv32(0u - (uint32_t)dividend, 0u - (uint32_t)divider);
+    q = (uint32_t)(res >> 32);
+    rem = (uint32_t)res;
+    rem = 0 - rem;
+    goto recombine;
+
+    /* *p_q = q; */
+    /* *p_rem = rem; */
+    /* return; */
+
+
+__div32_neg_dividend_pos_divider:
+    /* As a result, the quotient is negative and the remainder negative */
+    /* Unsigned negation of the dividend: well defined even for INT32_MIN */
+    res = __udiv32(0u - (uint32_t)dividend, divider);
+    q = (uint32_t)(res >> 32);
+    q = 0 - q;
+    rem = (uint32_t)res;
+    rem = 0 - rem;
+    goto recombine;
+    /* *p_q = q; */
+    /* *p_rem = rem; */
+    /* return; */
+
+__div32_pos_dividend:
+    __asm__ goto("clo zero, %[divider], z, %l[__div32_pos_dividend_pos_divider]"
+                 :
+                 : [divider] "r"(divider)
+                 :
+                 : __div32_pos_dividend_pos_divider);
+    /* As a result, the quotient is negative and the remainder positive */
+    /* Unsigned negation of the divider: well defined even for INT32_MIN */
+    res = __udiv32(dividend, 0u - (uint32_t)divider);
+    q = (uint32_t)(res >> 32);
+    q = 0 - q;
+    rem = (uint32_t)res;
+    goto recombine;
+    /* *p_q = q; */
+    /* *p_rem = rem; */
+    /* return; */
+
+__div32_pos_dividend_pos_divider:
+    /* The dividend and divider are both positive */
+    res = __udiv32(dividend, divider);
+    /* goto last_exit; */
+    q = (uint32_t) (res >> 32);
+    rem = (uint32_t) res;
+    /* goto recombine; */
+
+    /* *p_q = q; */
+    /* *p_rem = rem; */
+
+/* recombine: */
+/*     res = q; */
+/*     res <<= 32; */
+/*     res |= rem; */
+/* last_exit: */
+/*     return res; */
+
+ recombine:
+    *p_q = q;
+    *p_rem = rem;
+    return;
+}
diff --git a/compiler-rt/lib/builtins/dpu/divdi3.c b/compiler-rt/lib/builtins/dpu/divdi3.c
new file mode 100644
index 0000000000000..178cbf35fd2ee
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/divdi3.c
@@ -0,0 +1,31 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 signed division (the libcall emitted by the compiler). The magnitudes
+ * are divided by __udiv64 and the quotient is negated when the signs differ;
+ * operands are negated as uint64_t because -INT64_MIN overflows an int64_t.
+ */
+#include <stdint.h>
+
+extern uint64_t __udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+int64_t
+__divdi3(int64_t dividend, int64_t divider)
+{
+    if (dividend >= 0) {
+        if (divider >= 0) {
+            return __udiv64(dividend, divider, 0);
+        } else {
+            return -__udiv64(dividend, -(uint64_t)divider, 0);
+        }
+    } else if (divider >= 0) {
+        // Negative dividend, positive divider
+        return -__udiv64(-(uint64_t)dividend, divider, 0);
+    } else {
+        // Negative dividend, negative divider
+        return __udiv64(-(uint64_t)dividend, -(uint64_t)divider, 0);
+    }
+}
diff --git a/compiler-rt/lib/builtins/dpu/divsi3.c b/compiler-rt/lib/builtins/dpu/divsi3.c
new file mode 100644
index 0000000000000..8ec97468aaf83
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/divsi3.c
@@ -0,0 +1,23 @@
+/* Copyright 2024 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+
+/* extern int64_t __div32(int32_t dividend, int32_t divider); */
+extern void __div32(int32_t dividend, int32_t divider, int32_t *q, int32_t *rem);
+
+#include "int_lib.h"
+
+/* Returns: a / b */
+/* __div32 also produces the remainder, which is not used here. */
+COMPILER_RT_ABI si_int
+__divsi3(si_int a, si_int b)
+{
+  int32_t quotient;
+  int32_t remainder;
+
+  __div32(a, b, &quotient, &remainder);
+  return quotient;
+}
diff --git a/compiler-rt/lib/builtins/dpu/moddi3.c b/compiler-rt/lib/builtins/dpu/moddi3.c
new file mode 100644
index 0000000000000..dad11e699f87c
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/moddi3.c
@@ -0,0 +1,31 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 signed modulo (the libcall emitted by the compiler). The remainder of
+ * the magnitudes comes from __udiv64 and takes the sign of the dividend;
+ * operands are negated as uint64_t because -INT64_MIN overflows an int64_t.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+int64_t
+__moddi3(int64_t dividend, int64_t divider)
+{
+    if (dividend >= 0) {
+        if (divider >= 0) {
+            return __udiv64(dividend, divider, 1);
+        } else {
+            return __udiv64(dividend, -(uint64_t)divider, 1);
+        }
+    } else if (divider >= 0) {
+        // Negative dividend, positive divider
+        return -__udiv64(-(uint64_t)dividend, divider, 1);
+    } else {
+        // Negative dividend, negative divider
+        return -__udiv64(-(uint64_t)dividend, -(uint64_t)divider, 1);
+    }
+}
diff --git a/compiler-rt/lib/builtins/dpu/modsi3.c b/compiler-rt/lib/builtins/dpu/modsi3.c
new file mode 100644
index 0000000000000..c0cc59e8c92f9
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/modsi3.c
@@ -0,0 +1,34 @@
+/* ===-- modsi3.c - Implement __modsi3 -------------------------------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __modsi3 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include <stdint.h>
+
+/* extern int64_t __div32(int32_t dividend, int32_t divider); */
+extern void __div32(int32_t dividend, int32_t divider, int32_t *q, int32_t *rem);
+
+#include "int_lib.h"
+
+/* Returns: a % b */
+/* __div32 also produces the quotient, which is not used here. */
+
+COMPILER_RT_ABI si_int
+__modsi3(si_int a, si_int b)
+{
+  int32_t quotient;
+  int32_t remainder;
+
+  /* Only the remainder written by __div32 is returned. */
+  __div32(a, b, &quotient, &remainder);
+  return remainder;
+}
diff --git a/compiler-rt/lib/builtins/dpu/mul32.S b/compiler-rt/lib/builtins/dpu/mul32.S
new file mode 100644
index 0000000000000..fe735ab5b328f
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mul32.S
@@ -0,0 +1,48 @@
+        .text
+        .globl  __mul32
+        .type   __mul32,@function
+__mul32:                                // reads its two operands from r0 and r1, leaves its result in r0
+        jgtu r1, r0, .Ltmp0             // presumably: branch when r1 > r0 (unsigned) - TODO confirm
+        move r2, r0                     // fall-through path: r2 = r0 ...
+        move r0, r1, true, .Ltmp1       // ... r0 = r1, then presumably an unconditional jump to .Ltmp1
+.Ltmp0:
+        move r2, r1                     // branch path: r2 = r1, r0 is left unchanged
+        // move r0, r0
+.Ltmp1:                                 // if jgtu is as assumed above: r2 = larger operand, r0 = smaller operand
+        move r1, zero                   // clear r1, which is copied to r0 as the result at .Ltmp2
+        mul_step d0, r2, d0, 0, z, .Ltmp2       // steps 0..31: presumably one shift-and-add step each, leaving early to .Ltmp2 on the z condition
+        mul_step d0, r2, d0, 1, z, .Ltmp2
+        mul_step d0, r2, d0, 2, z, .Ltmp2
+        mul_step d0, r2, d0, 3, z, .Ltmp2
+        mul_step d0, r2, d0, 4, z, .Ltmp2
+        mul_step d0, r2, d0, 5, z, .Ltmp2
+        mul_step d0, r2, d0, 6, z, .Ltmp2
+        mul_step d0, r2, d0, 7, z, .Ltmp2
+        mul_step d0, r2, d0, 8, z, .Ltmp2
+        mul_step d0, r2, d0, 9, z, .Ltmp2
+        mul_step d0, r2, d0, 10, z, .Ltmp2
+        mul_step d0, r2, d0, 11, z, .Ltmp2
+        mul_step d0, r2, d0, 12, z, .Ltmp2
+        mul_step d0, r2, d0, 13, z, .Ltmp2
+        mul_step d0, r2, d0, 14, z, .Ltmp2
+        mul_step d0, r2, d0, 15, z, .Ltmp2
+        mul_step d0, r2, d0, 16, z, .Ltmp2
+        mul_step d0, r2, d0, 17, z, .Ltmp2
+        mul_step d0, r2, d0, 18, z, .Ltmp2
+        mul_step d0, r2, d0, 19, z, .Ltmp2
+        mul_step d0, r2, d0, 20, z, .Ltmp2
+        mul_step d0, r2, d0, 21, z, .Ltmp2
+        mul_step d0, r2, d0, 22, z, .Ltmp2
+        mul_step d0, r2, d0, 23, z, .Ltmp2
+        mul_step d0, r2, d0, 24, z, .Ltmp2
+        mul_step d0, r2, d0, 25, z, .Ltmp2
+        mul_step d0, r2, d0, 26, z, .Ltmp2
+        mul_step d0, r2, d0, 27, z, .Ltmp2
+        mul_step d0, r2, d0, 28, z, .Ltmp2
+        mul_step d0, r2, d0, 29, z, .Ltmp2
+        mul_step d0, r2, d0, 30, z, .Ltmp2
+        mul_step d0, r2, d0, 31, z, .Ltmp2
+.Ltmp2:
+        move r0, r1                     // the value accumulated in r1 becomes the result in r0
+
+        jump r23                        // presumably the function return (r23 holding the return address) - TODO confirm
diff --git a/compiler-rt/lib/builtins/dpu/mul32.c b/compiler-rt/lib/builtins/dpu/mul32.c
new file mode 100644
index 0000000000000..cc6be09b64847
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mul32.c
@@ -0,0 +1,59 @@
+#include <stdint.h>
+/* Inline-assembly draft of __mulsi3, built from 32 mul_step instructions. Not listed in compiler-rt/dpu/CMakeLists.txt. */
+int32_t __mulsi3(int32_t a, int32_t b)
+{
+    int32_t dest;
+
+    int32_t temp0;
+    uint64_t temp1;
+
+    /* FIXME: this is not working yet ...
+     * temp1.hi/temp1.lo is not yet supported: the %[temp1.hi] and %[temp1.lo] operands below
+     * match no declared operand name, and r1 is written without being listed as clobbered. */
+    __asm__ volatile("  jgtu %[b], %[a], 1f\n"
+                     "  move %[temp0], %[a]\n"
+                     "  move %[temp1.hi], %[b], true, 2f\n"
+                     "1:\n"
+                     "  move %[temp0], %[b]\n"
+                     "  move %[temp1.hi], %[a]\n"
+                     "2:\n"
+                     "  move r1, zero\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 0 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 1 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 2 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 3 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 4 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 5 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 6 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 7 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 8 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 9 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 10, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 11, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 12, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 13, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 14, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 15, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 16, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 17, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 18, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 19, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 20, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 21, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 22, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 23, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 24, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 25, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 26, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 27, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 28, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 29, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 30, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 31, z, 3f\n"
+                     "3:\n"
+                     "  move %[dest], %[temp1.lo]\n"
+                     : [dest] "=&r"(dest), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1)
+                     : [a]"r"(a), [b]"r"(b)
+                     : );
+    return dest;
+}
diff --git a/compiler-rt/lib/builtins/dpu/muldi3.c b/compiler-rt/lib/builtins/dpu/muldi3.c
new file mode 100644
index 0000000000000..2d5a28b1dc260
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/muldi3.c
@@ -0,0 +1,171 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 multiplication emulation.
+ *
+ * A relatively fast emulation of 64x64 multiplication using byte multipliers.
+ * Basically, the two operands X and Y are seen as byte polynomials:
+ *  - X = X0.2^0 + X1.2^8 + X2.2^16 + X3.2^24 + X4.2^32 + X5.2^40 + X6.2^48 + X7.2^56
+ *  - Y = Y0.2^0 + Y1.2^8 + Y2.2^16 + Y3.2^24 + Y4.2^32 + Y5.2^40 + Y6.2^48 + Y7.2^56
+ *
+ * The product Z is expressed as a similar polynomial. Since the result is 64 bits,
+ * the function drops any coefficient for a power greater than 56, hence the following
+ * formula:
+ *  Z = (X0.Y0).2^0
+ *      + (X0.Y1 + X1.Y0).2^8
+ *      + (X0.Y2 + X2.Y0 + X1.Y1).2^16
+ *      + (X0.Y3 + X1.Y2 + X2.Y1 + X3.Y0).2^24
+ *      + (X0.Y4 + X1.Y3 + X2.Y2 + X3.Y1 + X4.Y0).2^32
+ *      etc.
+ *
+ * Each individual product is computed with the native built-in 8x8 instructions.
+ * Resulting processing time is in the magnitude of 150 instructions.
+ *
+ * The two operands are found in __D0 and the first kernel nano-stack entry.
+ * The result goes into __R0 (lsbits) and __R1 (msbits).
+ * Also, __R2 contains the return address register, instead of __RET__.
+ */
+#include <stdint.h>
+
+static inline __attribute__((always_inline)) uint16_t
+_mul00(uint32_t a, uint32_t b) /* byte 0 of a times byte 0 of b; at most 255 * 255, so the product fits in 16 bits */
+{
+#ifndef DPU
+    return (a & 0xff) * (b & 0xff); /* portable fallback used when DPU is not defined */
+#else /* DPU build: one mul_ul_ul instruction; presumably the same low-byte product -- confirm against the DPU ISA */
+    uint32_t r;
+    __asm__ volatile("mul_ul_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return r;
+#endif
+}
+
+static inline __attribute__((always_inline)) uint16_t
+_mul01(uint32_t a, uint32_t b) /* byte 0 of a times byte 1 of b; the product fits in 16 bits */
+{
+#ifndef DPU
+    return (a & 0xff) * ((b >> 8) & 0xff); /* portable fallback used when DPU is not defined */
+#else /* DPU build: one mul_ul_uh instruction; presumably low byte of a times second byte of b -- confirm against the DPU ISA */
+    uint32_t r;
+    __asm__ volatile("mul_ul_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return r;
+#endif
+}
+
+#define _mul02(a, b) _mul00(a, (b >> 16)) /* byte 0 of a times byte 2 of b: shift b so byte 2 becomes byte 0 */
+#define _mul03(a, b) _mul01(a, (b >> 16)) /* byte 0 of a times byte 3 of b: shift b so byte 3 becomes byte 1 */
+
+static inline __attribute__((always_inline)) uint16_t
+_mul11(uint32_t a, uint32_t b) /* byte 1 of a times byte 1 of b; the product fits in 16 bits */
+{
+#ifndef DPU
+    return ((a >> 8) & 0xff) * ((b >> 8) & 0xff); /* portable fallback used when DPU is not defined */
+#else /* DPU build: one mul_uh_uh instruction; presumably second byte times second byte -- confirm against the DPU ISA */
+    uint32_t r;
+    __asm__ volatile("mul_uh_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return r;
+#endif
+}
+
+static inline __attribute__((always_inline)) uint16_t
+_mul12(uint32_t a, uint32_t b) /* byte 1 of a times byte 2 of b; the product fits in 16 bits */
+{
+#ifndef DPU
+    return ((a >> 8) & 0xff) * ((b >> 16) & 0xff); /* portable fallback used when DPU is not defined */
+#else /* DPU build: b is shifted first so its byte 2 sits in byte 0, then one mul_uh_ul instruction is issued */
+    uint32_t r = (b >> 16); /* r serves as the shifted input operand and is then overwritten with the result */
+    __asm__ volatile("mul_uh_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(r) :);
+    return r;
+#endif
+}
+
+#define _mul13(a, b) _mul11(a, (b >> 16)) /* byte 1 of a times byte 3 of b */
+#define _mul22(a, b) _mul00((a >> 16), (b >> 16)) /* byte 2 of a times byte 2 of b */
+#define _mul23(a, b) _mul01((a >> 16), (b >> 16)) /* byte 2 of a times byte 3 of b */
+#define _mul33(a, b) _mul11((a >> 16), (b >> 16)) /* byte 3 of a times byte 3 of b */
+
+#define mulx0y0(xl, yl) _mul00(xl, yl) /* mulxIyJ = X_I * Y_J; bytes 0-3 come from the low word (xl/yl), bytes 4-7 from the high word (xh/yh) */
+#define mulx0y1(xl, yl) _mul01(xl, yl)
+#define mulx0y2(xl, yl) _mul02(xl, yl)
+#define mulx0y3(xl, yl) _mul03(xl, yl)
+#define mulx0y4(xl, yh) _mul00(xl, yh) /* Y4 is byte 0 of the high word */
+#define mulx0y5(xl, yh) _mul01(xl, yh)
+#define mulx0y6(xl, yh) _mul02(xl, yh)
+#define mulx0y7(xl, yh) _mul03(xl, yh)
+
+#define mulx1y1(xl, yl) _mul11(xl, yl)
+#define mulx1y2(xl, yl) _mul12(xl, yl)
+#define mulx1y3(xl, yl) _mul13(xl, yl)
+#define mulx1y4(xl, yh) _mul01(yh, xl) /* arguments swapped on purpose: byte 0 of yh (Y4) times byte 1 of xl (X1) */
+#define mulx1y5(xl, yh) _mul11(xl, yh)
+#define mulx1y6(xl, yh) _mul12(xl, yh)
+
+#define mulx2y2(xl, yl) _mul22(xl, yl)
+#define mulx2y3(xl, yl) _mul23(xl, yl)
+#define mulx2y4(xl, yh) _mul02(yh, xl) /* arguments swapped on purpose: byte 0 of yh (Y4) times byte 2 of xl (X2) */
+#define mulx2y5(xl, yh) _mul12(yh, xl) /* arguments swapped on purpose: byte 1 of yh (Y5) times byte 2 of xl (X2) */
+
+#define mulx3y3(xl, yl) _mul33(xl, yl)
+#define mulx3y4(xl, yh) _mul03(yh, xl) /* arguments swapped on purpose: byte 0 of yh (Y4) times byte 3 of xl (X3) */
+
+// Symmetry: each XiYj with i > j reuses the XjYi macro above with its two arguments swapped.
+#define mulx1y0(xl, yl) mulx0y1(yl, xl)
+#define mulx2y0(xl, yl) mulx0y2(yl, xl)
+#define mulx2y1(xl, yl) mulx1y2(yl, xl)
+#define mulx3y0(xl, yl) mulx0y3(yl, xl)
+#define mulx3y1(xl, yl) mulx1y3(yl, xl)
+#define mulx3y2(xl, yl) mulx2y3(yl, xl)
+#define mulx4y0(xh, yl) mulx0y4(yl, xh)
+#define mulx4y1(xh, yl) mulx1y4(yl, xh)
+#define mulx4y2(xh, yl) mulx2y4(yl, xh)
+#define mulx4y3(xh, yl) mulx3y4(yl, xh)
+#define mulx5y0(xh, yl) mulx0y5(yl, xh)
+#define mulx5y1(xh, yl) mulx1y5(yl, xh)
+#define mulx5y2(xh, yl) mulx2y5(yl, xh)
+#define mulx6y0(xh, yl) mulx0y6(yl, xh)
+#define mulx6y1(xh, yl) mulx1y6(yl, xh)
+#define mulx7y0(xh, yl) mulx0y7(yl, xh)
+
+uint64_t
+__muldi3(uint64_t x, uint64_t y) /* returns the low 64 bits of x * y, assembled from 8x8-bit partial products */
+{
+    uint32_t xl = x; /* low word of x: bytes X0..X3 */
+    uint32_t xh = ((uint64_t)x >> 32); /* high word of x: bytes X4..X7 */
+    uint32_t yl = y; /* low word of y: bytes Y0..Y3 */
+    uint32_t yh = ((uint64_t)y >> 32); /* high word of y: bytes Y4..Y7 */
+
+    // pN is the sum of the byte products Xi * Yj with i + j == N, i.e. the coefficient of 2^(8N).
+    uint32_t p0, p1, p2, p3, p4, p5, p6, p7, rh;
+    uint64_t rl;
+
+    p0 = mulx0y0(xl, yl);
+    rl = (uint64_t)p0;
+
+    p1 = mulx0y1(xl, yl) + mulx1y0(xl, yl);
+    rl += ((uint64_t)p1 << 8);
+
+    p2 = mulx0y2(xl, yl) + mulx2y0(xl, yl) + mulx1y1(xl, yl);
+    rl += ((uint64_t)p2 << 16);
+
+    p3 = mulx0y3(xl, yl) + mulx3y0(xl, yl) + mulx1y2(xl, yl) + mulx2y1(xl, yl);
+    rl += ((uint64_t)p3 << 24); // rl is 64-bit, so whatever p0..p3 carry above bit 31 is kept
+
+    p4 = mulx0y4(xl, yh) + mulx4y0(xh, yl) + mulx1y3(xl, yl) + mulx3y1(xl, yl) + mulx2y2(xl, yl);
+    rh = p4; // rh accumulates the terms weighted 2^32 and above in 32-bit arithmetic
+    
+    p5 = (mulx0y5(xl, yh) + mulx5y0(xh, yl) + mulx1y4(xl, yh) + mulx4y1(xh, yl)
+	  + mulx2y3(xl, yl) + mulx3y2(xl, yl));
+    rh += p5 << 8; // bits shifted out of the 32-bit rh would land above bit 63 and are dropped
+
+    p6 = (mulx0y6(xl, yh) + mulx6y0(xh, yl) + mulx1y5(xl, yh) + mulx5y1(xh, yl)
+	  + mulx2y4(xl, yh) + mulx4y2(xh, yl) + mulx3y3(xl, yl));
+    rh += p6 << 16;
+    
+    p7 = (mulx0y7(xl, yh) + mulx7y0(xh, yl) + mulx1y6(xl, yh) + mulx6y1(xh, yl)
+	  + mulx2y5(xl, yh) + mulx5y2(xh, yl) + mulx3y4(xl, yh) + mulx4y3(xh, yl));
+    rh += p7 << 24;
+
+    return rl + (((uint64_t)rh) << 32); // the carry held in the upper half of rl adds into the high word here
+}
diff --git a/compiler-rt/lib/builtins/dpu/mulsi3.c b/compiler-rt/lib/builtins/dpu/mulsi3.c
new file mode 100644
index 0000000000000..f41210acd79cd
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mulsi3.c
@@ -0,0 +1,8 @@
+#include <stdint.h>
+
+extern int32_t __mul32(int32_t a, int32_t b);
+
+int32_t __mulsi3(int32_t a, int32_t b) /* 32-bit multiply entry point: thin wrapper that forwards to the external __mul32 */
+{
+  return __mul32(a, b);
+}
diff --git a/compiler-rt/lib/builtins/dpu/udiv32.S b/compiler-rt/lib/builtins/dpu/udiv32.S
new file mode 100644
index 0000000000000..8298d37dd8a0e
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv32.S
@@ -0,0 +1,49 @@
+        .text
+        .globl  __udiv32
+        .type   __udiv32,@function
+__udiv32:
+	clz r2, r1, max, 1f // r2 = by how many the divider (r1) can be shifted on 32-bit; branches to the fault at 1: on "max" (presumably divider == 0 -- confirm against the ISA)
+	clz r3, r0         // r3 = number of useless (leading zero) bits of the dividend (r0)
+	sub r2, r3, r2, gtu, 2f// r2 = r3 - r2, the maximal shift to be done, used below as the jump offset; branches to 2: on "gtu" (presumably dividend < divider -- confirm)
+	move r3, r1
+	move.u d0, r0
+	jump r2, 3f                 // Computed jump relative to forward label 3: lands backward among the div_steps so that only the needed ones run
+	div_step d0, r3, d0, 31
+	div_step d0, r3, d0, 30
+	div_step d0, r3, d0, 29
+	div_step d0, r3, d0, 28
+	div_step d0, r3, d0, 27
+	div_step d0, r3, d0, 26
+	div_step d0, r3, d0, 25
+	div_step d0, r3, d0, 24
+	div_step d0, r3, d0, 23
+	div_step d0, r3, d0, 22
+	div_step d0, r3, d0, 21
+	div_step d0, r3, d0, 20
+	div_step d0, r3, d0, 19
+	div_step d0, r3, d0, 18
+	div_step d0, r3, d0, 17
+	div_step d0, r3, d0, 16
+	div_step d0, r3, d0, 15
+	div_step d0, r3, d0, 14
+	div_step d0, r3, d0, 13
+	div_step d0, r3, d0, 12
+	div_step d0, r3, d0, 11
+	div_step d0, r3, d0, 10
+	div_step d0, r3, d0, 9
+	div_step d0, r3, d0, 8
+	div_step d0, r3, d0, 7
+	div_step d0, r3, d0, 6
+	div_step d0, r3, d0, 5
+	div_step d0, r3, d0, 4
+	div_step d0, r3, d0, 3
+	div_step d0, r3, d0, 2
+	div_step d0, r3, d0, 1
+3:
+	div_step d0, r3, d0, 0
+4:	
+	jump r23 // common exit with the result in d0; presumably r23 holds the return address -- confirm against the ABI
+2:
+	move.u d0, r0, true, 4b // early exit taken from the sub above: d0 is loaded from the dividend alone, then control goes to the exit at 4:
+1:
+	fault 2 // reached only from the first clz
diff --git a/compiler-rt/lib/builtins/dpu/udiv32.c b/compiler-rt/lib/builtins/dpu/udiv32.c
new file mode 100644
index 0000000000000..22f617e14fd71
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv32.c
@@ -0,0 +1,63 @@
+#include <stdint.h>
+
+uint64_t
+__udiv32(uint32_t dividend, uint32_t divider) /* 32-bit unsigned divide; callers in this directory read the quotient from bits 63..32 and the remainder from bits 31..0 */
+{ /* NOTE(review): udiv32.S in this directory defines the same symbol; presumably only one of the two is built -- confirm in the build files */
+    uint64_t dest;
+
+    uint32_t temp0;
+    uint32_t temp1;
+
+    /* clang-format off */
+    __asm__ volatile("  clz %[temp0], %[divider], max, 1f\n" // %[temp0] = by how many the divider can be shifted on 32-bit; goes to the fault at 1: on "max" (presumably divider == 0 -- confirm)
+                     "  clz %[temp1], %[dividend]\n" // %[temp1] = number of useless (leading zero) bits of the dividend
+                     "  sub %[temp0], %[temp1], %[temp0], gtu, 2f\n" // %[temp0] = the maximal shift to be done, used below as the jump offset; goes to 2: on "gtu"
+                     "  move %[temp1], %[divider]\n"
+                     "  move.u %[dest], %[dividend]\n"
+                     "  jump %[temp0], 3f\n" // Computed jump relative to forward label 3: lands backward among the div_steps so that only the needed ones run
+                     "  div_step %[dest], %[temp1], %[dest], 31\n"
+                     "  div_step %[dest], %[temp1], %[dest], 30\n"
+                     "  div_step %[dest], %[temp1], %[dest], 29\n"
+                     "  div_step %[dest], %[temp1], %[dest], 28\n"
+                     "  div_step %[dest], %[temp1], %[dest], 27\n"
+                     "  div_step %[dest], %[temp1], %[dest], 26\n"
+                     "  div_step %[dest], %[temp1], %[dest], 25\n"
+                     "  div_step %[dest], %[temp1], %[dest], 24\n"
+                     "  div_step %[dest], %[temp1], %[dest], 23\n"
+                     "  div_step %[dest], %[temp1], %[dest], 22\n"
+                     "  div_step %[dest], %[temp1], %[dest], 21\n"
+                     "  div_step %[dest], %[temp1], %[dest], 20\n"
+                     "  div_step %[dest], %[temp1], %[dest], 19\n"
+                     "  div_step %[dest], %[temp1], %[dest], 18\n"
+                     "  div_step %[dest], %[temp1], %[dest], 17\n"
+                     "  div_step %[dest], %[temp1], %[dest], 16\n"
+                     "  div_step %[dest], %[temp1], %[dest], 15\n"
+                     "  div_step %[dest], %[temp1], %[dest], 14\n"
+                     "  div_step %[dest], %[temp1], %[dest], 13\n"
+                     "  div_step %[dest], %[temp1], %[dest], 12\n"
+                     "  div_step %[dest], %[temp1], %[dest], 11\n"
+                     "  div_step %[dest], %[temp1], %[dest], 10\n"
+                     "  div_step %[dest], %[temp1], %[dest], 9\n"
+                     "  div_step %[dest], %[temp1], %[dest], 8\n"
+                     "  div_step %[dest], %[temp1], %[dest], 7\n"
+                     "  div_step %[dest], %[temp1], %[dest], 6\n"
+                     "  div_step %[dest], %[temp1], %[dest], 5\n"
+                     "  div_step %[dest], %[temp1], %[dest], 4\n"
+                     "  div_step %[dest], %[temp1], %[dest], 3\n"
+                     "  div_step %[dest], %[temp1], %[dest], 2\n"
+                     "  div_step %[dest], %[temp1], %[dest], 1\n"
+                     "3:\n"
+                     "  div_step %[dest], %[temp1], %[dest], 0\n"
+                     "4:\n"
+                     "  jump 5f\n" // normal exit: skip the early-exit and fault stubs below
+                     "2:\n"
+                     "  move.u %[dest], %[dividend], true, 4b\n" // early exit taken from the sub above: dest is loaded from the dividend alone
+                     "1:\n"
+                     "  fault 2\n" // reached only from the first clz
+                     "5:\n"
+                     : [dest] "=r"(dest), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1)
+                     : [dividend] "r"(dividend), [divider] "r"(divider));
+    /* clang-format on */
+
+    return dest;
+}
diff --git a/compiler-rt/lib/builtins/dpu/udiv64.c b/compiler-rt/lib/builtins/dpu/udiv64.c
new file mode 100644
index 0000000000000..e55b3ffe9904c
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv64.c
@@ -0,0 +1,59 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 unsigned division (quotient or remainder, selected by the caller).
+ */
+#include <stdint.h>
+
+static unsigned int
+__clz__(uint64_t x) /* number of leading zero bits in a 64-bit value; x must be non-zero (the builtin is undefined for 0) */
+{
+    return __builtin_clzll(x); /* ll variant: 64-bit on every ABI, whereas clzl is only correct where long is 64 bits wide */
+}
+
+uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder) /* returns dividend % divider when ask_remainder is non-zero, dividend / divider when it is 0 */
+{
+    uint64_t dxo = dividend, dxe = 0; /* dxo: running remainder; dxe: quotient, built one bit per step */
+
+    if (divider == 0) {
+      __asm__ volatile("fault 2");
+      /* Division by zero: fault 2 is expected not to return, hence the hint below. */
+      __builtin_unreachable();
+    }
+    if (divider > dividend) { /* quotient is 0 and the remainder is the dividend itself */
+        if (ask_remainder == 0)
+            return 0;
+        else
+            return dividend;
+    }
+
+    // Mimic the div_step instruction in software, one quotient bit per iteration.
+    /// div_step functionality:
+    //   if (Dxo >= (Ra<< #u5)) {
+    //     Dxo = Dxo - (Ra<< #u5);
+    //     Dxe = (Dxe << 1) | 1;
+    //   } else {
+    //     Dxe =  Dxe << 1;
+    //   }
+    int dividerl0 = __clz__(divider), dividendl0 = __clz__(dividend); /* both arguments are non-zero past the guards above */
+
+    int i = dividerl0 - dividendl0; /* >= 0 because divider <= dividend here; also the largest shift that cannot overflow */
+
+    for (; i >= 0; i--) {
+        uint64_t pivot = ((uint64_t)divider << i);
+        if (dxo >= pivot) {
+            dxo = dxo - pivot;
+            dxe = ((uint64_t)dxe << 1) | 1L;
+        } else {
+            dxe = (uint64_t)dxe << 1;
+        }
+    }
+    if (ask_remainder != 0) /* same test as the early exit above, so any non-zero flag selects the remainder on both paths */
+        return dxo;
+    else
+        return dxe;
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivdi3.c b/compiler-rt/lib/builtins/dpu/udivdi3.c
new file mode 100644
index 0000000000000..1b60b934b85f4
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivdi3.c
@@ -0,0 +1,19 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 unsigned division.
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+uint64_t
+__udivdi3(uint64_t dividend, uint64_t divider)
+{
+    return __udiv64(dividend, divider, 0); /* ask_remainder = 0 selects the quotient */
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivmodsi4.c b/compiler-rt/lib/builtins/dpu/udivmodsi4.c
new file mode 100644
index 0000000000000..3a3f3902b6f61
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivmodsi4.c
@@ -0,0 +1,29 @@
+/*===-- udivmodsi4.c - Implement __udivmodsi4 ------------------------------===
+ *
+ *                    The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __udivmodsi4 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+#include "int_lib.h"
+
+/* Returns: a / b, *rem = a % b  */
+
+COMPILER_RT_ABI su_int
+__udivmodsi4(su_int a, su_int b, su_int *rem) /* rem must point to writable storage: it is dereferenced unconditionally */
+{
+    uint64_t res = __udiv32(a, b); /* packed result: quotient in bits 63..32, remainder in bits 31..0 */
+    *rem = (su_int)res; /* low word = a % b */
+    return (su_int) (res >> 32); /* high word = a / b */
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivsi3.c b/compiler-rt/lib/builtins/dpu/udivsi3.c
new file mode 100644
index 0000000000000..dcc1d9fcf672f
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivsi3.c
@@ -0,0 +1,15 @@
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+#include "../int_lib.h"
+
+typedef su_int fixuint_t;
+typedef si_int fixint_t;
+
+// Returns: a / b
+
+COMPILER_RT_ABI su_int __udivsi3(su_int a, su_int b) {
+  uint64_t res = __udiv32(a, b); /* __udiv32 hands back one 64-bit value holding two 32-bit results */
+  return (su_int) (res >> 32); /* the upper word is the quotient a / b */
+}
diff --git a/compiler-rt/lib/builtins/dpu/umoddi3.c b/compiler-rt/lib/builtins/dpu/umoddi3.c
new file mode 100644
index 0000000000000..4b3a82b01eb98
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/umoddi3.c
@@ -0,0 +1,19 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 unsigned remainder.
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+uint64_t
+__umoddi3(uint64_t dividend, uint64_t divider)
+{
+    return __udiv64(dividend, divider, 1); /* ask_remainder = 1 selects the remainder */
+}
diff --git a/compiler-rt/lib/builtins/dpu/umodsi3.c b/compiler-rt/lib/builtins/dpu/umodsi3.c
new file mode 100644
index 0000000000000..c85cd8a4d9aed
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/umodsi3.c
@@ -0,0 +1,27 @@
+/* ===-- umodsi3.c - Implement __umodsi3 -----------------------------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __umodsi3 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include "int_lib.h"
+
+/* Returns: a % b */
+
+extern du_int
+__udiv32(su_int, su_int); /* same 64-bit packed result as declared by the other callers in this directory */
+
+COMPILER_RT_ABI su_int
+__umodsi3(su_int a, su_int b)
+{
+    du_int res = __udiv32(a, b); /* packed result: quotient in bits 63..32, remainder in bits 31..0 */
+    return (su_int)res; /* truncation keeps the low word, i.e. a % b */
+}
diff --git a/compiler-rt/test/builtins/Unit/comparedf2_test.c b/compiler-rt/test/builtins/Unit/comparedf2_test.c
index 27666e2ad689b..d606ae7eff6ca 100644
--- a/compiler-rt/test/builtins/Unit/comparedf2_test.c
+++ b/compiler-rt/test/builtins/Unit/comparedf2_test.c
@@ -458,7 +458,7 @@ static const struct TestVector vectors[] = {
     {__builtin_inf(),__builtin_inf(),0,0,0,0,0,0,0},
 };    
 
-int main(int argc, char *argv[]) {
+int main() {
     const int numVectors = sizeof vectors / sizeof vectors[0];
     int i;
     for (i = 0; i<numVectors; ++i) {
diff --git a/compiler-rt/test/builtins/Unit/comparesf2_test.c b/compiler-rt/test/builtins/Unit/comparesf2_test.c
index b6a52b74633aa..f129bece62364 100644
--- a/compiler-rt/test/builtins/Unit/comparesf2_test.c
+++ b/compiler-rt/test/builtins/Unit/comparesf2_test.c
@@ -458,7 +458,7 @@ static const struct TestVector vectors[] = {
     {__builtin_inff(),__builtin_inff(),0,0,0,0,0,0,0},
 };    
 
-int main(int argc, char *argv[]) {
+int main() {
     const int numVectors = sizeof vectors / sizeof vectors[0];
     int i;
     for (i = 0; i<numVectors; ++i) {
diff --git a/compiler-rt/test/builtins/Unit/test.c b/compiler-rt/test/builtins/Unit/test.c
new file mode 100644
index 0000000000000..bad88690c884f
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/test.c
@@ -0,0 +1,13 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int main() /* smoke program: writes one line to stderr, one to stdout, then ten values from rand() */
+{
+  fprintf(stderr, "hello err\n");
+  fprintf(stdout, "hello out\n");
+  srand(42); /* fixed seed so the sequence printed below is repeatable for a given libc */
+  for (int i = 0; i < 10; i++) {
+    printf("%d %d\n", i, rand()); /* prints the index followed by the next pseudo-random value */
+  }
+  return 0;
+}

From 45abde92efafe2b9189f8274fc2c28897c467c74 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Mon, 29 Jul 2024 08:34:40 +0200
Subject: [PATCH 17/17] wip: I fixed all issues reported with
 -verify-machineinstrs, need LARGE cleanup now ...

While at it, I implemented some quick optimization.

- Implemented `(1 << n) - 1` == `lslx lneg n`
https://github.com/upmem/llvm-project/issues/9

- Moved some 64-bit operation earlier in the pipeline,
from ResolveMacroInstrPass to expandPostRA.
I tried even earlier, but SUBC is not well defined and gets moved around during MergeSink
for critical edge splitting. The computation is then wrong because it is required to be packed for the
particular use I identified.
I leave my experiment here. I will check to fully define it and move it preRA, or will leave it
like that again for a while, fulfilling my other duty.

- Fixed a few easy Def/Use when BuildMI

- Fixed loss of MachineInstr correctness
Our arithmetic+comp+branch was destroyed during analyzeBranch/removeBranch/insertBranch

- temporarily removed fusion of any instruction + JUMPi in MergeComboInstr
The problem is that at this stage (PreEmit):
- machine CFG is done.
- JUMPi is unconditional jump
- arithmetic + cond + branch; with cond as True/False is conditional
-- even if we know that cond with true/false is unconditional, the instruction has the
-- property of being conditional by its definition.
---- To fix that, I will create other PseudoInstruction to have them set correctly.
---- Also, if those arith+cond+branch do have pattern, maybe they could be selected far earlier
---- and the machine CFG would be correctly formed at the first place probably.

- issue with ThinLTO fixed
-- some code construction ended up in SELECTrr, which is not common for us
---- this is lowered to TmpJcc
---- and TmpJcc is kind of wrong
------ and finally, MergeComboInstr was combining even more wrong.
--> I removed TmpJcc, and use simply our well defined JEQrii

- issue with ThinLTO fixed
-- another was present but undetected on Release build
-- we use multiple address spaces (IRAM, WRAM, MRAM)
-- there was an assertion with ThinLTO when populating GV out of multiple modules
---- it's fixed in llvm13, but we are on llvm12
------ I reproduced the patch (not cherry-picked) https://github.com/llvm/llvm-project/commit/60c60dd1387742730b5cc756f8d92bac2e23c2b0
------ just for now. will do that correctly when cleaning up
------- so when I will upgrade our LLVM, it will be mergeable easily
---
 llvm/lib/CodeGen/MachineSink.cpp              |   57 +-
 llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp       |  342 ++---
 llvm/lib/Target/DPU/DPUISelLowering.h         |  166 +--
 llvm/lib/Target/DPU/DPUInstrFormats.td        |    1 +
 llvm/lib/Target/DPU/DPUInstrInfo.cpp          |  406 ++++--
 llvm/lib/Target/DPU/DPUInstrInfo.h            |   14 +-
 llvm/lib/Target/DPU/DPUInstrInfo.td           |   90 +-
 llvm/lib/Target/DPU/DPUMacroFusion.cpp        |    2 +-
 .../lib/Target/DPU/DPUMergeComboInstrPass.cpp |   32 +-
 llvm/lib/Target/DPU/DPUPostRAFusion.cpp       |  100 +-
 .../Target/DPU/DPUResolveMacroInstrPass.cpp   |  173 ++-
 llvm/lib/Target/DPU/DPUTargetLowering.cpp     | 1155 ++++++++++++-----
 llvm/lib/Target/DPU/DPUTargetMachine.cpp      |   11 +-
 llvm/lib/Transforms/IPO/LowerTypeTests.cpp    |    8 +-
 llvm/lib/Transforms/Utils/ModuleUtils.cpp     |   22 +-
 15 files changed, 1716 insertions(+), 863 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 378df1b75e25d..15ed3e94bff5b 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -1103,23 +1103,51 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
 bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
                                      AllSuccsCache &AllSuccessors) {
   // Don't sink instructions that the target prefers not to sink.
-  if (!TII->shouldSink(MI))
+  if (!TII->shouldSink(MI)) {
+    // LLVM_DEBUG({
+    // 	dbgs() << "shouldSink false "; MI.dump();
+    //   });
     return false;
-
+  }
+  
   // Check if it's safe to move the instruction.
-  if (!MI.isSafeToMove(AA, SawStore))
+  if (!MI.isSafeToMove(AA, SawStore)) {
+    // LLVM_DEBUG({
+    // 	dbgs() << "not safe "; MI.dump();
+    // 	dbgs() << "mayStore(): " << MI.mayStore() << "\n";
+    // 	dbgs() << "mayLoad(): " << MI.mayLoad() << "\n";
+    // 	dbgs() << "isCall(): " << MI.isCall() << "\n";
+    // 	dbgs() << "isPHI(): " << MI.isPHI() << "\n";
+    // 	dbgs() << "hasOrderedMemoryRef(): " << MI.hasOrderedMemoryRef() << "\n";
+    // 	dbgs() << "isPosition(): " << MI.isPosition() << "\n";
+    // 	dbgs() << "isDebugInstr(): " << MI.isDebugInstr() << "\n";
+    // 	dbgs() << "isTerminator(): " << MI.isTerminator() << "\n";
+    // 	dbgs() << "mayRaiseFPException(): " << MI.mayRaiseFPException() << "\n";
+    // 	dbgs() << "hasUnmodeledSideEffects(): " << MI.hasUnmodeledSideEffects() << "\n";
+    // 	dbgs() << "isDereferenceableInvariantLoad(AA): " << MI.isDereferenceableInvariantLoad(AA) << "\n";
+    // 	dbgs() << "SawStore: " << SawStore << "\n";
+    //   });
     return false;
-
+  }
+  
   // Convergent operations may not be made control-dependent on additional
   // values.
-  if (MI.isConvergent())
+  if (MI.isConvergent()) {
+    // LLVM_DEBUG({
+    // 	dbgs() << "isconvergent "; MI.dump();
+    //   });
     return false;
-
+  }
+  
   // Don't break implicit null checks.  This is a performance heuristic, and not
   // required for correctness.
-  if (SinkingPreventsImplicitNullCheck(MI, TII, TRI))
+  if (SinkingPreventsImplicitNullCheck(MI, TII, TRI)) {
+    LLVM_DEBUG({
+	dbgs() << "nullcheck "; MI.dump();
+      });
     return false;
-
+  }
+  
   // FIXME: This should include support for sinking instructions within the
   // block they are currently in to shorten the live ranges.  We often get
   // instructions sunk into the top of a large block, but it would be better to
@@ -1134,9 +1162,12 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
       FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge, AllSuccessors);
 
   // If there are no outputs, it must have side-effects.
-  if (!SuccToSinkTo)
+  if (!SuccToSinkTo) {
+    // LLVM_DEBUG({
+    // 	dbgs() << "no succ "; MI.dump();
+    //   });
     return false;
-
+  }
   // If the instruction to move defines a dead physical register which is live
   // when leaving the basic block, don't move it because it could turn into a
   // "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>)
@@ -1146,8 +1177,12 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
     Register Reg = MO.getReg();
     if (Reg == 0 || !Register::isPhysicalRegister(Reg))
       continue;
-    if (SuccToSinkTo->isLiveIn(Reg))
+    if (SuccToSinkTo->isLiveIn(Reg)) {
+      // LLVM_DEBUG({
+      // 	  dbgs() << "zombie "; MI.dump();
+      // 	});
       return false;
+    }
   }
 
   LLVM_DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccToSinkTo);
diff --git a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
index b08a71adb52a4..539056aeb055b 100644
--- a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
@@ -86,12 +86,12 @@ class DPUDAGToDAGISel : public SelectionDAGISel {
 
   bool IsGlobalAddrInImmediateSection(SDNode *Node) const;
 
-  // void processFunctionAfterISel(MachineFunction &MF);
+  void processFunctionAfterISel(MachineFunction &MF);
 
-  // bool replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
-  //                                 const DPUInstrInfo *DII,
-  //                                 const TargetRegisterInfo *TRI,
-  //                                 const MachineInstr &MI);
+  bool replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
+                                  const DPUInstrInfo *DII,
+                                  const TargetRegisterInfo *TRI,
+                                  const MachineInstr &MI);
 
   bool SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out);
 };
@@ -100,141 +100,170 @@ class DPUDAGToDAGISel : public SelectionDAGISel {
 StringRef DPUDAGToDAGISel::getPassName() const { return "DPUDAGToDAGISel"; }
 
 bool DPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+  // Let the base-class SelectionDAGISel implementation select the function.
   bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
 
-  // processFunctionAfterISel(MF);
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+  // Post-selection fixup; it returns nothing, so Ret is passed on unchanged.
+  processFunctionAfterISel(MF);
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
 
   return Ret;
 }
 
-// void DPUDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
-//   MachineRegisterInfo *MRI = &MF.getRegInfo();
-
-//   auto &SubTarget = static_cast<const DPUSubtarget &>(MF.getSubtarget());
-//   auto InstrInfo = SubTarget.getInstrInfo();
-//   auto RegInfo = SubTarget.getRegisterInfo();
-
-//   for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
-//        ++MFI)
-//     for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
-//       replaceUsesWithConstantReg(MRI, InstrInfo, RegInfo, *I);
-//     }
-// }
-
-// static inline bool canCommuteOperation(MachineInstr *MI, unsigned opNo,
-//                                        unsigned &newOpNo) {
-//   switch (MI->getOpcode()) {
-//   case DPU::ADDrrr:
-//   case DPU::ANDrrr:
-//   case DPU::ORrrr:
-//   case DPU::XORrrr:
-//     switch (opNo) {
-//     case 1:
-//       newOpNo = 2;
-//       break;
-//     case 2:
-//       newOpNo = 1;
-//       break;
-//     default:
-//       return false;
-//     }
-
-//     return true;
-//   default:
-//     return false;
-//   }
-// }
-
-// bool DPUDAGToDAGISel::replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
-//                                                  const DPUInstrInfo *DII,
-//                                                  const TargetRegisterInfo *TRI,
-//                                                  const MachineInstr &MI) {
-//   // This function seems to do manual coalescing
-//   //    probably we should use the proper one that probably knows better
-//   //    maybe prob with MI operand constraint ... ?
-//   //    probably better to educate the coalescer, or better define register class
-//   unsigned DstReg = 0, CstReg = 0;
-
-//   if (MI.getOpcode() == DPU::COPY) {
-//     unsigned reg = MI.getOperand(1).getReg();
-
-//     DstReg = MI.getOperand(0).getReg();
-//     switch (reg) {
-//     case DPU::ID:
-//     case DPU::ID2:
-//     case DPU::ID4:
-//     case DPU::ID8:
-//       CstReg = reg;
-//       break;
-//     default:
-//       break;
-//     }
-//   } else if (((MI.getOpcode() == DPU::MOVErr) &&
-//               (MI.getOperand(1).getReg() == DPU::ZERO)) ||
-//              ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
-//               (MI.getOperand(1).getImm() == 0))) {
-//     DstReg = MI.getOperand(0).getReg();
-//     CstReg = DPU::ZERO;
-//   } else if (((MI.getOpcode() == DPU::MOVErr) &&
-//               (MI.getOperand(1).getReg() == DPU::ONE)) ||
-//              ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
-//               (MI.getOperand(1).getImm() == 1))) {
-//     DstReg = MI.getOperand(0).getReg();
-//     CstReg = DPU::ONE;
-//   } else if (((MI.getOpcode() == DPU::MOVErr) &&
-//               (MI.getOperand(1).getReg() == DPU::LNEG)) ||
-//              ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
-//               (MI.getOperand(1).getImm() == -1))) {
-//     DstReg = MI.getOperand(0).getReg();
-//     CstReg = DPU::LNEG;
-//   } else if (((MI.getOpcode() == DPU::MOVErr) &&
-//               (MI.getOperand(1).getReg() == DPU::MNEG)) ||
-//              ((MI.getOpcode() == DPU::MOVEri) && (MI.getOperand(1).isImm()) &&
-//               (MI.getOperand(1).getImm() == 0x8000000))) {
-//     DstReg = MI.getOperand(0).getReg();
-//     CstReg = DPU::MNEG;
-//   }
-
-//   if (!CstReg)
-//     return false;
-
-//   // Replace uses with CstReg.
-//   for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
-//                                          E = MRI->use_end();
-//        U != E;) {
-//     MachineOperand &MO = *U;
-//     unsigned OpNo = U.getOperandNo();
-//     MachineInstr *UMI = MO.getParent();
-//     ++U;
-
-//     // Do not replace if it is a phi's operand or is tied to def operand.
-//     if (UMI->isPHI() || UMI->isRegTiedToDefOperand(OpNo) || UMI->isPseudo())
-//       continue;
-
-//     // Also, we have to check that the register class of the operand
-//     // contains the constant register.
-//     if (!UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(CstReg)) {
-//       unsigned newOpNo;
-
-//       if (canCommuteOperation(UMI, OpNo, newOpNo)) {
-//         auto OtherReg = UMI->getOperand(newOpNo).getReg();
-
-//         if (UMI->getRegClassConstraint(newOpNo, DII, TRI)->contains(CstReg) &&
-//             (!Register::isPhysicalRegister(OtherReg) ||
-//              UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(OtherReg))) {
-//           UMI->getOperand(newOpNo).setReg(CstReg);
-//           UMI->getOperand(OpNo).setReg(OtherReg);
-//         }
-//       }
-
-//       continue;
-//     }
-
-//     MO.setReg(CstReg);
-//   }
-
-//   return true;
-// }
+/// Post-ISel fixup: visit every instruction of \p MF and hand it to
+/// replaceUsesWithConstantReg().
+void DPUDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+  auto &SubTarget = static_cast<const DPUSubtarget &>(MF.getSubtarget());
+  auto InstrInfo = SubTarget.getInstrInfo();
+  auto RegInfo = SubTarget.getRegisterInfo();
+
+  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE; ++MFI) {
+    LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n");
+
+    for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+      LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n");
+
+      // The "did something" trace is kept inside LLVM_DEBUG: a bare dbgs()
+      // call would also print in release builds and without -debug.
+      if (replaceUsesWithConstantReg(MRI, InstrInfo, RegInfo, *I)) {
+        LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__
+                          << " YES did something.\n");
+      }
+    }
+  }
+}
+
+/// Tell whether operand \p opNo of \p MI can be exchanged with its sibling.
+/// Only ADDrrr/ANDrrr/ORrrr/XORrrr qualify, and only operand indices 1 and 2;
+/// on success \p newOpNo receives the index of the other operand.
+static inline bool canCommuteOperation(MachineInstr *MI, unsigned opNo,
+                                       unsigned &newOpNo) {
+  // Reject every opcode outside the commutable three-register ALU set.
+  const unsigned Opc = MI->getOpcode();
+  const bool IsCommutableOpc = Opc == DPU::ADDrrr || Opc == DPU::ANDrrr ||
+                               Opc == DPU::ORrrr || Opc == DPU::XORrrr;
+  if (!IsCommutableOpc)
+    return false;
+
+  // Operands 1 and 2 trade places; any other index cannot be commuted.
+  if (opNo == 1) {
+    newOpNo = 2;
+    return true;
+  }
+  if (opNo == 2) {
+    newOpNo = 1;
+    return true;
+  }
+  return false;
+}
+
+/// Make the users of \p MI's result read a constant register directly.
+///
+/// \p MI is recognised when it is a COPY from ID/ID2/ID4/ID8, or a MOVErr /
+/// MOVEri whose source is ZERO, ONE, LNEG or MNEG, respectively the immediate
+/// 0, 1, -1 or 0x80000000. Each use of its destination is then rewritten to
+/// that register when the operand's register class (looked up with \p DII and
+/// \p TRI) contains it; ADDrrr/ANDrrr/ORrrr/XORrrr may get their operands 1
+/// and 2 swapped to make it fit. PHIs, pseudos and tied operands are skipped.
+///
+/// \returns true if \p MI was recognised, even when no use was rewritten;
+/// false otherwise.
+bool DPUDAGToDAGISel::replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
+                                                 const DPUInstrInfo *DII,
+                                                 const TargetRegisterInfo *TRI,
+                                                 const MachineInstr &MI) {
+  // This function seems to do manual coalescing
+  //    probably we should use the proper one that probably knows better
+  //    maybe prob with MI operand constraint ... ?
+  //    probably better to educate the coalescer, or better define register class
+  unsigned CstReg = 0;
+
+  // True when MI is "MOVErr dst, Reg" or "MOVEri dst, Imm".
+  auto IsMoveOf = [&MI](unsigned Reg, int64_t Imm) {
+    if (MI.getOpcode() == DPU::MOVErr)
+      return MI.getOperand(1).getReg() == Reg;
+    return MI.getOpcode() == DPU::MOVEri && MI.getOperand(1).isImm() &&
+           MI.getOperand(1).getImm() == Imm;
+  };
+
+  if (MI.getOpcode() == DPU::COPY) {
+    // Through a COPY only the ID* registers are forwarded.
+    unsigned reg = MI.getOperand(1).getReg();
+    if (reg == DPU::ID || reg == DPU::ID2 || reg == DPU::ID4 ||
+        reg == DPU::ID8)
+      CstReg = reg;
+  } else if (IsMoveOf(DPU::ZERO, 0)) {
+    CstReg = DPU::ZERO;
+  } else if (IsMoveOf(DPU::ONE, 1)) {
+    CstReg = DPU::ONE;
+  } else if (IsMoveOf(DPU::LNEG, -1)) {
+    CstReg = DPU::LNEG;
+  } else if (IsMoveOf(DPU::MNEG, 0x80000000LL) ||
+             IsMoveOf(DPU::MNEG, -0x80000000LL)) {
+    // Select() maps the i32 signed minimum to MNEG, so the immediate to match
+    // is 0x80000000 (eight hex digits), zero- or sign-extended to 64 bits.
+    CstReg = DPU::MNEG;
+  }
+
+  if (!CstReg)
+    return false;
+  const unsigned DstReg = MI.getOperand(0).getReg();
+
+  // Replace uses with CstReg.
+  for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
+                                         E = MRI->use_end();
+       U != E;) {
+    MachineOperand &MO = *U;
+    unsigned OpNo = U.getOperandNo();
+    MachineInstr *UMI = MO.getParent();
+    ++U; // Step first: setReg() below changes which register MO uses.
+
+    // Do not replace if it is a phi's operand or is tied to def operand.
+    if (UMI->isPHI() || UMI->isRegTiedToDefOperand(OpNo) || UMI->isPseudo())
+      continue;
+
+    // Also, we have to check that the register class of the operand
+    // contains the constant register.
+    if (UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(CstReg)) {
+      MO.setReg(CstReg);
+      continue;
+    }
+
+    // Otherwise try the sibling operand of a commutable instruction. Skip it
+    // when it is DstReg as well: U may already point at that operand, and
+    // rewriting it would take it out of the use list being walked.
+    unsigned newOpNo;
+    if (!canCommuteOperation(UMI, OpNo, newOpNo))
+      continue;
+    auto OtherReg = UMI->getOperand(newOpNo).getReg();
+    if (OtherReg != DstReg &&
+        UMI->getRegClassConstraint(newOpNo, DII, TRI)->contains(CstReg) &&
+        (!Register::isPhysicalRegister(OtherReg) ||
+         UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(OtherReg))) {
+      UMI->getOperand(newOpNo).setReg(CstReg);
+      UMI->getOperand(OpNo).setReg(OtherReg);
+      LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__
+                        << " YES did something.\n");
+    }
+  }
+
+  return true;
+}
 
 bool DPUDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
   if (auto FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
@@ -392,7 +421,11 @@ void DPUDAGToDAGISel::Select(SDNode *Node) {
   }
 
   EVT VT = Node->getValueType(0);
+  SDLoc DL(Node);
 
+  MachineFunction &MF = CurDAG->getMachineFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  
   switch (Opcode) {
   case ISD::Constant: {
     LLVM_DEBUG({dbgs() << "a constant: "; Node->dump();});
@@ -401,25 +434,21 @@ void DPUDAGToDAGISel::Select(SDNode *Node) {
       // This allows the coalescer to propagate these into other instructions.
       ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
       if (ConstNode->isNullValue()) {
-	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-					     DPU::ZERO, MVT::i32);
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::ZERO, MVT::i32);
 	ReplaceNode(Node, New.getNode());
 	return;
       } else if (ConstNode->isOne()) {
-	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-					     DPU::ONE, MVT::i32);
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::ONE, MVT::i32);
 	ReplaceNode(Node, New.getNode());
 	return;
       } else if (ConstNode->isAllOnesValue()) {
-	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-					     DPU::LNEG, MVT::i32);
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::LNEG, MVT::i32);
 	ReplaceNode(Node, New.getNode());
 	return;
       } else {
 	const ConstantInt *Cst = ConstNode->getConstantIntValue();
 	if (Cst->isMinValue(/* signed = */ true)) {
-	  SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-					       DPU::MNEG, MVT::i32);
+	  SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::MNEG, MVT::i32);
 	  ReplaceNode(Node, New.getNode());
 	  return;
 	}
@@ -427,18 +456,27 @@ void DPUDAGToDAGISel::Select(SDNode *Node) {
     } else if (VT == MVT::i64) {
       ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
       if (ConstNode->isNullValue()) {
-	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-					     DPU::ZERO, MVT::i32);
-	auto *NewMove = CurDAG->getMachineNode(DPU::MOVE_Srr, SDLoc(Node), VT,
-					       New);
-	ReplaceNode(Node, NewMove);
+	// // Create a new virtual register of type i64
+	// SDValue ImpDef = SDValue(CurDAG->getMachineNode(DPU::IMPLICIT_DEF, DL, MVT::i64), 0);
+	// // Insert the low part into the virtual register
+	// SDValue InsertLo = CurDAG->getTargetInsertSubreg(DPU::sub_32bit, DL, MVT::i64, 
+	// 						 ImpDef,
+	// 						 CurDAG->getRegister(DPU::ZERO, MVT::i32));
+	// // Insert the high part into the virtual register
+	// SDValue InsertHi = CurDAG->getTargetInsertSubreg(DPU::sub_32bit_hi, DL, MVT::i64, 
+	// 						 InsertLo,
+	// 						 CurDAG->getRegister(DPU::ZERO, MVT::i32));
+	// // Replace the old node with the new virtual register value
+	// ReplaceNode(Node, InsertHi.getNode());
+
+	SDValue truc = SDValue(CurDAG->getMachineNode(DPU::MOVE_Srr, DL, MVT::i64,
+						      CurDAG->getRegister(DPU::ZERO, MVT::i32)), 0);
+	ReplaceNode(Node, truc.getNode());
 	return;
       } else if (ConstNode->isOne()) {
-	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-					     DPU::ONE, MVT::i32);
-	auto *NewMove = CurDAG->getMachineNode(DPU::MOVE_Srr, SDLoc(Node), VT,
-					       New);
-	ReplaceNode(Node, NewMove);
+	SDValue truc = SDValue(CurDAG->getMachineNode(DPU::MOVE_Srr, DL, MVT::i64,
+						      CurDAG->getRegister(DPU::ONE, MVT::i32)), 0);
+	ReplaceNode(Node, truc.getNode());
 	return;
       }
     }
diff --git a/llvm/lib/Target/DPU/DPUISelLowering.h b/llvm/lib/Target/DPU/DPUISelLowering.h
index 87d963121a70b..91eadd89e9489 100644
--- a/llvm/lib/Target/DPU/DPUISelLowering.h
+++ b/llvm/lib/Target/DPU/DPUISelLowering.h
@@ -32,12 +32,12 @@ enum {
   SetCC,    // SET to a condition
   BrCC,     // Jump and branch with condition
   BrCCi,    // Jump and branch with condition
-  BrCCZero, // Jump and branch with condition and one operand equal to zero
-  OrJCCZero,
-  AndJCCZero,
-  XorJCCZero,
-  AddJCCZero,
-  SubJCCZero,
+  // BrCCZero, // Jump and branch with condition and one operand equal to zero
+  // OrJCCZero,
+  // AndJCCZero,
+  // XorJCCZero,
+  // AddJCCZero,
+  // SubJCCZero,
   Wrapper,    // Global addresses, externals...
   TRUNC64,    // Keep the LSBits register,
   LSL64_32,   // Shift 32 positions to the left
@@ -62,9 +62,9 @@ enum {
   MUL16_SU,
   MUL16_SS,
 
-  Addc,
-  Subc,
-  Rsubc,
+  // Addc,
+  // Subc,
+  // Rsubc,
 
   Clo,
   Cls,
@@ -77,80 +77,80 @@ enum {
 
   LslAdd,
 
-  AddJcc,
-  AddNullJcc,
-  AddcJcc,
-  AddcNullJcc,
-  AndJcc,
-  AndNullJcc,
-  OrJcc,
-  OrNullJcc,
-  XorJcc,
-  XorNullJcc,
-  NandJcc,
-  NandNullJcc,
-  NorJcc,
-  NorNullJcc,
-  NxorJcc,
-  NxorNullJcc,
-  AndnJcc,
-  AndnNullJcc,
-  OrnJcc,
-  OrnNullJcc,
-  LslJcc,
-  LslNullJcc,
-  LslxJcc,
-  LslxNullJcc,
-  Lsl1Jcc,
-  Lsl1NullJcc,
-  Lsl1xJcc,
-  Lsl1xNullJcc,
-  LsrJcc,
-  LsrNullJcc,
-  LsrxJcc,
-  LsrxNullJcc,
-  Lsr1Jcc,
-  Lsr1NullJcc,
-  Lsr1xJcc,
-  Lsr1xNullJcc,
-  AsrJcc,
-  AsrNullJcc,
-  RolJcc,
-  RolNullJcc,
-  RorJcc,
-  RorNullJcc,
-  MUL8_UUJcc,
-  MUL8_UUNullJcc,
-  MUL8_SUJcc,
-  MUL8_SUNullJcc,
-  MUL8_SSJcc,
-  MUL8_SSNullJcc,
-  SubJcc,
-  SubNullJcc,
-  RsubJcc,
-  RsubNullJcc,
-  SubcJcc,
-  SubcNullJcc,
-  RsubcJcc,
-  RsubcNullJcc,
-  CaoJcc,
-  CaoNullJcc,
-  ClzJcc,
-  ClzNullJcc,
-  CloJcc,
-  CloNullJcc,
-  ClsJcc,
-  ClsNullJcc,
-  MoveJcc,
-  MoveNullJcc,
-  RolAddJcc,
-  RolAddNullJcc,
-  LsrAddJcc,
-  LsrAddNullJcc,
-  LslAddJcc,
-  LslAddNullJcc,
-  LslSubJcc,
-  LslSubNullJcc,
+  // AddJcc,
+  // AddNullJcc,
+  // AddcJcc,
+  // AddcNullJcc,
+  // AndJcc,
+  // AndNullJcc,
+  // OrJcc,
+  // OrNullJcc,
+  // XorJcc,
+  // XorNullJcc,
+  // NandJcc,
+  // NandNullJcc,
+  // NorJcc,
+  // NorNullJcc,
+  // NxorJcc,
+  // NxorNullJcc,
+  // AndnJcc,
+  // AndnNullJcc,
+  // OrnJcc,
+  // OrnNullJcc,
+  // LslJcc,
+  // LslNullJcc,
+  // LslxJcc,
+  // LslxNullJcc,
+  // Lsl1Jcc,
+  // Lsl1NullJcc,
+  // Lsl1xJcc,
+  // Lsl1xNullJcc,
+  // LsrJcc,
+  // LsrNullJcc,
+  // LsrxJcc,
+  // LsrxNullJcc,
+  // Lsr1Jcc,
+  // Lsr1NullJcc,
+  // Lsr1xJcc,
+  // Lsr1xNullJcc,
+  // AsrJcc,
+  // AsrNullJcc,
+  // RolJcc,
+  // RolNullJcc,
+  // RorJcc,
+  // RorNullJcc,
+  // MUL8_UUJcc,
+  // MUL8_UUNullJcc,
+  // MUL8_SUJcc,
+  // MUL8_SUNullJcc,
+  // MUL8_SSJcc,
+  // MUL8_SSNullJcc,
+  // SubJcc,
+  // SubNullJcc,
+  // RsubJcc,
+  // RsubNullJcc,
+  // SubcJcc,
+  // SubcNullJcc,
+  // RsubcJcc,
+  // RsubcNullJcc,
+  // CaoJcc,
+  // CaoNullJcc,
+  // ClzJcc,
+  // ClzNullJcc,
+  // CloJcc,
+  // CloNullJcc,
+  // ClsJcc,
+  // ClsNullJcc,
+  // MoveJcc,
+  // MoveNullJcc,
+  // RolAddJcc,
+  // RolAddNullJcc,
+  // LsrAddJcc,
+  // LsrAddNullJcc,
+  // LslAddJcc,
+  // LslAddNullJcc,
+  // LslSubJcc,
+  // LslSubNullJcc,
 
   ADD_VASTART,
 
diff --git a/llvm/lib/Target/DPU/DPUInstrFormats.td b/llvm/lib/Target/DPU/DPUInstrFormats.td
index 66116ab29b153..a4e80392af3b6 100644
--- a/llvm/lib/Target/DPU/DPUInstrFormats.td
+++ b/llvm/lib/Target/DPU/DPUInstrFormats.td
@@ -97,6 +97,7 @@ def u5_imm  : UImmOperand< 5, i32>;
 def u8_imm  : UImmOperand< 8, i32>;
 
 def s8_i64_imm : SImmOperand<8, i64>;
+def s11_i64_imm : SImmOperand<11, i64>;
 def s16_i64_imm : SImmOperand<16, i64>;
 def s32_i64_imm : SImmOperand<32, i64>;
 def u32_i64_imm : UImmOperand<32, i64>;
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index d3e071f03ff7e..eb10d5bdbcf0e 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -54,7 +54,7 @@ void DPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        const TargetRegisterInfo *TRI) const {
   DebugLoc DL = (I != MBB.end()) ? I->getDebugLoc() : DebugLoc();
   unsigned Opcode = (RC == &DPU::GP_REGRegClass
-		     // || RC == &DPU::GPZ_REGRegClass
+		     || RC == &DPU::GPZ_REGRegClass
 		     ) ? DPU::SWrir : DPU::SDrir;
 
   LLVM_DEBUG({
@@ -85,7 +85,7 @@ void DPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   if (I != MBB.end())
     DL = I->getDebugLoc();
   unsigned Opcode = (RC == &DPU::GP_REGRegClass
-		     // || RC == &DPU::GPZ_REGRegClass
+		     || RC == &DPU::GPZ_REGRegClass
 		     ) ? DPU::LWrri : DPU::LDrri;
   LLVM_DEBUG({
     dbgs() << "DPU/Instr - loadRegFromStackSlot DestReg="
@@ -98,6 +98,118 @@ void DPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   BuildMI(MBB, I, DL, get(Opcode), DestReg).addFrameIndex(FI).addImm(0);
 }
 
+/// Split the 64-bit register/register ALU instruction \p MI into two 32-bit
+/// ones built in front of it in \p MBB: \p LsbOpcode on the sub_32bit halves
+/// of its three register operands, then \p MsbOpcode on the sub_32bit_hi
+/// halves. Renamable and kill flags of \p MI are copied onto both new
+/// instructions; \p MI itself is not removed here.
+void DPUInstrInfo::expand64BitRegisterAluInstruction(MachineInstr &MI,
+						     MachineBasicBlock &MBB,
+						     unsigned int LsbOpcode,
+						     unsigned int MsbOpcode) const {
+  const TargetRegisterInfo *RegInfo =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+
+  // Operand 0 is the destination, operands 1 and 2 are the sources.
+  unsigned int Lo[3];
+  unsigned int Hi[3];
+  for (unsigned Idx = 0; Idx < 3; ++Idx) {
+    unsigned int Wide = MI.getOperand(Idx).getReg();
+    Lo[Idx] = RegInfo->getSubReg(Wide, DPU::sub_32bit);
+    Hi[Idx] = RegInfo->getSubReg(Wide, DPU::sub_32bit_hi);
+  }
+
+  MachineInstr *LoMI =
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(LsbOpcode), Lo[0])
+          .addReg(Lo[1])
+          .addReg(Lo[2]);
+  MachineInstr *HiMI =
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(MsbOpcode), Hi[0])
+          .addReg(Hi[1])
+          .addReg(Hi[2]);
+
+  // Mirror the per-operand flags of the 64-bit instruction on both halves.
+  for (unsigned Idx = 0; Idx < 3; ++Idx) {
+    const MachineOperand &Src = MI.getOperand(Idx);
+    if (Src.isRenamable()) {
+      LoMI->getOperand(Idx).setIsRenamable();
+      HiMI->getOperand(Idx).setIsRenamable();
+    }
+    if (Src.isKill()) {
+      LoMI->getOperand(Idx).setIsKill();
+      HiMI->getOperand(Idx).setIsKill();
+    }
+  }
+}
+
+/// Split the 64-bit register/immediate ALU instruction \p MI into two 32-bit
+/// ones built in front of it in \p MBB: \p LsbOpcode combines the sub_32bit
+/// half of the source with bits 0..31 of the immediate, \p MsbOpcode the
+/// sub_32bit_hi half with bits 32..63. Renamable and kill flags of the two
+/// register operands are copied over; \p MI itself is not removed here.
+void DPUInstrInfo::expand64BitImmediateAluInstruction(MachineInstr &MI,
+						      MachineBasicBlock &MBB,
+						      unsigned int LsbOpcode,
+						      unsigned int MsbOpcode) const {
+  const TargetRegisterInfo *RegInfo =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+  // Operand 0 is the destination, operand 1 the register source.
+  unsigned int Lo[2];
+  unsigned int Hi[2];
+  for (unsigned Idx = 0; Idx < 2; ++Idx) {
+    unsigned int Wide = MI.getOperand(Idx).getReg();
+    Lo[Idx] = RegInfo->getSubReg(Wide, DPU::sub_32bit);
+    Hi[Idx] = RegInfo->getSubReg(Wide, DPU::sub_32bit_hi);
+  }
+
+  // Both halves of the immediate are masked down to their 32 bits.
+  const int64_t Imm = MI.getOperand(2).getImm();
+  const int64_t LoImm = Imm & 0xFFFFFFFFl;
+  const int64_t HiImm = (Imm >> 32) & 0xFFFFFFFFl;
+
+  // what if value is zero???
+  // probably optimizable :)
+  // For now such immediate halves are only reported in the debug trace.
+  auto IsNotable = [](int64_t Half) {
+    return Half == 0 || Half == 1 || Half == 0xffffffff ||
+           Half == 0x80000000;
+  };
+  if (IsNotable(LoImm)) {
+    LLVM_DEBUG({
+        dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+        dbgs() << "LSBOp2Imm = " << LoImm << " could be optimized\n";
+      });
+  }
+  if (IsNotable(HiImm)) {
+    LLVM_DEBUG({
+        dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+        dbgs() << "MSBOp2Imm = " << HiImm << " could be optimized\n";
+      });
+  }
+
+  MachineInstr *LoMI =
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(LsbOpcode), Lo[0])
+          .addReg(Lo[1])
+          .addImm(LoImm);
+  MachineInstr *HiMI =
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(MsbOpcode), Hi[0])
+          .addReg(Hi[1])
+          .addImm(HiImm);
+
+  // Mirror the flags of the two register operands on both halves.
+  for (unsigned Idx = 0; Idx < 2; ++Idx) {
+    const MachineOperand &Src = MI.getOperand(Idx);
+    if (Src.isRenamable()) {
+      LoMI->getOperand(Idx).setIsRenamable();
+      HiMI->getOperand(Idx).setIsRenamable();
+    }
+    if (Src.isKill()) {
+      LoMI->getOperand(Idx).setIsKill();
+      HiMI->getOperand(Idx).setIsKill();
+    }
+  }
+}
+
 bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction *MF = MBB.getParent();
@@ -143,6 +255,35 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     break;
   }
 
+  case DPU::ADD64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::ADDrrr, DPU::ADDCrrr);
+    break;
+  case DPU::AND64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::ANDrrr, DPU::ANDrrr);
+    break;
+  case DPU::OR64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::ORrrr, DPU::ORrrr);
+    break;
+  case DPU::SUB64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::SUBrrr, DPU::SUBCrrr);
+    break;
+  case DPU::XOR64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::XORrrr, DPU::XORrrr);
+    break;
+    
+  case DPU::ADD64ri:
+    expand64BitImmediateAluInstruction(MI, MBB, DPU::ADDrri, DPU::ADDCrri);
+    break;
+  case DPU::AND64ri:
+    expand64BitImmediateAluInstruction(MI, MBB, DPU::ANDrri, DPU::ANDrri);
+    break;
+  case DPU::OR64ri:
+    expand64BitImmediateAluInstruction(MI, MBB, DPU::ORrri, DPU::ORrri);
+    break;
+  case DPU::XOR64ri:
+    expand64BitImmediateAluInstruction(MI, MBB, DPU::XORrri, DPU::XORrri);
+    break;
+    
   // case DPU::Jcci:
   // case DPU::TmpJcci:
   // case DPU::Jcc: {
@@ -165,18 +306,24 @@ void DPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
                                const DebugLoc &DL, MCRegister DestReg,
                                MCRegister SrcReg, bool KillSrc) const {
+  LLVM_DEBUG({ dbgs() << "DPU/Instr - copyPhysReg "; I->dump(); });
+
+  bool is_dest_renamable = I->getOperand(0).isRenamable();
+  bool is_src_renamable = I->getOperand(1).isRenamable();
+  MachineInstrBuilder MIB;
+
   if (DPU::GP_REGRegClass.contains(DestReg) &&
       DPU::OP_REGRegClass.contains(SrcReg)) {
     LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg
                       << " kill= " << KillSrc << " to dest=" << DestReg
                       << "\n");
-    BuildMI(MBB, I, DL, get(DPU::MOVErr), DestReg)
+    MIB = BuildMI(MBB, I, DL, get(DPU::MOVErr), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc));
   } else if (DPU::GP64_REGRegClass.contains(DestReg, SrcReg)) {
     LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg
                       << " kill= " << KillSrc << " to dest=" << DestReg
                       << "\n");
-    BuildMI(MBB, I, DL, get(DPU::MOVDrr), DestReg)
+    MIB = BuildMI(MBB, I, DL, get(DPU::MOVDrr), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc));
   } else if (DPU::GP64_REGRegClass.contains(SrcReg) &&
              DPU::GP_REGRegClass.contains(DestReg)) {
@@ -184,7 +331,7 @@ void DPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg
                       << " kill= " << KillSrc << " to dest=" << DestReg
                       << "\n");
-    BuildMI(MBB, I, DL, get(DPU::EXTRACT_SUBREG), DestReg)
+    MIB = BuildMI(MBB, I, DL, get(DPU::EXTRACT_SUBREG), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc))
         .addImm(DPU::sub_32bit);
   } else if (DPU::GP_REGRegClass.contains(SrcReg) &&
@@ -193,11 +340,16 @@ void DPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg
                       << " kill= " << KillSrc << " to dest=" << DestReg
                       << "\n");
-    BuildMI(MBB, I, DL, get(DPU::MOVE_Srr), DestReg)
+    MIB = BuildMI(MBB, I, DL, get(DPU::MOVE_Srr), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc));
   } else {
     llvm_unreachable("Impossible reg-to-reg copy");
   }
+
+  if (is_dest_renamable)
+    MIB->getOperand(0).setIsRenamable();
+  if (is_src_renamable)
+    MIB->getOperand(1).setIsRenamable();
 }
 
 static bool reverseBranchOpc(unsigned Opc, unsigned &ReversedOpc) {
@@ -283,6 +435,7 @@ bool DPUInstrInfo::reverseBranchCondition(
   case DPU::Jcc:
   case DPU::Jcci:
   case DPU::Jcc64:
+  // case DPU::Jcci64:
     Cond[1].setImm(ISD::getSetCCInverse(ISD::CondCode(Cond[1].getImm()), MVT::i32));
     break;
   default: {
@@ -301,10 +454,10 @@ bool DPUInstrInfo::reverseBranchCondition(
 static void
 fetchUnconditionalBranchInfo(MachineInstr *Inst,
                              unsigned &targetBasicBlockOperandIndex) {
-  // LLVM_DEBUG({
-  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  //     dbgs() << "Inst "; Inst->dump();
-  //   });
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "Inst "; Inst->dump();
+    });
 
   switch (Inst->getOpcode()) {
   case DPU::JUMPi:
@@ -317,15 +470,16 @@ fetchUnconditionalBranchInfo(MachineInstr *Inst,
 
 static void fetchConditionalBranchInfo(MachineInstr *Inst,
                                        unsigned &targetBasicBlockOperandIndex,
-                                       SmallVectorImpl<MachineOperand> &Cond) {
-  // LLVM_DEBUG({
-  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  //     dbgs() << "Inst "; Inst->dump();
-  //     dbgs() << "Cond.size() " << Cond.size() << "\n";
-  //     for (unsigned i = 0; i < Cond.size(); ++i) {
-  // 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-  //     }
-  //   });
+                                       SmallVectorImpl<MachineOperand> &Cond,
+				       bool &do_have_metadata) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "Inst "; Inst->dump();
+      dbgs() << "Cond.size() " << Cond.size() << "\n";
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
   
   unsigned Opc = Inst->getOpcode();
   Cond.push_back(MachineOperand::CreateImm(Opc));
@@ -367,20 +521,22 @@ static void fetchConditionalBranchInfo(MachineInstr *Inst,
     }
   }
 
-  // for (const MachineOperand &Op : Inst->operands()) {
-  //   if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(Inst->getMF()->getFunction().getContext(), "MySpecialMetadata")) {
-  //     Cond.push_back(Op);
-  //   }
-  // }
+  do_have_metadata = false;
+  for (const MachineOperand &Op : Inst->operands()) {
+    if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(Inst->getMF()->getFunction().getContext(), "MySpecialMetadata")) {
+      Cond.push_back(Op);
+      do_have_metadata = true;
+    }
+  }
   
-  // LLVM_DEBUG({
-  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  //     dbgs() << "Inst "; Inst->dump();
-  //     dbgs() << "Cond.size() " << Cond.size() << "\n";
-  //     for (unsigned i = 0; i < Cond.size(); ++i) {
-  // 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-  //     }
-  //   });
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "Inst "; Inst->dump();
+      dbgs() << "Cond.size() " << Cond.size() << "\n";
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
 }
 
 static inline bool isAnalyzableBranch(MachineInstr *Inst) {
@@ -393,13 +549,13 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                  SmallVectorImpl<MachineOperand> &Cond,
                                  bool AllowModify) const {
 
-  // LLVM_DEBUG({
-  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  //     dbgs() << "MBB "; MBB.dump();
-  //     for (unsigned i = 0; i < Cond.size(); ++i) {
-  // 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-  //     }
-  //   });
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
   
   MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
 
@@ -420,6 +576,9 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
 
   // If not an analyzable branch (e.g., indirect jump), just leave.
   if (!isAnalyzableBranch(LastInst)) {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+      });
     return true;
   }
 
@@ -455,7 +614,14 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     // Conditional branch
     if (LastInst->isConditionalBranch()) {
       unsigned int TBBOpIdx;
-      fetchConditionalBranchInfo(LastInst, TBBOpIdx, Cond);
+      bool do_have_metadata = false;
+      fetchConditionalBranchInfo(LastInst, TBBOpIdx, Cond, do_have_metadata);
+      if (do_have_metadata) {
+	LLVM_DEBUG({
+	    dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable metadata\n";
+	  });
+	return true;
+      }
       // LLVM_DEBUG({
       // 	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
       // 	  dbgs() << "MBB "; MBB.dump();
@@ -470,12 +636,18 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     }
 
     // Unknown branch type
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+      });
     return true;
   }
 
   // If we reached here, there are two branches.
   // If there are three terminators, we don't know what sort of block this is.
   if (++I != REnd && isUnpredicatedTerminator(*I)) {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+      });
     return true;
   }
 
@@ -484,11 +656,13 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
   if (SecondLastInst->isUnconditionalBranch()) {
     // Return if the last instruction cannot be removed.
     if (!AllowModify) {
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+	});
       return true;
     }
     unsigned int TBBOpIdx;
     fetchUnconditionalBranchInfo(SecondLastInst, TBBOpIdx);
-
     TBB = SecondLastInst->getOperand(TBBOpIdx).getMBB();
     LastInst->eraseFromParent();
     return false;
@@ -498,13 +672,23 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     // Conditional branch followed by an unconditional branch.
     // The last one must be unconditional.
     if (!LastInst->isUnconditionalBranch()) {
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+	});
       return true;
     }
     unsigned int TBBOpIdx;
     unsigned int FTBBOpIdx;
+    bool do_have_metadata = false;
 
     fetchUnconditionalBranchInfo(LastInst, FTBBOpIdx);
-    fetchConditionalBranchInfo(SecondLastInst, TBBOpIdx, Cond);
+    fetchConditionalBranchInfo(SecondLastInst, TBBOpIdx, Cond, do_have_metadata);
+    if (do_have_metadata) {
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable metadata\n";
+	});
+      return true;
+    }
     TBB = SecondLastInst->getOperand(TBBOpIdx).getMBB();
     FBB = LastInst->getOperand(FTBBOpIdx).getMBB();
     // LLVM_DEBUG({
@@ -522,15 +706,18 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
   }
 
   // Unknown branch type
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+    });
   return true;
 }
 
 unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                     int *BytesRemoved) const {
-  // LLVM_DEBUG({
-  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  //     dbgs() << "MBB "; MBB.dump();
-  //   });
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+    });
   MachineBasicBlock::iterator I = MBB.end();
   unsigned Count = 0;
 
@@ -556,17 +743,17 @@ unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
 void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock *TBB, DebugLoc DL,
                                           ArrayRef<MachineOperand> Cond) const {
-  // LLVM_DEBUG({
-  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  //     dbgs() << "MBB "; MBB.dump();
-  //     for (unsigned i = 0; i < Cond.size(); ++i) {
-  // 	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-  // 	  if (Cond[i].isReg()) {
-  // 	    dbgs() << "Cond[" << i << "] isUse " << Cond[i].isUse() << "\n";
-  // 	    dbgs() << "Cond[" << i << "] isDef " << Cond[i].isDef() << "\n";
-  // 	  }
-  //     }
-  //   });
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+	  if (Cond[i].isReg()) {
+	    dbgs() << "Cond[" << i << "] isUse " << Cond[i].isUse() << "\n";
+	    dbgs() << "Cond[" << i << "] isDef " << Cond[i].isDef() << "\n";
+	  }
+      }
+    });
 
   // LLVM_DEBUG({
   //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
@@ -667,17 +854,17 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
 
   MIB.addMBB(TBB);
 
-  // // add back remaining metadata
-  // for (unsigned i = 0; i < Cond.size(); ++i) {
-  //    if (Cond[i].isMetadata()) {
-  //     MIB.addMetadata(Cond[i].getMetadata());
-  //    }
-  // }
+  // add back remaining metadata
+  for (unsigned i = 0; i < Cond.size(); ++i) {
+     if (Cond[i].isMetadata()) {
+      MIB.addMetadata(Cond[i].getMetadata());
+     }
+  }
 
-  // LLVM_DEBUG({
-  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  //     dbgs() << "MIB "; MIB->dump();
-  //   });
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MIB "; MIB->dump();
+    });
 }
 
 unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
@@ -685,13 +872,13 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *FBB,
                                     ArrayRef<MachineOperand> Cond,
                                     const DebugLoc &DL, int *BytesAdded) const {
-  // LLVM_DEBUG({
-  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  //     dbgs() << "MBB "; MBB.dump();
-  //     for (unsigned i = 0; i < Cond.size(); ++i) {
-  // 	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
-  //     }
-  //   });
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
   unsigned nrOfInsertedMachineInstr = 0;
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
@@ -717,36 +904,47 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
   // to instructions added.
   if (BytesAdded)
     *BytesAdded = nrOfInsertedMachineInstr;
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+    });
   return nrOfInsertedMachineInstr;
 }
 
-// bool DPUInstrInfo::shouldSink(const MachineInstr &MI) const {
-//   switch (MI.getDesc().getOpcode()) {
-//   default:
-//     break;
-//   case DPU::CLZ_Urr:
-//   case DPU::LSLXrrr:
-//   case DPU::LSRXrrr:
-//   case DPU::ANDrri:
-//   case DPU::JEQrii:
-//   case DPU::JNEQrii:
-//     {
-//       //   return false;
-//       for (const MachineOperand &Op : MI.operands()) {
-// 	if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(MI.getMF()->getFunction().getContext(), "MySpecialMetadata")) {
-// 	  LLVM_DEBUG({
-// 	      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Don't sink because I have MySpecialMetadata.\n";
-// 	    });
-// 	  return false; // Do not sink this instruction
-// 	}
-//       }
-//       LLVM_DEBUG({
-// 	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " I'm potentially something used in arith+cond+jump from EmitInstrWithCustomInserter but I allow sink because I don't have MySpecialMetadata.\n";
-// 	});
-//       break;
-//     }
-//   }
-
-//   // return true;
-//   return TargetInstrInfo::shouldSink(MI);
-// }
+bool DPUInstrInfo::shouldSink(const MachineInstr &MI) const {
+  switch (MI.getDesc().getOpcode()) {
+  default:
+    break;
+  case DPU::CLZ_Urr:
+  case DPU::LSLXrrr:
+  case DPU::LSRXrrr:
+  case DPU::ANDrri:
+  case DPU::JEQrii:
+  case DPU::JNEQrii:
+  // case DPU::ADDrrr:
+  // case DPU::ADDCrrr:
+  case DPU::SUBrrr:
+  case DPU::SUBCrrr:
+    {
+      //   return false;
+      for (const MachineOperand &Op : MI.operands()) {
+	if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(MI.getMF()->getFunction().getContext(), "MySpecialMetadata")) {
+	  LLVM_DEBUG({
+	      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Don't sink because I have MySpecialMetadata.\n";
+	      MI.dump();
+	    });
+	  return false; // Do not sink this instruction
+	}
+      }
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " I'm potentially something used in arith+cond+jump from EmitInstrWithCustomInserter but I allow sink because I don't have MySpecialMetadata.\n";
+	  MI.dump();
+	});
+      break;
+    }
+  }
+
+  // return true;
+  return TargetInstrInfo::shouldSink(MI);
+}
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.h b/llvm/lib/Target/DPU/DPUInstrInfo.h
index 2d08d67f4f721..98fc84304958f 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.h
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.h
@@ -43,14 +43,22 @@ class DPUInstrInfo : public DPUGenInstrInfo {
                             const TargetRegisterInfo *TRI) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
-
+  void expand64BitRegisterAluInstruction(MachineInstr &MI,
+					 MachineBasicBlock &MBB,
+					 unsigned int LsbOpcode,
+					 unsigned int MsbOpcode) const;
+  void expand64BitImmediateAluInstruction(MachineInstr &MI,
+					  MachineBasicBlock &MBB,
+					  unsigned int LsbOpcode,
+					  unsigned int MsbOpcode) const;
+  
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                    const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
                    bool KillSrc) const override;
 
   bool
   reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
-
+  
   bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
@@ -66,7 +74,7 @@ class DPUInstrInfo : public DPUGenInstrInfo {
   void buildConditionalBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                               DebugLoc DL, ArrayRef<MachineOperand> Cond) const;
 
-  // bool shouldSink(const MachineInstr &MI) const override;
+  bool shouldSink(const MachineInstr &MI) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.td b/llvm/lib/Target/DPU/DPUInstrInfo.td
index 6b89c0e906556..b923d56beddea 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.td
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.td
@@ -217,58 +217,66 @@ defm : WramStoreImmPat<store, SDrii, s16_i64_imm>;
 
 def : Pat<(i32 (trunc DoubleReg:$src)), (EXTRACT_SUBREG DoubleReg:$src, sub_32bit)>;
 
-let isMoveImm = 1, isAsCheapAsAMove = 0 in {
+let isMoveImm = 1, isAsCheapAsAMove = 0
+, usesCustomInserter = 1
+in {
   def MOVE64ri: PseudoDPUInstruction<
                     (outs GP64_REG:$dc), (ins i64imm:$imm),
                     "",
                     [(set i64:$dc, (i64 imm:$imm))]>;
 }
 
-let isAsCheapAsAMove = 0 in {
-def ADD64rr: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
-                  "",
-                  [(set i64:$dc, (add i64:$da, i64:$db))]>;
-
+let isAsCheapAsAMove = 0
+// , usesCustomInserter = 1
+in {
 def ADD64ri: PseudoDPUInstruction<
                   (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
                   "",
                   [(set i64:$dc, (add i64:$da, (i64 imm:$imm)))]>;
 
-def SUB64rr: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
-                  "",
-                  [(set i64:$dc, (sub i64:$da, i64:$db))]>;
-
-def OR64rr: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
+def AND64ri: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
                   "",
-                  [(set i64:$dc, (or i64:$da, i64:$db))]>;
+                  [(set i64:$dc, (and i64:$da, (i64 imm:$imm)))]>;
 
 def OR64ri: PseudoDPUInstruction<
                   (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
                   "",
                   [(set i64:$dc, (or i64:$da, (i64 imm:$imm)))]>;
 
+def XOR64ri: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
+                  "",
+                  [(set i64:$dc, (xor i64:$da, (i64 imm:$imm)))]>;
+}
+
+let isAsCheapAsAMove = 0
+// , usesCustomInserter = 1
+in {
+def ADD64rr: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
+                  "",
+                  [(set i64:$dc, (add i64:$da, i64:$db))]>;
+
 def AND64rr: PseudoDPUInstruction<
                   (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
                   "",
                   [(set i64:$dc, (and i64:$da, i64:$db))]>;
 
-def AND64ri: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
+def OR64rr: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
                   "",
-                  [(set i64:$dc, (and i64:$da, (i64 imm:$imm)))]>;
+                  [(set i64:$dc, (or i64:$da, i64:$db))]>;
 
-def XOR64rr: PseudoDPUInstruction<
+def SUB64rr: PseudoDPUInstruction<
                   (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
                   "",
-                  [(set i64:$dc, (xor i64:$da, i64:$db))]>;
+                  [(set i64:$dc, (sub i64:$da, i64:$db))]>;
 
-def XOR64ri: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
+def XOR64rr: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
                   "",
-                  [(set i64:$dc, (xor i64:$da, (i64 imm:$imm)))]>;
+                  [(set i64:$dc, (xor i64:$da, i64:$db))]>;
 }
 
 // Bit operations: 64 bits emulation.
@@ -454,26 +462,36 @@ def Jcci: PseudoDPUInstruction<
                 [(DPUBrCCi (i32 imm:$cc), i32:$ra, (s11_imm:$immediate), bb:$dst)]
                 >;
 
-def TmpJcci: PseudoDPUInstruction<
-                (outs), (ins ccopcode:$cc, OP_REG:$ra, s11_imm:$immediate, GP_REG:$dependency, pcoffset:$dst),
-                "",
-                []
-                >;
+// def TmpJcci: PseudoDPUInstruction<
+//                 (outs), (ins ccopcode:$cc, OP_REG:$ra, s11_imm:$immediate, GP_REG:$dependency, pcoffset:$dst),
+//                 "",
+//                 []
+//                 >;
 }
 
-let isBranch = 1, isTerminator = 1, isCompare = 1, isBarrier = 0, isIndirectBranch = 0, isAsCheapAsAMove = 0 in {
+let isBranch = 1, isTerminator = 1, isCompare = 1, isBarrier = 0, isIndirectBranch = 0, isAsCheapAsAMove = 0
+// , usesCustomInserter = 1
+in {
 def Jcc64: PseudoDPUInstruction<
                 (outs), (ins ccopcode:$cc, GP64_REG:$da, GP64_REG:$db, pcoffset:$dst),
                 "",
                 [(DPUBrCC (i32 imm:$cc), i64:$da, i64:$db, bb:$dst)]
                 >;
+
+// def Jcci64: PseudoDPUInstruction<
+//                 (outs), (ins ccopcode:$cc, GP64_REG:$da, s11_i64_imm:$immediate, pcoffset:$dst),
+//                 "",
+//                 [(DPUBrCCi (i32 imm:$cc), i64:$da, (s11_i64_imm:$immediate), bb:$dst)]
+//                 >;
 }
 
 // -----------------------------------------------------------------------------
 // SETCC
 // -----------------------------------------------------------------------------
 
-let isAsCheapAsAMove = 0 in {
+let isAsCheapAsAMove = 0
+, usesCustomInserter = 1
+in {
 def SET64cc: PseudoDPUInstruction<
                     (outs GP_REG:$rc), (ins ccopcode:$cc, GP64_REG:$lhs, GP64_REG:$rhs),
                     "",
@@ -634,3 +652,15 @@ let usesCustomInserter = 1 in {
     def MRAM_LOAD64_X32mr : MRAM_LOAD64_X_mr<mram_extloadi32>;
     def MRAM_LOAD_DOUBLEmr: MRAM_LOAD64_X_mr<mram_load>;
 }
+
+//===----------------------------------------------------------------------===//
+// Bit manipulation instructions
+//===----------------------------------------------------------------------===//
+
+// ((1 << n) - 1)
+def : Pat<(sub (shl (i32 1), GP_REG:$n), (i32 1)),
+          (LSLXrrr LNEG, GP_REG:$n)>;
+	  
+def : Pat<(xor (shl (i32 -1), GP_REG:$n), (i32 -1)),
+          (LSLXrrr LNEG, GP_REG:$n)>;
+// ====
diff --git a/llvm/lib/Target/DPU/DPUMacroFusion.cpp b/llvm/lib/Target/DPU/DPUMacroFusion.cpp
index 43655fc012e50..6a14246c852c0 100644
--- a/llvm/lib/Target/DPU/DPUMacroFusion.cpp
+++ b/llvm/lib/Target/DPU/DPUMacroFusion.cpp
@@ -56,7 +56,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
       });
     return false;
   case DPU::JUMPi:
-  case DPU::TmpJcci:
+  // case DPU::TmpJcci:
     break;
   case DPU::JNEQrii:
   case DPU::JEQrii:
diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index d5575207c6234..c96a23c933e17 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -247,17 +247,17 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
   default:
     LLVM_DEBUG(dbgs() << "KO: Unknown SecondLastOpc\n");
     return false;
-  // case DPU::MOVEri:
-  //   OpPrototype = OpriLimited;
-  //   OpJumpOpc = DPU::MOVErici;
-  //   OpNullJumpOpc = DPU::MOVErici; // should not be used
-  //   usableConditions = normalConditionsSet;
-  //   break;
-  // case DPU::MOVErr:
-  //   OpPrototype = Oprr;
-  //   OpJumpOpc = DPU::MOVErrci;
-  //   OpNullJumpOpc = DPU::MOVErrci; // should not be used
-  //   usableConditions = normalConditionsSet;
+  case DPU::MOVEri:
+    OpPrototype = OpriLimited;
+    OpJumpOpc = DPU::MOVErici;
+    OpNullJumpOpc = DPU::MOVErici; // should not be used
+    usableConditions = normalConditionsSet;
+    break;
+  case DPU::MOVErr:
+    OpPrototype = Oprr;
+    OpJumpOpc = DPU::MOVErrci;
+    OpNullJumpOpc = DPU::MOVErrci; // should not be used
+    usableConditions = normalConditionsSet;
     break;
   case DPU::SUBrrr:
     OpPrototype = Oprrr;
@@ -660,7 +660,7 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
     // we morph the branch from unconditional to conditional
     // by this, we modify the CFG by creating artificially a fall through which is not declared
     // so, it's bugged
-    // return false;
+    return false;
     // 
     
     if (!ImmCanBeEncodedOn8Bits) {
@@ -722,7 +722,7 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
     
     return true;
   }
-  case DPU::TmpJcci:
+  // case DPU::TmpJcci:
   case DPU::Jcci: {
     LLVM_DEBUG({
 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
@@ -792,6 +792,12 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
                "&& !isSourceCondition) && (!ImmCanBeEncodedOn11Bits)\n");
         return false;
       }
+      if (SecondLastOpc == DPU::MOVEri || SecondLastOpc == DPU::MOVErr) {
+	LLVM_DEBUG(
+            dbgs()
+            << "KO: move to zero is invalid\n");
+	return false;
+      }
       // todo: this is not optimal. One register has been allocated but not used
       // now. This can become an issue (unnecessary spilling)
       ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
diff --git a/llvm/lib/Target/DPU/DPUPostRAFusion.cpp b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp
index cae1aedaf03ef..a3cc5ab25e5d5 100644
--- a/llvm/lib/Target/DPU/DPUPostRAFusion.cpp
+++ b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp
@@ -72,8 +72,8 @@ static bool runOnMachineBB(MachineBasicBlock *MBB,
   MachineInstr *LastInst, *SecondLastInst;
   unsigned int LastOpc, SecondLastOpc;
 
-  LLVMContext &Context = MBB->getParent()->getFunction().getContext();
-  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  // LLVMContext &Context = MBB->getParent()->getFunction().getContext();
+  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
 
   LastInst = getLastNonDebugInstrFrom(I, REnd);
   if (LastInst == NULL) {
@@ -87,6 +87,10 @@ static bool runOnMachineBB(MachineBasicBlock *MBB,
     return false;
   }
 
+  if (!do_have_special_metadata(LastInst)
+      || !do_have_special_metadata(SecondLastInst))
+    return false;
+  
   LastOpc = LastInst->getOpcode();
   SecondLastOpc = SecondLastInst->getOpcode();
 
@@ -100,17 +104,19 @@ static bool runOnMachineBB(MachineBasicBlock *MBB,
   //     and why not tackle other possible optim that may have introduce this code
   //        event from user maybe
   // original code is JEQrii, but JNEQrii could be introduce by analyzeBranch
-  if ((LastOpc == DPU::JEQrii || LastOpc == DPU::JNEQrii) && do_have_special_metadata(LastInst)
-      && SecondLastOpc == DPU::ANDrri && do_have_special_metadata(SecondLastInst)) {
+  if ((LastOpc == DPU::JEQrii || LastOpc == DPU::JNEQrii) 
+      && SecondLastOpc == DPU::ANDrri) {
     I++;
     MachineInstr *ThirdLastInst = getLastNonDebugInstrFrom(I, REnd);
     if (ThirdLastInst == NULL) {
       // LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n");
       return false;
     }
+    if (!do_have_special_metadata(ThirdLastInst))
+      return false;
+    
     unsigned int ThirdLastOpc = ThirdLastInst->getOpcode();
-    if ((ThirdLastOpc == DPU::LSLXrrr || ThirdLastOpc == DPU::LSRXrrr)
-	&& do_have_special_metadata(ThirdLastInst)) {
+    if ((ThirdLastOpc == DPU::LSLXrrr || ThirdLastOpc == DPU::LSRXrrr)) {
 
       LLVM_DEBUG({
 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
@@ -121,13 +127,13 @@ static bool runOnMachineBB(MachineBasicBlock *MBB,
       unsigned int new_opcode = (ThirdLastOpc == DPU::LSLXrrr ?
 				 DPU::LSLXrrrci : DPU::LSRXrrrci);
       MachineInstrBuilder ComboInst = BuildMI(MBB, ThirdLastInst->getDebugLoc(),
-					      InstrInfo.get(new_opcode),
-					      ThirdLastInst->getOperand(0).getReg());
+					      InstrInfo.get(new_opcode));
+      ComboInst.add(ThirdLastInst->getOperand(0));
       ComboInst.add(ThirdLastInst->getOperand(1));
       ComboInst.add(ThirdLastInst->getOperand(2));
       ComboInst.addImm(DPUAsmCondition::Condition::Shift32);
       ComboInst.addMBB(LastInst->getOperand(2).getMBB());
-      ComboInst.addMetadata(N);
+      // ComboInst.addMetadata(N); now that we merge, we don't need to prohibit sinking
       
       LLVM_DEBUG({
 	  dbgs() << "OK\n";
@@ -152,8 +158,8 @@ static bool runOnMachineBB(MachineBasicBlock *MBB,
 
   // attempt to optimize MUL_UL_ULrrr + comp res 256 + branch
   // original code is JLTUrii, but JGEUrii could be introduce by analyzeBranch
-  if ((LastOpc == DPU::JLTUrii || LastOpc == DPU::JGEUrii) && do_have_special_metadata(LastInst)
-      && SecondLastOpc == DPU::MUL_UL_ULrrr && do_have_special_metadata(SecondLastInst)) {
+  if ((LastOpc == DPU::JLTUrii || LastOpc == DPU::JGEUrii)
+      && SecondLastOpc == DPU::MUL_UL_ULrrr) {
 
     LLVM_DEBUG({
 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
@@ -162,13 +168,13 @@ static bool runOnMachineBB(MachineBasicBlock *MBB,
       });
       
     MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
-					    InstrInfo.get(DPU::MUL_UL_ULrrrci),
-					    SecondLastInst->getOperand(0).getReg());
-    ComboInst.add(SecondLastInst->getOperand(1));
+					    InstrInfo.get(DPU::MUL_UL_ULrrrci));
+    ComboInst.add(SecondLastInst->getOperand(0));
     ComboInst.add(SecondLastInst->getOperand(1));
+    ComboInst.add(SecondLastInst->getOperand(2));
     ComboInst.addImm(DPUAsmCondition::Small);
-    ComboInst.addMBB(LastInst->getOperand(2).getMBB());
-    ComboInst.addMetadata(N);
+    ComboInst.add(LastInst->getOperand(2));
+    // ComboInst.addMetadata(N); now that we merge, we don't need to prohibit sinking
     
     LLVM_DEBUG({
 	dbgs() << "OK\n";
@@ -190,8 +196,8 @@ static bool runOnMachineBB(MachineBasicBlock *MBB,
   }
 
   // original code is JNEQrii, but JEQrii could be introduce by analyzeBranch
-  if ((LastOpc == DPU::JNEQrii || LastOpc == DPU::JEQrii) && do_have_special_metadata(LastInst)
-      && SecondLastOpc == DPU::CLZ_Urr && do_have_special_metadata(SecondLastInst)) {
+  if ((LastOpc == DPU::JNEQrii || LastOpc == DPU::JEQrii)
+      && SecondLastOpc == DPU::CLZ_Urr) {
 
     LLVM_DEBUG({
 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
@@ -200,12 +206,13 @@ static bool runOnMachineBB(MachineBasicBlock *MBB,
       });
 
     MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
-					    InstrInfo.get(DPU::CLZ_Urrci),
-					    SecondLastInst->getOperand(0).getReg());
+					    InstrInfo.get(DPU::CLZ_Urrci));
+    ComboInst.add(SecondLastInst->getOperand(0));
     ComboInst.add(SecondLastInst->getOperand(1));
-    ComboInst.addImm(DPUAsmCondition::Condition::NotMaximum);
-    ComboInst.addMBB(LastInst->getOperand(2).getMBB());
-    ComboInst.addMetadata(N);
+    ComboInst.addImm((LastOpc == DPU::JNEQrii) ?
+		     DPUAsmCondition::Condition::NotMaximum : DPUAsmCondition::Condition::Maximum);
+    ComboInst.add(LastInst->getOperand(2));
+    // ComboInst.addMetadata(N); now that we merge, we don't need to prohibit sinking
 
     LLVM_DEBUG({
 	dbgs() << "OK\n";
@@ -227,53 +234,6 @@ static bool runOnMachineBB(MachineBasicBlock *MBB,
     return true;
   }
 
-  // switch (SecondLastOpc) {
-  // default:
-  //   LLVM_DEBUG(dbgs() << "KO: Unknown SecondLastOpc\n");
-  //   return false;
-  // case DPU::CLZ_Urr: {
-  //   LLVM_DEBUG({
-  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
-  // 	dbgs() << "study CLZ_Urr to CLZ_Urrci\n";
-  // 	SecondLastInst->dump();
-  // 	LastInst->dump();
-  //     });
-    
-  //   bool do_def_reg_alias = false;
-  //   const TargetRegisterInfo *TRI = MBB->getParent()->getSubtarget().getRegisterInfo();
-  //   for (MCRegAliasIterator Alias(SecondLastInst->getOperand(0).getReg(), TRI, true); Alias.isValid(); ++Alias) {
-  //     Register AliasReg = *Alias;
-  //     if (LastInst->getOperand(0).getReg() == AliasReg) {
-  // 	// dbgs() << "yep it's alias\n";
-  // 	do_def_reg_alias = true;
-  //     }
-  //   }
-  //   if (LastInst->getOpcode() == DPU::JNEQrii
-  // 	&& LastInst->getOperand(1).getImm() == 32
-  // 	&& do_def_reg_alias
-  // 	) {
-  //     LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-      
-  //     MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(DPU::CLZ_Urrci), SecondLastInst->getOperand(0).getReg())
-  // 	.add(SecondLastInst->getOperand(1))
-  // 	.addImm(DPUAsmCondition::Condition::NotMaximum)
-  // 	.addMBB(LastInst->getOperand(2).getMBB());
-
-  //     LLVM_DEBUG({
-  // 	  dbgs() << "OK\n";
-  // 	  dbgs() << "del "; SecondLastInst->dump();
-  // 	  dbgs() << "del "; LastInst->dump();
-  // 	  dbgs() << "fused to\n";
-  // 	  dbgs() << "add "; ComboInst->dump();
-  // 	});
-  //     LastInst->eraseFromParent();
-  //     SecondLastInst->eraseFromParent();
-  //     LLVM_DEBUG({dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";});
-  //     return true;
-  //   }
-  // }
-  // }
-  
   return false;
 }
 
diff --git a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
index 4e5313f12050c..cdbe91cbc44d3 100644
--- a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
@@ -9,9 +9,13 @@
 
 // possibly move that earlier in the pipeline
 //   all simple arithmetic could be moved to in EmitInstrWithCustomInserter pre regalloc and other optim
+//   here I needed to add some options again, because we tweak it post-RA
+//       if we expressed them directly during ISel, we would benefit from more natural optimization earlier
+//          also, possibility of using FastISel and GlobalISel instead of InstructionSel ...
 
-// TODO: expand test cases for splicing
+// TODO: expand test cases for splicing stuff
 //       need_splice = 0/1  x  canFallThrough = 0/1
+//     and/or doing Jcc and Setcc earlier as well
 
 #include "DPU.h"
 #include "DPUInstrInfo.h"
@@ -189,15 +193,29 @@ static void resolve64BitRegisterAluInstruction(
   unsigned int LSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit);
   unsigned int MSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit_hi);
 
-  BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(LsbOpcode),
+  MachineInstrBuilder MIBDestLsb;
+  MIBDestLsb = BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(LsbOpcode),
           LSBDestReg)
       .addReg(LSBDOp1Reg)
       .addReg(LSBOp2Reg);
-  BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(MsbOpcode),
+
+  MachineInstrBuilder MIBDestMsb;
+  MIBDestMsb = BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(MsbOpcode),
           MSBDestReg)
       .addReg(MSBDOp1Reg)
       .addReg(MSBOp2Reg);
 
+  for (unsigned i = 0; i < 3; i++) {
+    if (MBBIter->getOperand(i).isRenamable()) {
+      MIBDestLsb->getOperand(i).setIsRenamable();
+      MIBDestMsb->getOperand(i).setIsRenamable();
+    }
+    if (MBBIter->getOperand(i).isKill()) {
+      MIBDestLsb->getOperand(i).setIsKill();
+      MIBDestMsb->getOperand(i).setIsKill();
+    }
+  }
+
   LLVM_DEBUG({
       dbgs() << "** instruction replaced, but still need removal\n";
       dbgs() << "** MBB: "; MBB->dump();
@@ -432,46 +450,84 @@ static void resolveJcc64(MachineBasicBlock *MBB,
     break;
   case ISD::SETOGT:
   case ISD::SETGT:
+    LLVM_DEBUG({ dbgs() << "GT " << ISD::SETOGT << " " << ISD::SETGT << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::ExtendedGreaterThanSigned);
     break;
   case ISD::SETOGE:
   case ISD::SETGE:
+    LLVM_DEBUG({ dbgs() << "GE " << ISD::SETOGE << " " << ISD::SETGE << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::GreaterOrEqualSigned);
     break;
   case ISD::SETOLT:
   case ISD::SETLT:
+    LLVM_DEBUG({ dbgs() << "LT " << ISD::SETOLT << " " << ISD::SETLT << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::LessThanSigned);
     break;
   case ISD::SETOLE:
   case ISD::SETLE:
+    LLVM_DEBUG({ dbgs() << "LE " << ISD::SETOLE << " " << ISD::SETLE << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::ExtendedLessOrEqualSigned);
     break;
   case ISD::SETUGT:
-    resolveJcc64AsSub64(
-        MBB, MBBIter, InstrInfo,
-        DPUAsmCondition::Condition::ExtendedGreaterThanUnsigned);
+    LLVM_DEBUG({ dbgs() << "UGT " << ISD::SETUGT << "\n"; });
+    resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
+			DPUAsmCondition::Condition::ExtendedGreaterThanUnsigned);
+    
     break;
   case ISD::SETUGE:
+    LLVM_DEBUG({ dbgs() << "UGE " << ISD::SETUGE << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::GreaterOrEqualUnsigned);
     break;
   case ISD::SETULT:
+    LLVM_DEBUG({ dbgs() << "ULT " << ISD::SETULT << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::LessThanUnsigned);
     break;
   case ISD::SETULE:
-    resolveJcc64AsSub64(
-        MBB, MBBIter, InstrInfo,
-        DPUAsmCondition::Condition::ExtendedLessOrEqualUnsigned);
+    LLVM_DEBUG({ dbgs() << "ULE " << ISD::SETULE << "\n"; });
+    resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
+			DPUAsmCondition::Condition::ExtendedLessOrEqualUnsigned);
     break;
   }
 }
 
-static void resolveMOVE64rr(MachineBasicBlock *MBB,
+static void resolveJcci64(MachineBasicBlock *MBB,
+		       MachineBasicBlock::iterator MBBIter,
+		       const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  unsigned int OpCode =
+    findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), true);
+  const MachineInstrBuilder &MIB =
+    BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode));
+  MIB.add(MBBIter->getOperand(1)).add(MBBIter->getOperand(2));
+
+  for (unsigned int i = MBBIter->getNumOperands() - 1; i >= 3; --i) {
+    MachineOperand &Operand = MBBIter->getOperand(i);
+
+    if (Operand.isMBB()) {
+      MIB.add(Operand);
+      break;
+    }
+  }
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
+}
+
+static void resolveMOVE64ri(MachineBasicBlock *MBB,
 			    MachineBasicBlock::iterator MBBIter,
 			    const DPUInstrInfo &InstrInfo) {
   LLVM_DEBUG({
@@ -519,8 +575,7 @@ static void resolveSET64cc(MachineBasicBlock *MBB,
   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
 
   unsigned int DestReg = MBBIter->getOperand(0).getReg();
-  auto ImmCond = static_cast<DPUAsmCondition::Condition>(
-							 MBBIter->getOperand(1).getImm());
+  auto ImmCond = static_cast<DPUAsmCondition::Condition>(MBBIter->getOperand(1).getImm());
   unsigned int Op1Reg = MBBIter->getOperand(2).getReg();
   unsigned int Op2Reg = MBBIter->getOperand(3).getReg();
 
@@ -620,7 +675,7 @@ static bool resolveMacroInstructionsInMBB(MachineBasicBlock *MBB,
       resolveJcc(MBB, MBBIter, InstrInfo);
       break;
 
-    case DPU::TmpJcci:
+    // case DPU::TmpJcci:
     case DPU::Jcci:
       resolveJcci(MBB, MBBIter, InstrInfo);
       break;
@@ -629,50 +684,56 @@ static bool resolveMacroInstructionsInMBB(MachineBasicBlock *MBB,
       resolveJcc64(MBB, MBBIter, InstrInfo);
       break;
 
-    case DPU::SET64cc:
-      resolveSET64cc(MBB, MBBIter, InstrInfo);
-      break;
-
-    case DPU::MOVE64ri:
-      resolveMOVE64rr(MBB, MBBIter, InstrInfo);
-      break;
+    // case DPU::Jcci64:
+    //   resolveJcci64(MBB, MBBIter, InstrInfo);
+    //   break;
+      
+    // case DPU::SET64cc:
+    //   resolveSET64cc(MBB, MBBIter, InstrInfo);
+    //   break;
+
+    // case DPU::MOVE64ri:
+    //   resolveMOVE64ri(MBB, MBBIter, InstrInfo);
+    //   break;
+
+    // case DPU::ADD64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrrr,
+    //                                      DPU::ADDCrrr);
+    //   break;
+    // case DPU::AND64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrrr,
+    //                                      DPU::ANDrrr);
+    //   break;
+    // case DPU::OR64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrrr,
+    //                                      DPU::ORrrr);
+    //   break;
+    // case DPU::SUB64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::SUBrrr,
+    //                                      DPU::SUBCrrr);
+    //   break;
+    // case DPU::XOR64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrrr,
+    //                                      DPU::XORrrr);
+    //   break;
+
+    // case DPU::AND64ri:
+    //   resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrri,
+    //                                       DPU::ANDrri);
+    //   break;
+    // case DPU::ADD64ri:
+    //   resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrri,
+    //                                       DPU::ADDCrri);
+    //   break;
+    // case DPU::OR64ri:
+    //   resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrri,
+    //                                       DPU::ORrri);
+    //   break;
+    // case DPU::XOR64ri:
+    //   resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrri,
+    //                                       DPU::XORrri);
+    //   break;
 
-    case DPU::ADD64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrrr,
-                                         DPU::ADDCrrr);
-      break;
-    case DPU::ADD64ri:
-      resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrri,
-                                          DPU::ADDCrri);
-      break;
-    case DPU::SUB64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::SUBrrr,
-                                         DPU::SUBCrrr);
-      break;
-    case DPU::OR64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrrr,
-                                         DPU::ORrrr);
-      break;
-    case DPU::OR64ri:
-      resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrri,
-                                          DPU::ORrri);
-      break;
-    case DPU::AND64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrrr,
-                                         DPU::ANDrrr);
-      break;
-    case DPU::AND64ri:
-      resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrri,
-                                          DPU::ANDrri);
-      break;
-    case DPU::XOR64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrrr,
-                                         DPU::XORrrr);
-      break;
-    case DPU::XOR64ri:
-      resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrri,
-                                          DPU::XORrri);
-      break;
     }
 
     if (InstrModified) {
diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
index ffdd77b035c2b..0b220529df968 100644
--- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
@@ -89,10 +89,10 @@ DPUTargetLowering::DPUTargetLowering(const TargetMachine &TM, DPUSubtarget &STI)
   PredictableSelectIsExpensive = true;
   setJumpIsExpensive(false);
 
-  setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
-  setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
-  setLibcallName(RTLIB::SDIV_I32, "__div32");
-  setLibcallName(RTLIB::UDIV_I32, "__udiv32");
+  // setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+  // setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  // setLibcallName(RTLIB::SDIV_I32, "__div32");
+  // setLibcallName(RTLIB::UDIV_I32, "__udiv32");
 
   // Set up the register classes.
   addRegisterClass(MVT::i32, &DPU::GP_REGRegClass);
@@ -227,7 +227,7 @@ DPUTargetLowering::DPUTargetLowering(const TargetMachine &TM, DPUSubtarget &STI)
   setOperationAction(ISD::BR_CC, MVT::i16, Expand);
   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
-
+  
   setOperationAction(ISD::ADDC, MVT::i8, Expand);
   setOperationAction(ISD::ADDC, MVT::i16, Expand);
   setOperationAction(ISD::ADDC, MVT::i32, Expand);
@@ -384,17 +384,18 @@ SDValue DPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 
   default: {
     LLVM_DEBUG({
-      dbgs() << "FAIL: ";
-      Op.dump(&DAG);
-      dbgs() << "\n";
-      const char *NodeName = getTargetNodeName(Op.getOpcode());
-      if (NodeName != nullptr) {
-        dbgs() << "\tnode name = " << NodeName << "\n";
-      }
-      for (unsigned eachOp = 0; eachOp < Op.getNumOperands(); eachOp++) {
-        dbgs() << "\toperand #" << std::to_string(eachOp) << " = ";
-        Op.getOperand(eachOp).dump(&DAG);
-      }
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "FAIL: ";
+	Op.dump(&DAG);
+	dbgs() << "\n";
+	const char *NodeName = getTargetNodeName(Op.getOpcode());
+	if (NodeName != nullptr) {
+	  dbgs() << "\tnode name = " << NodeName << "\n";
+	}
+	for (unsigned eachOp = 0; eachOp < Op.getNumOperands(); eachOp++) {
+	  dbgs() << "\toperand #" << std::to_string(eachOp) << " = ";
+	  Op.getOperand(eachOp).dump(&DAG);
+	}
       });
     report_fatal_error("NOT implemented: lowering of such a type of SDValue");
   }
@@ -433,18 +434,18 @@ const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "DPUISD::BrCC";
   case DPUISD::BrCCi:
     return "DPUISD::BrCCi";
-  case DPUISD::BrCCZero:
-    return "DPUISD::BrCCZero";
-  case DPUISD::OrJCCZero:
-    return "DPUISD::OrJCCZero";
-  case DPUISD::AndJCCZero:
-    return "DPUISD::AndJCCZero";
-  case DPUISD::XorJCCZero:
-    return "DPUISD::XorJCCZero";
-  case DPUISD::AddJCCZero:
-    return "DPUISD::AddJCCZero";
-  case DPUISD::SubJCCZero:
-    return "DPUISD::SubJCCZero";
+  // case DPUISD::BrCCZero:
+  //   return "DPUISD::BrCCZero";
+  // case DPUISD::OrJCCZero:
+  //   return "DPUISD::OrJCCZero";
+  // case DPUISD::AndJCCZero:
+  //   return "DPUISD::AndJCCZero";
+  // case DPUISD::XorJCCZero:
+  //   return "DPUISD::XorJCCZero";
+  // case DPUISD::AddJCCZero:
+  //   return "DPUISD::AddJCCZero";
+  // case DPUISD::SubJCCZero:
+  //   return "DPUISD::SubJCCZero";
   case DPUISD::Wrapper:
     return "DPUISD::Wrapper";
   case DPUISD::TRUNC64:
@@ -491,12 +492,12 @@ const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "DPUISD::MUL16_SU";
   case DPUISD::MUL16_SS:
     return "DPUISD::MUL16_SS";
-  case DPUISD::Addc:
-    return "DPUISD::Addc";
-  case DPUISD::Subc:
-    return "DPUISD::Subc";
-  case DPUISD::Rsubc:
-    return "DPUISD::Rsubc";
+  // case DPUISD::Addc:
+  //   return "DPUISD::Addc";
+  // case DPUISD::Subc:
+  //   return "DPUISD::Subc";
+  // case DPUISD::Rsubc:
+  //   return "DPUISD::Rsubc";
   case DPUISD::Clo:
     return "DPUISD::Clo";
   case DPUISD::Cls:
@@ -515,154 +516,154 @@ const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "DPUISD::Lsr1x";
   case DPUISD::LslAdd:
     return "DPUISD::LslAdd";
-  case DPUISD::AddJcc:
-    return "DPUISD::AddJcc";
-  case DPUISD::AddNullJcc:
-    return "DPUISD::AddNullJcc";
-  case DPUISD::AddcJcc:
-    return "DPUISD::AddcJcc";
-  case DPUISD::AddcNullJcc:
-    return "DPUISD::AddcNullJcc";
-  case DPUISD::AndJcc:
-    return "DPUISD::AndJcc";
-  case DPUISD::AndNullJcc:
-    return "DPUISD::AndNullJcc";
-  case DPUISD::OrJcc:
-    return "DPUISD::OrJcc";
-  case DPUISD::OrNullJcc:
-    return "DPUISD::OrNullJcc";
-  case DPUISD::XorJcc:
-    return "DPUISD::XorJcc";
-  case DPUISD::XorNullJcc:
-    return "DPUISD::XorNullJcc";
-  case DPUISD::NandJcc:
-    return "DPUISD::NandJcc";
-  case DPUISD::NandNullJcc:
-    return "DPUISD::NandNullJcc";
-  case DPUISD::NorJcc:
-    return "DPUISD::NorJcc";
-  case DPUISD::NorNullJcc:
-    return "DPUISD::NorNullJcc";
-  case DPUISD::NxorJcc:
-    return "DPUISD::NxorJcc";
-  case DPUISD::NxorNullJcc:
-    return "DPUISD::NxorNullJcc";
-  case DPUISD::AndnJcc:
-    return "DPUISD::AndnJcc";
-  case DPUISD::AndnNullJcc:
-    return "DPUISD::AndnNullJcc";
-  case DPUISD::OrnJcc:
-    return "DPUISD::OrnJcc";
-  case DPUISD::OrnNullJcc:
-    return "DPUISD::OrnNullJcc";
-  case DPUISD::LslJcc:
-    return "DPUISD::LslJcc";
-  case DPUISD::LslNullJcc:
-    return "DPUISD::LslNullJcc";
-  case DPUISD::LslxJcc:
-    return "DPUISD::LslxJcc";
-  case DPUISD::LslxNullJcc:
-    return "DPUISD::LslxNullJcc";
-  case DPUISD::Lsl1Jcc:
-    return "DPUISD::Lsl1Jcc";
-  case DPUISD::Lsl1NullJcc:
-    return "DPUISD::Lsl1NullJcc";
-  case DPUISD::Lsl1xJcc:
-    return "DPUISD::Lsl1xJcc";
-  case DPUISD::Lsl1xNullJcc:
-    return "DPUISD::Lsl1xNullJcc";
-  case DPUISD::LsrJcc:
-    return "DPUISD::LsrJcc";
-  case DPUISD::LsrNullJcc:
-    return "DPUISD::LsrNullJcc";
-  case DPUISD::LsrxJcc:
-    return "DPUISD::LsrxJcc";
-  case DPUISD::LsrxNullJcc:
-    return "DPUISD::LsrxNullJcc";
-  case DPUISD::Lsr1Jcc:
-    return "DPUISD::Lsr1Jcc";
-  case DPUISD::Lsr1NullJcc:
-    return "DPUISD::Lsr1NullJcc";
-  case DPUISD::Lsr1xJcc:
-    return "DPUISD::Lsr1xJcc";
-  case DPUISD::Lsr1xNullJcc:
-    return "DPUISD::Lsr1xNullJcc";
-  case DPUISD::AsrJcc:
-    return "DPUISD::AsrJcc";
-  case DPUISD::AsrNullJcc:
-    return "DPUISD::AsrNullJcc";
-  case DPUISD::RolJcc:
-    return "DPUISD::RolJcc";
-  case DPUISD::RolNullJcc:
-    return "DPUISD::RolNullJcc";
-  case DPUISD::RorJcc:
-    return "DPUISD::RorJcc";
-  case DPUISD::RorNullJcc:
-    return "DPUISD::RorNullJcc";
-  case DPUISD::MUL8_UUJcc:
-    return "DPUISD::MUL8_UUJcc";
-  case DPUISD::MUL8_UUNullJcc:
-    return "DPUISD::MUL8_UUNullJcc";
-  case DPUISD::MUL8_SUJcc:
-    return "DPUISD::MUL8_SUJcc";
-  case DPUISD::MUL8_SUNullJcc:
-    return "DPUISD::MUL8_SUNullJcc";
-  case DPUISD::MUL8_SSJcc:
-    return "DPUISD::MUL8_SSJcc";
-  case DPUISD::MUL8_SSNullJcc:
-    return "DPUISD::MUL8_SSNullJcc";
-  case DPUISD::SubJcc:
-    return "DPUISD::SubJcc";
-  case DPUISD::SubNullJcc:
-    return "DPUISD::SubNullJcc";
-  case DPUISD::RsubJcc:
-    return "DPUISD::RsubJcc";
-  case DPUISD::RsubNullJcc:
-    return "DPUISD::RsubNullJcc";
-  case DPUISD::SubcJcc:
-    return "DPUISD::SubcJcc";
-  case DPUISD::SubcNullJcc:
-    return "DPUISD::SubcNullJcc";
-  case DPUISD::RsubcJcc:
-    return "DPUISD::RsubcJcc";
-  case DPUISD::RsubcNullJcc:
-    return "DPUISD::RsubcNullJcc";
-  case DPUISD::CaoJcc:
-    return "DPUISD::CaoJcc";
-  case DPUISD::CaoNullJcc:
-    return "DPUISD::CaoNullJcc";
-  case DPUISD::ClzJcc:
-    return "DPUISD::ClzJcc";
-  case DPUISD::ClzNullJcc:
-    return "DPUISD::ClzNullJcc";
-  case DPUISD::CloJcc:
-    return "DPUISD::CloJcc";
-  case DPUISD::CloNullJcc:
-    return "DPUISD::CloNullJcc";
-  case DPUISD::ClsJcc:
-    return "DPUISD::ClsJcc";
-  case DPUISD::ClsNullJcc:
-    return "DPUISD::ClsNullJcc";
-  case DPUISD::MoveJcc:
-    return "DPUISD::MoveJcc";
-  case DPUISD::MoveNullJcc:
-    return "DPUISD::MoveNullJcc";
-  case DPUISD::RolAddJcc:
-    return "DPUISD::RolAddJcc";
-  case DPUISD::RolAddNullJcc:
-    return "DPUISD::RolAddNullJcc";
-  case DPUISD::LsrAddJcc:
-    return "DPUISD::LsrAddJcc";
-  case DPUISD::LsrAddNullJcc:
-    return "DPUISD::LsrAddNullJcc";
-  case DPUISD::LslAddJcc:
-    return "DPUISD::LslAddJcc";
-  case DPUISD::LslAddNullJcc:
-    return "DPUISD::LslAddNullJcc";
-  case DPUISD::LslSubJcc:
-    return "DPUISD::LslSubJcc";
-  case DPUISD::LslSubNullJcc:
-    return "DPUISD::LslSubNullJcc";
+  // case DPUISD::AddJcc:
+  //   return "DPUISD::AddJcc";
+  // case DPUISD::AddNullJcc:
+  //   return "DPUISD::AddNullJcc";
+  // case DPUISD::AddcJcc:
+  //   return "DPUISD::AddcJcc";
+  // case DPUISD::AddcNullJcc:
+  //   return "DPUISD::AddcNullJcc";
+  // case DPUISD::AndJcc:
+  //   return "DPUISD::AndJcc";
+  // case DPUISD::AndNullJcc:
+  //   return "DPUISD::AndNullJcc";
+  // case DPUISD::OrJcc:
+  //   return "DPUISD::OrJcc";
+  // case DPUISD::OrNullJcc:
+  //   return "DPUISD::OrNullJcc";
+  // case DPUISD::XorJcc:
+  //   return "DPUISD::XorJcc";
+  // case DPUISD::XorNullJcc:
+  //   return "DPUISD::XorNullJcc";
+  // case DPUISD::NandJcc:
+  //   return "DPUISD::NandJcc";
+  // case DPUISD::NandNullJcc:
+  //   return "DPUISD::NandNullJcc";
+  // case DPUISD::NorJcc:
+  //   return "DPUISD::NorJcc";
+  // case DPUISD::NorNullJcc:
+  //   return "DPUISD::NorNullJcc";
+  // case DPUISD::NxorJcc:
+  //   return "DPUISD::NxorJcc";
+  // case DPUISD::NxorNullJcc:
+  //   return "DPUISD::NxorNullJcc";
+  // case DPUISD::AndnJcc:
+  //   return "DPUISD::AndnJcc";
+  // case DPUISD::AndnNullJcc:
+  //   return "DPUISD::AndnNullJcc";
+  // case DPUISD::OrnJcc:
+  //   return "DPUISD::OrnJcc";
+  // case DPUISD::OrnNullJcc:
+  //   return "DPUISD::OrnNullJcc";
+  // case DPUISD::LslJcc:
+  //   return "DPUISD::LslJcc";
+  // case DPUISD::LslNullJcc:
+  //   return "DPUISD::LslNullJcc";
+  // case DPUISD::LslxJcc:
+  //   return "DPUISD::LslxJcc";
+  // case DPUISD::LslxNullJcc:
+  //   return "DPUISD::LslxNullJcc";
+  // case DPUISD::Lsl1Jcc:
+  //   return "DPUISD::Lsl1Jcc";
+  // case DPUISD::Lsl1NullJcc:
+  //   return "DPUISD::Lsl1NullJcc";
+  // case DPUISD::Lsl1xJcc:
+  //   return "DPUISD::Lsl1xJcc";
+  // case DPUISD::Lsl1xNullJcc:
+  //   return "DPUISD::Lsl1xNullJcc";
+  // case DPUISD::LsrJcc:
+  //   return "DPUISD::LsrJcc";
+  // case DPUISD::LsrNullJcc:
+  //   return "DPUISD::LsrNullJcc";
+  // case DPUISD::LsrxJcc:
+  //   return "DPUISD::LsrxJcc";
+  // case DPUISD::LsrxNullJcc:
+  //   return "DPUISD::LsrxNullJcc";
+  // case DPUISD::Lsr1Jcc:
+  //   return "DPUISD::Lsr1Jcc";
+  // case DPUISD::Lsr1NullJcc:
+  //   return "DPUISD::Lsr1NullJcc";
+  // case DPUISD::Lsr1xJcc:
+  //   return "DPUISD::Lsr1xJcc";
+  // case DPUISD::Lsr1xNullJcc:
+  //   return "DPUISD::Lsr1xNullJcc";
+  // case DPUISD::AsrJcc:
+  //   return "DPUISD::AsrJcc";
+  // case DPUISD::AsrNullJcc:
+  //   return "DPUISD::AsrNullJcc";
+  // case DPUISD::RolJcc:
+  //   return "DPUISD::RolJcc";
+  // case DPUISD::RolNullJcc:
+  //   return "DPUISD::RolNullJcc";
+  // case DPUISD::RorJcc:
+  //   return "DPUISD::RorJcc";
+  // case DPUISD::RorNullJcc:
+  //   return "DPUISD::RorNullJcc";
+  // case DPUISD::MUL8_UUJcc:
+  //   return "DPUISD::MUL8_UUJcc";
+  // case DPUISD::MUL8_UUNullJcc:
+  //   return "DPUISD::MUL8_UUNullJcc";
+  // case DPUISD::MUL8_SUJcc:
+  //   return "DPUISD::MUL8_SUJcc";
+  // case DPUISD::MUL8_SUNullJcc:
+  //   return "DPUISD::MUL8_SUNullJcc";
+  // case DPUISD::MUL8_SSJcc:
+  //   return "DPUISD::MUL8_SSJcc";
+  // case DPUISD::MUL8_SSNullJcc:
+  //   return "DPUISD::MUL8_SSNullJcc";
+  // case DPUISD::SubJcc:
+  //   return "DPUISD::SubJcc";
+  // case DPUISD::SubNullJcc:
+  //   return "DPUISD::SubNullJcc";
+  // case DPUISD::RsubJcc:
+  //   return "DPUISD::RsubJcc";
+  // case DPUISD::RsubNullJcc:
+  //   return "DPUISD::RsubNullJcc";
+  // case DPUISD::SubcJcc:
+  //   return "DPUISD::SubcJcc";
+  // case DPUISD::SubcNullJcc:
+  //   return "DPUISD::SubcNullJcc";
+  // case DPUISD::RsubcJcc:
+  //   return "DPUISD::RsubcJcc";
+  // case DPUISD::RsubcNullJcc:
+  //   return "DPUISD::RsubcNullJcc";
+  // case DPUISD::CaoJcc:
+  //   return "DPUISD::CaoJcc";
+  // case DPUISD::CaoNullJcc:
+  //   return "DPUISD::CaoNullJcc";
+  // case DPUISD::ClzJcc:
+  //   return "DPUISD::ClzJcc";
+  // case DPUISD::ClzNullJcc:
+  //   return "DPUISD::ClzNullJcc";
+  // case DPUISD::CloJcc:
+  //   return "DPUISD::CloJcc";
+  // case DPUISD::CloNullJcc:
+  //   return "DPUISD::CloNullJcc";
+  // case DPUISD::ClsJcc:
+  //   return "DPUISD::ClsJcc";
+  // case DPUISD::ClsNullJcc:
+  //   return "DPUISD::ClsNullJcc";
+  // case DPUISD::MoveJcc:
+  //   return "DPUISD::MoveJcc";
+  // case DPUISD::MoveNullJcc:
+  //   return "DPUISD::MoveNullJcc";
+  // case DPUISD::RolAddJcc:
+  //   return "DPUISD::RolAddJcc";
+  // case DPUISD::RolAddNullJcc:
+  //   return "DPUISD::RolAddNullJcc";
+  // case DPUISD::LsrAddJcc:
+  //   return "DPUISD::LsrAddJcc";
+  // case DPUISD::LsrAddNullJcc:
+  //   return "DPUISD::LsrAddNullJcc";
+  // case DPUISD::LslAddJcc:
+  //   return "DPUISD::LslAddJcc";
+  // case DPUISD::LslAddNullJcc:
+  //   return "DPUISD::LslAddNullJcc";
+  // case DPUISD::LslSubJcc:
+  //   return "DPUISD::LslSubJcc";
+  // case DPUISD::LslSubNullJcc:
+  //   return "DPUISD::LslSubNullJcc";
   case DPUISD::TEST_NODE:
     return "DPUISD::TEST_NODE";
   }
@@ -1737,11 +1738,24 @@ SDValue DPUTargetLowering::LowerBrCc(SDValue Op, SelectionDAG &DAG) const {
 
   // First, let's determine if there is a constant operand we can keep as
   // immediate.
+  ConstantSDNode *LC = dyn_cast<ConstantSDNode>(leftOp);
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(rightOp);
-
+  LLVM_DEBUG({
+      dbgs() << "leftOp "; leftOp->dump();
+      dbgs() << "rightOp "; rightOp->dump();
+      if (LC) {
+	dbgs() << "a const: "; LC->dump();
+      }
+      
+      if (C) {
+	dbgs() << "a const: "; C->dump();
+      }
+    });
+  
   // todo: handle 64bit compare with immediate
-  if (!(C && isLegalICmpImmediate(C->getSExtValue())) ||
-      (rightOp.getValueType().getSimpleVT().SimpleTy == MVT::i64)) {
+  if (!(C && isLegalICmpImmediate(C->getSExtValue()))
+      || (rightOp.getValueType().getSimpleVT().SimpleTy == MVT::i64)
+      ) {
     // No suitable constant found. We cannot do anything special.
     SDValue Chain = Op.getOperand(0);
     SDLoc dl(Op);
@@ -2067,25 +2081,25 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   unsigned int LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned int LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  // LLVMContext &Context = F->getFunction().getContext();
-  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
-  BuildMI(BB, dl, TII.get(MulLL), LLDest)
-      .addReg(Op1)
-      .addReg(Op2)
-      .addImm(DPUAsmCondition::Small)
-      .addMBB(fastMBB)
-    // .addMetadata(N)
-    ;
-
-  // BuildMI(BB, dl, TII.get(DPU::MUL_UL_ULrrr), LLDest)
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  // BuildMI(BB, dl, TII.get(MulLL), LLDest)
   //     .addReg(Op1)
   //     .addReg(Op2)
-  //   .addMetadata(N);
-  // BuildMI(BB, dl, TII.get(DPU::JLTUrii))
-  //   .addReg(LLDest)
-  //   .addImm(0x100)
-  //   .addMBB(fastMBB)
-  //   .addMetadata(N);
+  //     .addImm(DPUAsmCondition::Small)
+  //     .addMBB(fastMBB)
+  //   // .addMetadata(N)
+  //   ;
+
+  BuildMI(BB, dl, TII.get(DPU::MUL_UL_ULrrr), LLDest)
+      .addReg(Op1)
+      .addReg(Op2)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JLTUrii))
+    .addReg(LLDest)
+    .addImm(0x100)
+    .addMBB(fastMBB)
+    .addMetadata(N);
   
   BuildMI(slowMBB, dl, TII.get(MulHL), HLDest).addReg(Op1).addReg(Op2);
   BuildMI(slowMBB, dl, TII.get(DPU::LSL_ADDrrri), LSL1Dest)
@@ -2127,14 +2141,20 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
 
 static MachineBasicBlock *EmitSelectWithCustomInserter(MachineInstr &MI,
                                                        MachineBasicBlock *BB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction::iterator I = ++BB->getIterator();
   MachineFunction *F = BB->getParent();
-  MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *falseMBB = F->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, trueMBB);
+  F->insert(I, falseMBB);
   F->insert(I, endMBB);
   // Update machine-CFG edges by transferring all successors of the current
   // block to the new block which will contain the Phi node for the select.
@@ -2142,81 +2162,96 @@ static MachineBasicBlock *EmitSelectWithCustomInserter(MachineInstr &MI,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
   endMBB->transferSuccessorsAndUpdatePHIs(BB);
   // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(trueMBB);
+  BB->addSuccessor(falseMBB);
   BB->addSuccessor(endMBB);
-  trueMBB->addSuccessor(endMBB);
+  falseMBB->addSuccessor(endMBB);
 
   unsigned int Dest = MI.getOperand(0).getReg();
   unsigned int CondReg = MI.getOperand(1).getReg();
   unsigned int TrueReg = MI.getOperand(2).getReg();
   unsigned int FalseReg = MI.getOperand(3).getReg();
 
-  MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned FalseResultReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-
-  BuildMI(BB, dl, TII.get(DPU::ORrrr), FalseResultReg)
-      .addReg(CondReg)
-      .addReg(FalseReg);
-
-  BuildMI(BB, dl, TII.get(DPU::TmpJcci))
-      .addImm(ISD::CondCode::SETEQ)
+  BuildMI(BB, dl, TII.get(DPU::JEQrii))
       .addReg(CondReg)
-      .addImm(0)
-      .addReg(FalseResultReg)
+      .addImm(1)
       .addMBB(endMBB);
 
+  BuildMI(falseMBB, dl, TII.get(DPU::JUMPi))
+      .addMBB(endMBB);
+  
   BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
-      .addReg(TrueReg)
-      .addMBB(trueMBB)
-      .addReg(FalseResultReg)
-      .addMBB(BB);
+    .addReg(TrueReg).addMBB(BB)
+    .addReg(FalseReg).addMBB(falseMBB);
 
   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** falseMBB: "; falseMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      dbgs() << "****** \n";
+    });
   return endMBB;
 }
 
-static MachineBasicBlock *
-EmitSelect64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
-  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
-  DebugLoc dl = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = ++BB->getIterator();
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, trueMBB);
-  F->insert(I, endMBB);
-  // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), BB,
-                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(BB);
-  // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(trueMBB);
-  BB->addSuccessor(endMBB);
-
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int CondReg = MI.getOperand(1).getReg();
-  unsigned int TrueReg = MI.getOperand(2).getReg();
-  unsigned int FalseReg = MI.getOperand(3).getReg();
-
-  BuildMI(BB, dl, TII.get(DPU::Jcci))
-      .addImm(ISD::CondCode::SETEQ)
-      .addReg(CondReg)
-      .addImm(0)
-      .addMBB(endMBB);
-
-  trueMBB->addSuccessor(endMBB);
+// static MachineBasicBlock *
+// EmitSelect64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
+//   LLVM_DEBUG({
+//       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+//       dbgs() << "instruction to replace: "; MI.dump();
+//       dbgs() << "** BB: "; BB->dump();
+//       dbgs() << "****** \n";
+//     });
+//   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+//   DebugLoc dl = MI.getDebugLoc();
+//   const BasicBlock *LLVM_BB = BB->getBasicBlock();
+//   MachineFunction::iterator I = ++BB->getIterator();
+//   MachineFunction *F = BB->getParent();
+//   MachineBasicBlock *falseMBB = F->CreateMachineBasicBlock(LLVM_BB);
+//   MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+//   F->insert(I, falseMBB);
+//   F->insert(I, endMBB);
+//   // Update machine-CFG edges by transferring all successors of the current
+//   // block to the new block which will contain the Phi node for the select.
+//   endMBB->splice(endMBB->begin(), BB,
+//                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+//   endMBB->transferSuccessorsAndUpdatePHIs(BB);
+//   // Next, add the true and fallthrough blocks as its successors.
+//   BB->addSuccessor(falseMBB);
+//   BB->addSuccessor(endMBB);
+//   falseMBB->addSuccessor(endMBB);
 
-  BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
-      .addReg(TrueReg)
-      .addMBB(trueMBB)
-      .addReg(FalseReg)
-      .addMBB(BB);
+//   unsigned int Dest = MI.getOperand(0).getReg();
+//   unsigned int CondReg = MI.getOperand(1).getReg();
+//   unsigned int TrueReg = MI.getOperand(2).getReg();
+//   unsigned int FalseReg = MI.getOperand(3).getReg();
+
+//   BuildMI(BB, dl, TII.get(DPU::Jcci))
+//       .addImm(ISD::CondCode::SETEQ)
+//       .addReg(CondReg)
+//       .addImm(1)
+//       .addMBB(endMBB);
+
+//   BuildMI(falseMBB, dl, TII.get(DPU::JUMPi))
+//       .addMBB(endMBB);
+  
+//   BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
+//     .addReg(TrueReg).addMBB(BB)
+//     .addReg(FalseReg).addMBB(falseMBB);
 
-  MI.eraseFromParent(); // The pseudo instruction is gone now.
-  return endMBB;
-}
+//   MI.eraseFromParent(); // The pseudo instruction is gone now.
+//   LLVM_DEBUG({
+//       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+//       dbgs() << "instruction replaced\n";
+//       dbgs() << "** BB: "; BB->dump();
+//       dbgs() << "** falseMBB: "; falseMBB->dump();
+//       dbgs() << "** endMBB: "; endMBB->dump();
+//       dbgs() << "****** \n";
+//     });
+//   return endMBB;
+// }
 
 static MachineBasicBlock *
 EmitMramSubStoreWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
@@ -2464,8 +2499,8 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned Undef2Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
-  // LLVMContext &Context = F->getFunction().getContext();
-  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
 
   // BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg)
   //     .addReg(Op1Reg, 0, DPU::sub_32bit);
@@ -2473,14 +2508,14 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   // unsigned DummyReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   
   /// faulty
-  BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
-      // .addReg(LsbOp1Reg)
-    .addReg(Op1Reg, 0, DPU::sub_32bit)
-      .addReg(ShiftReg)
-      .addImm(DPUAsmCondition::Condition::Shift32)
-      .addMBB(bigShiftMBB)
-    // .addMetadata(N)
-    ;
+  // BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
+  //     // .addReg(LsbOp1Reg)
+  //   .addReg(Op1Reg, 0, DPU::sub_32bit)
+  //     .addReg(ShiftReg)
+  //     .addImm(DPUAsmCondition::Condition::Shift32)
+  //     .addMBB(bigShiftMBB)
+  //   // .addMetadata(N)
+  //   ;
 
   /// good, but
   // could increase quite a bit the code size
@@ -2490,20 +2525,20 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   //   on a few example, I can keep them adjacent
   //  but I may kill other optimization stuff in other code
   //   that use it genuinelly
-  // BuildMI(BB, dl, TII.get(DPU::LSLXrrr), LsbToMsbPartReg)
-  //   // .addReg(LsbOp1Reg)
-  //   .addReg(Op1Reg, 0, DPU::sub_32bit)
-  //   .addReg(ShiftReg)
-  //   .addMetadata(N);
-  // BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
-  //   .addReg(ShiftReg)
-  //   .addImm(0x20)
-  //   .addMetadata(N);
-  // BuildMI(BB, dl, TII.get(DPU::JEQrii))
-  //   .addReg(ShiftReg_check)
-  //   .addImm(0x20)
-  //   .addMBB(bigShiftMBB)
-  //   .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::LSLXrrr), LsbToMsbPartReg)
+    // .addReg(LsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit)
+    .addReg(ShiftReg)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
+    .addReg(ShiftReg)
+    .addImm(0x20)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JEQrii))
+    .addReg(ShiftReg_check)
+    .addImm(0x20)
+    .addMBB(bigShiftMBB)
+    .addMetadata(N);
   
   // BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg)
       // .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
@@ -2524,8 +2559,7 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), Undef2Reg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG),
-          SmallShiftResultPart0Reg)
+  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), SmallShiftResultPart0Reg)
       .addReg(Undef2Reg)
       .addReg(SmallShiftLsbReg)
       .addImm(DPU::sub_32bit);
@@ -2752,36 +2786,33 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
       RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
-  // LLVMContext &Context = F->getFunction().getContext();
-  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
 
   BuildMI(BB, dl, TII.get(DPU::COPY), MsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
 
-  BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
-      .addReg(MsbOp1Reg)
-      .addReg(ShiftReg)
-      .addImm(DPUAsmCondition::Condition::Shift32)
-      .addMBB(bigShiftMBB)
-    // .addMetadata(N)
-    ;
-
-  // LLVMContext &Context = F->getFunction().getContext();
-  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
-
-  // BuildMI(BB, dl, TII.get(DPU::LSRXrrr), MsbToLsbPartReg)
-  //   .addReg(MsbOp1Reg)
-  //   .addReg(ShiftReg)
-  //   .addMetadata(N);
-  // BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
-  //   .addReg(ShiftReg)
-  //   .addImm(0x20)
-  //   .addMetadata(N);
-  // BuildMI(BB, dl, TII.get(DPU::JEQrii))
-  //   .addReg(ShiftReg_check)
-  //   .addImm(0x20)
-  //   .addMBB(bigShiftMBB)
-  //   .addMetadata(N);
+  // BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
+  //     .addReg(MsbOp1Reg)
+  //     .addReg(ShiftReg)
+  //     .addImm(DPUAsmCondition::Condition::Shift32)
+  //     .addMBB(bigShiftMBB)
+  //   // .addMetadata(N)
+  //   ;
+
+  BuildMI(BB, dl, TII.get(DPU::LSRXrrr), MsbToLsbPartReg)
+    .addReg(MsbOp1Reg)
+    .addReg(ShiftReg)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
+    .addReg(ShiftReg)
+    .addImm(0x20)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JEQrii))
+    .addReg(ShiftReg_check)
+    .addImm(0x20)
+    .addMBB(bigShiftMBB)
+    .addMetadata(N);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), LsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
@@ -3235,24 +3266,26 @@ static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI,
   unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned LsbAddReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  // LLVMContext &Context = F->getFunction().getContext();
-  // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
 
-  BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
-      .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
-      .addImm(DPUAsmCondition::Condition::NotMaximum)
-      .addMBB(endMBB)
-    // .addMetadata(N)
-    ;
+  // BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
+  //     .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
+  //     .addImm(DPUAsmCondition::Condition::NotMaximum)
+  //     .addMBB(endMBB)
+  //   // .addMetadata(N)
+  //   ;
 
-  // BuildMI(BB, dl, TII.get(DPU::CLZ_Urr), FastResultReg)
-  //   .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
-  //   .addMetadata(N);
-  // BuildMI(BB, dl, TII.get(DPU::JNEQrii))
-  //   .addReg(FastResultReg, 0, DPU::sub_32bit)
-  //   .addImm(32)
-  //   .addMBB(endMBB)
-  //   .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::CLZ_Urr), FastResultReg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
+    .addMetadata(N)
+    ;
+  BuildMI(BB, dl, TII.get(DPU::JNEQrii))
+    .addReg(FastResultReg, 0, DPU::sub_32bit)
+    .addImm(32)
+    .addMBB(endMBB)
+    .addMetadata(N)
+    ;
 
   BuildMI(msbAreZerosMBB, dl, TII.get(DPU::CLZrr), LsbClzReg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
@@ -3404,6 +3437,428 @@ static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI,
 //   return fastMBB;
 // }
 
+static MachineBasicBlock *EmitAlu64BitRRWithCustomInserter(MachineInstr &MI,
+							   MachineBasicBlock *MBB,
+							   unsigned LsbOpcode,
+							   unsigned MsbOpcode) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+  // Expands a 64-bit register/register pseudo (operands: dst, lhs, rhs) in place, in front of MI:
+  // LsbOpcode is applied to the low 32-bit halves, MsbOpcode to the high halves; MI is erased and MBB returned.
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineFunction &MF = *MBB->getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  // Metadata node attached to both generated 32-bit instructions. NOTE(review): presumably a marker read by a later pass -- confirm.
+  LLVMContext &Context = MF.getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  // Operand 0 is the destination register, operands 1 and 2 are the two source registers
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned LHSReg = MI.getOperand(1).getReg();
+  unsigned RHSReg = MI.getOperand(2).getReg();
+
+  // GP_REG virtual registers for the low/high halves of each source and of the result
+  unsigned LHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned LHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned RHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned RHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
+  // Split the 64-bit operands into 32-bit halves (sub_32bit = low half, sub_32bit_hi = high half)
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Lo).addReg(LHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Hi).addReg(LHSReg, 0, DPU::sub_32bit_hi);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Lo).addReg(RHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Hi).addReg(RHSReg, 0, DPU::sub_32bit_hi);
+
+  // Low half: Dst_Lo = LsbOpcode(LHS_Lo, RHS_Lo); the opcode is chosen by the caller, not necessarily a subtraction
+  MachineInstrBuilder MIBLsb = BuildMI(*MBB, MI, DL, TII.get(LsbOpcode), Dst_Lo)
+    .addReg(LHS_Lo)
+    .addReg(RHS_Lo)
+    .addMetadata(N)
+    ;
+
+  // High half: Dst_Hi = MsbOpcode(LHS_Hi, RHS_Hi). NOTE(review): presumably a with-carry opcode here consumes the carry of the low-half op -- confirm.
+  MachineInstrBuilder MIBMsb = BuildMI(*MBB, MI, DL, TII.get(MsbOpcode), Dst_Hi)
+    .addReg(LHS_Hi)
+    .addReg(RHS_Hi)
+    .addMetadata(N)
+    ;
+
+  // Rebuild the 64-bit result: IMPLICIT_DEF, insert the low half, insert the high half, then COPY into DstReg
+  unsigned Dstp0 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned Dstp1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp0)
+      .addReg(UndefReg)
+      .addReg(Dst_Lo)
+      .addImm(DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp1)
+      .addReg(Dstp0)
+      .addReg(Dst_Hi)
+      .addImm(DPU::sub_32bit_hi);
+
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), DstReg).addReg(Dstp1);
+  // If the pseudo killed source operand i, mark operand i of both half instructions as killed too
+  for (unsigned i = 1; i < 3; i++) {
+    if (MI.getOperand(i).isKill()) {
+      MIBLsb->getOperand(i).setIsKill();
+      MIBMsb->getOperand(i).setIsKill();
+    }
+  }
+  
+  // Remove the pseudo instruction
+  MI.eraseFromParent();
+  
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  return MBB;
+}
+
+static MachineBasicBlock *EmitAlu64BitRIWithCustomInserter(MachineInstr &MI,
+							   MachineBasicBlock *MBB,
+							   unsigned LsbOpcode,
+							   unsigned MsbOpcode) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+  // Expands a 64-bit register/immediate pseudo (operands: dst, lhs, imm) into LsbOpcode on the low half and MsbOpcode on the high half; MI is erased and MBB returned.
+  const DebugLoc &DL = MI.getDebugLoc();
+  const TargetInstrInfo &TII = *MBB->getParent()->getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MachineFunction &MF = *MBB->getParent();
+  // Metadata node attached to both generated 32-bit instructions. NOTE(review): presumably a marker read by a later pass -- confirm.
+  LLVMContext &Context = MF.getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  // Operand 0 is the destination register, operand 1 the source register, operand 2 the immediate
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned LHSReg = MI.getOperand(1).getReg();
+  int64_t RHSImm = MI.getOperand(2).getImm();
+
+  // GP_REG virtual registers for the low/high halves of the source and of the result
+  unsigned LHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned LHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
+  // Split the 64-bit register operand into 32-bit halves (sub_32bit = low half, sub_32bit_hi = high half)
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Lo).addReg(LHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Hi).addReg(LHSReg, 0, DPU::sub_32bit_hi);
+  // Split the immediate into bits 31..0 and bits 63..32; both halves are masked to 32 bits
+  int64_t RHSImmLo = RHSImm & 0xFFFFFFFFl;
+  int64_t RHSImmHi = (RHSImm >> 32) & 0xFFFFFFFFl;
+
+  // Disabled debug-only code: would report immediate halves equal to
+  // 0, 1, 0xffffffff or 0x80000000 as values that could be optimized.
+  // switch (RHSImmLo) {
+  // case 0:
+  // case 1:
+  // case 0xffffffff:
+  // case 0x80000000:
+  //   LLVM_DEBUG({
+  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  // 	dbgs() << "RHSImmLo = " << RHSImmLo << " could be optimized\n";
+  //     });
+  // }
+
+  // switch (RHSImmHi) {
+  // case 0:
+  // case 1:
+  // case 0xffffffff:
+  // case 0x80000000:
+  //   LLVM_DEBUG({
+  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  // 	dbgs() << "RHSImmHi = " << RHSImmHi << " could be optimized\n";
+  //     });
+  // }
+  
+  // Low half: Dst_Lo = LsbOpcode(LHS_Lo, RHSImmLo); the opcode is chosen by the caller, not necessarily a subtraction
+  MachineInstrBuilder MIBLsb = BuildMI(*MBB, MI, DL, TII.get(LsbOpcode), Dst_Lo)
+    .addReg(LHS_Lo)
+    .addImm(RHSImmLo)
+    .addMetadata(N);
+
+  // High half: Dst_Hi = MsbOpcode(LHS_Hi, RHSImmHi). NOTE(review): presumably a with-carry opcode here consumes the carry of the low-half op -- confirm.
+  MachineInstrBuilder MIBMsb = BuildMI(*MBB, MI, DL, TII.get(MsbOpcode), Dst_Hi)
+    .addReg(LHS_Hi)
+    .addImm(RHSImmHi)
+    .addMetadata(N);
+
+  // Rebuild the 64-bit result: IMPLICIT_DEF, insert the low half, insert the high half, then COPY into DstReg
+  unsigned Dstp0 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned Dstp1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp0)
+      .addReg(UndefReg)
+      .addReg(Dst_Lo)
+      .addImm(DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp1)
+      .addReg(Dstp0)
+      .addReg(Dst_Hi)
+      .addImm(DPU::sub_32bit_hi);
+
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), DstReg).addReg(Dstp1);
+  // If the pseudo killed its register source, mark operand 1 of both half instructions as killed too
+  if (MI.getOperand(1).isKill()) {
+    MIBLsb->getOperand(1).setIsKill();
+    MIBMsb->getOperand(1).setIsKill();
+  }
+  
+  // Remove the pseudo instruction
+  MI.eraseFromParent();
+  
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  return MBB;
+}
+
+static MachineBasicBlock *EmitMove64RiWithCustomInserter(MachineInstr &MI,
+							 MachineBasicBlock *MBB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+  // Expands a 64-bit move-immediate pseudo (operands: dst, imm) into two MOVEri, one per 32-bit half, then recombines them; MI is erased and MBB returned.
+  const DebugLoc &DL = MI.getDebugLoc();
+  const TargetInstrInfo &TII = *MBB->getParent()->getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  // Operand 0 is the destination register, operand 1 the 64-bit immediate
+  unsigned DstReg = MI.getOperand(0).getReg();
+  int64_t RHSImm = MI.getOperand(1).getImm();
+
+  // GP_REG virtual registers that receive the low and high halves of the immediate
+  unsigned Dst_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  // Split the immediate into bits 31..0 and bits 63..32; both halves are masked to 32 bits
+  int64_t RHSImmLo = RHSImm & 0xFFFFFFFFl;
+  int64_t RHSImmHi = (RHSImm >> 32) & 0xFFFFFFFFl;
+
+  // Debug-only: report halves equal to 0, 1, 0xffffffff or 0x80000000 as optimizable.
+  // These two switches emit no instructions; the generated code is the same for every value.
+  switch (RHSImmLo) {
+  case 0:
+  case 1:
+  case 0xffffffff:
+  case 0x80000000:
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "RHSImmLo = " << RHSImmLo << " could be optimized\n";
+      });
+  }
+
+  switch (RHSImmHi) {
+  case 0:
+  case 1:
+  case 0xffffffff:
+  case 0x80000000:
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "RHSImmHi = " << RHSImmHi << " could be optimized\n";
+      });
+  }
+
+  // Low half: Dst_Lo = MOVEri RHSImmLo. The disabled cases would COPY from ZERO/ONE/LNEG/MNEG instead. NOTE(review): MIBLsb is never read afterwards.
+  MachineInstrBuilder MIBLsb;
+  // switch (RHSImmLo) {
+  // default: {
+    MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::MOVEri), Dst_Lo).addImm(RHSImmLo);
+    // break;
+  // }
+  // case 0: {
+  //   MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::ZERO);
+  //   break;
+  // }
+  // case 1: {
+  //   MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::ONE);
+  //   break;
+  // }
+  // case 0xffffffff: {
+  //   MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::LNEG);
+  //   break;
+  // }
+  // case 0x80000000: {
+  //   MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::MNEG);
+  //   break;
+  // }
+  // }
+
+  // High half: Dst_Hi = MOVEri RHSImmHi, with the same disabled special cases. NOTE(review): MIBMsb is never read afterwards.
+  MachineInstrBuilder MIBMsb;
+  // switch (RHSImmHi) {
+  // default: {
+  MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::MOVEri), Dst_Hi).addImm(RHSImmHi);
+   //  break;
+  // }
+  // case 0: {
+  //   MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::ZERO);
+  //   break;
+  // }
+  // case 1: {
+  //   MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::ONE);
+  //   break;
+  // }
+  // case 0xffffffff: {
+  //   MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::LNEG);
+  //   break;
+  // }
+  // case 0x80000000: {
+  //   MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::MNEG);
+  //   break;
+  // }
+  // }
+
+  // Rebuild the 64-bit result: IMPLICIT_DEF, insert the low half, insert the high half, then COPY into DstReg
+  unsigned Dstp0 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned Dstp1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp0)
+      .addReg(UndefReg)
+      .addReg(Dst_Lo)
+      .addImm(DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp1)
+      .addReg(Dstp0)
+      .addReg(Dst_Hi)
+      .addImm(DPU::sub_32bit_hi);
+
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), DstReg).addReg(Dstp1);
+
+  // Remove the pseudo instruction
+  MI.eraseFromParent();
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  return MBB;
+}
+
+static DPUAsmCondition::Condition
+findSelect64SetConditionFor(DPUAsmCondition::Condition cond) { // maps a comparison condition to the one EmitSetCC64WithCustomInserter passes to SUBCrrrc
+  switch (cond) {
+  default: // any condition not listed below is treated as a programming error
+    llvm_unreachable("invalid condition");
+  case DPUAsmCondition::Condition::Zero:
+  case DPUAsmCondition::Condition::Equal:
+    return DPUAsmCondition::Condition::ExtendedZero; // NOTE(review): Extended* presumably also accounts for the low-half result -- confirm in the DPU ISA
+  case DPUAsmCondition::Condition::NotZero:
+  case DPUAsmCondition::Condition::NotEqual:
+    return DPUAsmCondition::Condition::ExtendedNotZero;
+  case DPUAsmCondition::Condition::GreaterThanSigned:
+    return DPUAsmCondition::Condition::ExtendedGreaterThanSigned;
+  case DPUAsmCondition::Condition::GreaterOrEqualSigned:
+    return DPUAsmCondition::Condition::GreaterOrEqualSigned; // returned unchanged: no Extended variant is used for >= (signed)
+  case DPUAsmCondition::Condition::LessThanSigned:
+    return DPUAsmCondition::Condition::LessThanSigned; // returned unchanged: no Extended variant is used for < (signed)
+  case DPUAsmCondition::Condition::LessOrEqualSigned:
+    return DPUAsmCondition::Condition::ExtendedLessOrEqualSigned;
+  case DPUAsmCondition::Condition::GreaterThanUnsigned:
+    return DPUAsmCondition::Condition::ExtendedGreaterThanUnsigned;
+  case DPUAsmCondition::Condition::GreaterOrEqualUnsigned:
+    return DPUAsmCondition::Condition::GreaterOrEqualUnsigned; // returned unchanged: no Extended variant is used for >= (unsigned)
+  case DPUAsmCondition::Condition::LessThanUnsigned:
+    return DPUAsmCondition::Condition::LessThanUnsigned; // returned unchanged: no Extended variant is used for < (unsigned)
+  case DPUAsmCondition::Condition::LessOrEqualUnsigned:
+    return DPUAsmCondition::Condition::ExtendedLessOrEqualUnsigned;
+  }
+}
+
+static MachineBasicBlock *EmitSetCC64WithCustomInserter(MachineInstr &MI,
+							MachineBasicBlock *MBB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+  // Expands a 64-bit set-on-condition pseudo (operands: dst, cond, lhs, rhs) into SUBzrr on the low halves followed by SUBCrrrc on the high halves; MI is erased and MBB returned.
+  const DebugLoc &DL = MI.getDebugLoc();
+  const TargetInstrInfo &TII = *MBB->getParent()->getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MachineFunction &MF = *MBB->getParent();
+  // Metadata node attached to both generated instructions. NOTE(review): presumably a marker read by a later pass -- confirm.
+  LLVMContext &Context = MF.getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  // Operand 0 is the destination, operand 1 the condition (as an immediate), operands 2 and 3 the sources
+  unsigned DstReg = MI.getOperand(0).getReg();
+  auto ImmCond = static_cast<DPUAsmCondition::Condition>(MI.getOperand(1).getImm());
+  unsigned LHSReg = MI.getOperand(2).getReg();
+  unsigned RHSReg = MI.getOperand(3).getReg();
+  // GP_REG virtual registers for the low/high halves of both sources
+  unsigned LHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned LHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned RHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned RHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  // unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Lo).addReg(LHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Hi).addReg(LHSReg, 0, DPU::sub_32bit_hi);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Lo).addReg(RHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Hi).addReg(RHSReg, 0, DPU::sub_32bit_hi);
+  // Translate the pseudo's condition into the condition code given to SUBCrrrc below
+  DPUAsmCondition::Condition SetCondition =
+    findSelect64SetConditionFor(ImmCond);
+  // SUBzrr takes DPU::ZERO as its first operand. NOTE(review): presumably the low-half difference is discarded and only the carry feeds SUBCrrrc -- confirm. DstReg is defined directly by SUBCrrrc.
+  MachineInstrBuilder MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::SUBzrr))
+    .addReg(DPU::ZERO)
+    .addReg(LHS_Lo)
+    .addReg(RHS_Lo)
+    .addMetadata(N);
+  MachineInstrBuilder MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::SUBCrrrc), DstReg)
+    .addReg(LHS_Hi)
+    .addReg(RHS_Hi)
+    .addImm(SetCondition)
+    .addMetadata(N);
+  // Source operand i of the pseudo (2 or 3) corresponds to operand i - 1 of both new instructions (their operand 0 is ZERO / DstReg)
+  for (unsigned i = 2; i < 4; i++) {
+    if (MI.getOperand(i).isKill()) {
+      MIBLsb->getOperand(i - 1).setIsKill();
+      MIBMsb->getOperand(i - 1).setIsKill();
+    }
+  }
+
+  // Remove the pseudo instruction
+  MI.eraseFromParent();
+  
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  return MBB;
+}
+
 MachineBasicBlock *
 DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -3439,7 +3894,9 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case DPU::SELECTrr:
     return EmitSelectWithCustomInserter(MI, BB);
   case DPU::SELECT64rr:
-    return EmitSelect64WithCustomInserter(MI, BB);
+    // return EmitSelect64WithCustomInserter(MI, BB);
+    return EmitSelectWithCustomInserter(MI, BB);
+    
   case DPU::MRAM_STORE_BYTErm:
     return EmitMramSubStoreWithCustomInserter(MI, BB, 7, DPU::SBrir);
   case DPU::MRAM_STORE_HALFrm:
@@ -3479,6 +3936,7 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return EmitMramSubLoadWithCustomInserter(MI, BB, 4, DPU::LW_Srri);
   case DPU::MRAM_LOAD_DOUBLEmr:
     return EmitMramLoadDoubleWithCustomInserter(MI, BB);
+    
   case DPU::LSL64rr:
     return EmitLsl64RegisterWithCustomInserter(MI, BB);
   case DPU::LSL64ri:
@@ -3509,5 +3967,42 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                 DPU::LSR_ADDrrri);
   case DPU::CLZ64r:
     return EmitClz64WithCustomInserter(MI, BB);
+
+  // RR
+  // case DPU::ADD64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::ADDrrr, DPU::ADDCrrr);
+
+  // case DPU::AND64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::ANDrrr, DPU::ANDrrr);
+
+  // case DPU::OR64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::ORrrr, DPU::ORrrr);
+
+  // case DPU::SUB64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::SUBrrr, DPU::SUBCrrr);
+
+  // case DPU::XOR64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::XORrrr, DPU::XORrrr);
+
+  // // RI
+  // case DPU::ADD64ri:
+  //   return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::ADDrri, DPU::ADDCrri);
+
+  // case DPU::AND64ri:
+  //   return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::ANDrri, DPU::ANDrri);
+
+  // case DPU::OR64ri:
+  //   return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::ORrri, DPU::ORrri);
+
+  // case DPU::XOR64ri:
+  //   return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::XORrri, DPU::XORrri);
+
+    
+  case DPU::MOVE64ri:
+    return EmitMove64RiWithCustomInserter(MI, BB);
+
+  case DPU::SET64cc:
+    return EmitSetCC64WithCustomInserter(MI, BB);
+
   }
 }
diff --git a/llvm/lib/Target/DPU/DPUTargetMachine.cpp b/llvm/lib/Target/DPU/DPUTargetMachine.cpp
index a292eb41821d0..734bc0d541ff3 100644
--- a/llvm/lib/Target/DPU/DPUTargetMachine.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetMachine.cpp
@@ -54,6 +54,7 @@ DPUTargetMachine::DPUTargetMachine(const Target &T, const Triple &TT,
                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
+  // setRequiresStructuredCFG(true);
   initAsmInfo();
 }
 
@@ -84,7 +85,7 @@ class DPUPassConfig : public TargetPassConfig {
 
   bool addInstSelector() override;
 
-  // void addPostRegAlloc() override;
+  void addPostRegAlloc() override;
   void addPreEmitPass() override;
   void addPreEmitPass2() override;
 };
@@ -104,10 +105,10 @@ bool DPUPassConfig::addInstSelector() {
   return false;
 }
 
-// void DPUPassConfig::addPostRegAlloc() {
-//   DPUTargetMachine &TM = getDPUTargetMachine();
-//   addPass(createDPUPostRAFusionPass(TM));
-// }
+void DPUPassConfig::addPostRegAlloc() { // overrides the base-class hook (declared `override` in DPUPassConfig)
+  DPUTargetMachine &TM = getDPUTargetMachine();
+  addPass(createDPUPostRAFusionPass(TM)); // adds the DPU post-RA fusion pass, created for this target machine
+}
 
 void DPUPassConfig::addPreEmitPass() {
   DPUTargetMachine &TM = getDPUTargetMachine();
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 8bd3036f1fc34..336a990a046ca 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -367,13 +367,19 @@ struct ScopedSaveAliaseesAndUsed {
   }
 
   ~ScopedSaveAliaseesAndUsed() {
+    // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
     appendToUsed(M, std::vector<GlobalValue *>(Used.begin(), Used.end()));
+    // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
     appendToCompilerUsed(M, std::vector<GlobalValue *>(CompilerUsed.begin(),
                                                        CompilerUsed.end()));
+    // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
 
-    for (auto P : FunctionAliases)
+    for (auto P : FunctionAliases) {
+      // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
       P.first->setIndirectSymbol(
           ConstantExpr::getBitCast(P.second, P.first->getType()));
+      // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
+    }
   }
 };
 
diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index ef9f18a2289e9..26ced977d52fc 100644
--- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -75,38 +75,52 @@ static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *>
   GlobalVariable *GV = M.getGlobalVariable(Name);
   SmallPtrSet<Constant *, 16> InitAsSet;
   SmallVector<Constant *, 16> Init;
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
   if (GV) {
+    // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
     auto *CA = cast<ConstantArray>(GV->getInitializer());
     for (auto &Op : CA->operands()) {
+      // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
       Constant *C = cast_or_null<Constant>(Op);
       if (InitAsSet.insert(C).second)
         Init.push_back(C);
     }
     GV->eraseFromParent();
   }
-
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
   Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
   for (auto *V : Values) {
-    Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
+    // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
+    // V->dump();
+    // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
+    // Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
+    Constant *C = ConstantExpr::getPointerBitCastOrAddrSpaceCast(V, Int8PtrTy);
     if (InitAsSet.insert(C).second)
       Init.push_back(C);
   }
-
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
   if (Init.empty())
     return;
-
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
   ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
   GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                 ConstantArray::get(ATy, Init), Name);
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
   GV->setSection("llvm.metadata");
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
 }
 
 void llvm::appendToUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
   appendToUsedList(M, "llvm.used", Values);
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
 }
 
 void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
   appendToUsedList(M, "llvm.compiler.used", Values);
+  // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; });
 }
 
 FunctionCallee