From e9dedc0880f430f81c06c9a59f4a42d2715a8f0e Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Mon, 29 Jul 2024 16:34:21 +0200
Subject: [PATCH 1/8] dpu: llvm: fix some BuildMI construction

In these uses, the last parameter of BuildMI is the DestReg.
It basically declares this register as a definition internally.

The SD variant represents a store instruction; it doesn't define or allocate a register.
The CALL variant defines the first register, usually R23 as the link register in our ABI.
The SUB variant defines the first register as well.

This patch fixes those errors:

```
*** Bad machine code: Explicit operand marked as def ***
- function:    __divdf3
- basic block: %bb.0 entry (0x55d88649e508)
- instruction: $r22 = SDrir 88, $d22
- operand 0:   $r22

*** Bad machine code: Explicit definition marked as use ***
- function:    __divdf3
- basic block: %bb.35 if.else141 (0x560777d636f8)
- instruction: CALLri $r23, &__muldi3, debug-location !307; work/dpu_tools_llvm_cleanup_20240710_2/dpu-rt/src/syslib/divdf3.c:166:52
- operand 0:   $r23

*** Bad machine code: Explicit definition marked as use ***
- function:    process_inputs_all_tasklets
- basic block: %bb.5  (0x563bd908a390)
- instruction: CALLrr $r23, killed $r3, debug-location !219; work/dpu_tools_llvm_cleanup_20240710_2/dpu-rt/src/syslib/wramfifo.c:108:21
- operand 0:   $r23

*** Bad machine code: Explicit definition marked as use ***
- function:    printf
- basic block: %bb.0  (0x5595f29ecd90)
- instruction: SUBrrif $r0, $r22, 56, 9, debug-location !103; work/dpu_tools_llvm_cleanup_20240710_2/dpu-rt/src/stdlib/stdio.c:110:5
- operand 0:   $r0
```
---
 llvm/lib/Target/DPU/DPUFrameLowering.cpp | 3 ++-
 llvm/lib/Target/DPU/DPUInstrInfo.cpp     | 9 +++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUFrameLowering.cpp b/llvm/lib/Target/DPU/DPUFrameLowering.cpp
index 8bf3c6c06650b..026354d10e304 100644
--- a/llvm/lib/Target/DPU/DPUFrameLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUFrameLowering.cpp
@@ -85,7 +85,8 @@ void DPUFrameLowering::emitPrologue(MachineFunction &MF,
         .addCFIIndex(CFIIndex)
         .setMIFlag(MachineInstr::FrameSetup);
 
-    BuildMI(MBB, MBBI, DL, DPUII.get(DPU::SDrir), DPU::R22)
+    BuildMI(MBB, MBBI, DL, DPUII.get(DPU::SDrir))
+        .addReg(DPU::R22)
         .addImm(StackSize - STACK_SIZE_FOR_D22)
         .addReg(DPU::D22);
     BuildMI(MBB, MBBI, DL, DPUII.get(DPU::ADDrri), DPU::R22)
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index db957f97bcaa9..59d7ee95171a0 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -106,13 +106,11 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::JUMPr)).addReg(DPU::R23);
     break;
   case DPU::CALLi:
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri))
-        .addReg(DPU::R23)
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri), DPU::R23)
         .add(MI.getOperand(0));
     break;
   case DPU::CALLr:
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr))
-        .addReg(DPU::R23)
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr), DPU::R23)
         .add(MI.getOperand(0));
     break;
   case DPU::ADD_VAStart: { // Get the first index in stack where the first
@@ -122,8 +120,7 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       StackSize = MF->getFrameInfo().getStackSize();
     }
     unsigned int ResultReg = MI.getOperand(0).getReg();
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif))
-        .addReg(ResultReg)
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif), ResultReg)
         .addReg(DPU::R22)
         .addImm(StackSize + STACK_SIZE_FOR_D22)
         .addImm(DPUAsmCondition::Condition::False);

From f3c79f5764ee6b3c047c17efe3aadb7ba5b43a1b Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Mon, 29 Jul 2024 17:16:37 +0200
Subject: [PATCH 2/8] dpu: llvm: fix analyzeBranch/removeBranch/insertBranch

These functions are heavily used all along the LLVM backend pipeline to inspect
and optimize the CFG.

The UPMEM DPU ISA provides arithmetic+comp+branch in one instruction.
We introduce them as early as DPUTargetLowering::EmitInstrWithCustomInserter,
and when the optimizer rearranged the CFG, it was losing correctness.

This patch fixes issues like:
```
*** Bad machine code: Explicit definition marked as use ***
- function:    test
- basic block: %bb.0 entry (0x56544fed73d8)
- instruction: CLZ_Urrci $d0, $r2, 33, %bb.2, debug-location !31; work/simple_examples/minimal.c:24:12
- operand 0:   $d0

*** Bad machine code: Using an undefined physical register ***
- function:    test
- basic block: %bb.0 entry (0x56544fed73d8)
- instruction: CLZ_Urrci $d0, $r2, 33, %bb.2, debug-location !31; work/simple_examples/minimal.c:24:12
- operand 0:   $d0
```

or:
```
*** Bad machine code: Explicit definition marked as use ***
- function:    __muldf3
- basic block: %bb.35 cleanup98.i (0x55f824732cb8)
- instruction: LSRXrrrci $r6, $r2, $r4, 40, %bb.25, debug-location !310; work/dpu_tools_llvm_cleanup_20240710_2/dpu-rt/src/syslib/fp_lib.h:268:48 @[ work/dpu_tools_llvm_cleanup_20240710_2/dpu-rt/src/syslib/fp_mul_impl.inc:99:9 @[ work/dpu_tools_llvm_cleanup_20240710_2/dpu-rt/src/syslib/muldf3.c:21:12 ] ]
- operand 0:   $r6

*** Bad machine code: Using an undefined physical register ***
- function:    __muldf3
- basic block: %bb.35 cleanup98.i (0x55f824732cb8)
- instruction: LSRXrrrci $r6, $r2, $r4, 40, %bb.25, debug-location !310; work/dpu_tools_llvm_cleanup_20240710_2/dpu-rt/src/syslib/fp_lib.h:268:48 @[ work/dpu_tools_llvm_cleanup_20240710_2/dpu-rt/src/syslib/fp_mul_impl.inc:99:9 @[ work/dpu_tools_llvm_cleanup_20240710_2/dpu-rt/src/syslib/muldf3.c:21:12 ] ]
- operand 0:   $r6
```
---
 llvm/lib/Target/DPU/DPUInstrInfo.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index 59d7ee95171a0..d4772e56328df 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -448,10 +448,8 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
   MIB = BuildMI(&MBB, DL, get(Opc));
 
   for (unsigned i = 1; i < Cond.size(); ++i) {
-    if (Cond[i].isReg())
-      MIB.addReg(Cond[i].getReg());
-    else if (Cond[i].isImm())
-      MIB.addImm(Cond[i].getImm());
+    if (Cond[i].isReg() || Cond[i].isImm())
+      MIB->addOperand(Cond[i]);
     else
       assert(false && "Cannot copy operand");
   }

From f3b490eb8a15647d44dea79f65d9eb71d992f60b Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Mon, 29 Jul 2024 21:24:40 +0200
Subject: [PATCH 3/8] dpu: llvm: fix CALL variant implicit def

CALL variant instructions implicitly define r0, or d0 (r0-r1) for a 64-bit
value; don't lose this information.
---
 llvm/lib/Target/DPU/DPUInstrInfo.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index d4772e56328df..dee077276fe58 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -107,11 +107,13 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     break;
   case DPU::CALLi:
     BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri), DPU::R23)
-        .add(MI.getOperand(0));
+        .add(MI.getOperand(0))
+        .copyImplicitOps(MI);
     break;
   case DPU::CALLr:
     BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr), DPU::R23)
-        .add(MI.getOperand(0));
+        .add(MI.getOperand(0))
+        .copyImplicitOps(MI);
     break;
   case DPU::ADD_VAStart: { // Get the first index in stack where the first
                            // vaargs is stored

From 0a6b4ef78c78dd596dde58b70bec2480e7ff02f1 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Tue, 30 Jul 2024 15:17:24 +0200
Subject: [PATCH 4/8] dpu: llvm: CMakeLists.txt sort alphabetically

---
 llvm/lib/Target/DPU/CMakeLists.txt | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/DPU/CMakeLists.txt b/llvm/lib/Target/DPU/CMakeLists.txt
index 7a887b71ee3aa..203ebb9a4837c 100644
--- a/llvm/lib/Target/DPU/CMakeLists.txt
+++ b/llvm/lib/Target/DPU/CMakeLists.txt
@@ -2,32 +2,32 @@ add_llvm_component_group(DPU)
 
 set(LLVM_TARGET_DEFINITIONS DPU.td)
 
-tablegen(LLVM DPUGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM DPUGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM DPUGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM DPUGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM DPUGenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM DPUGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM DPUGenCallingConv.inc -gen-callingconv)
+tablegen(LLVM DPUGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM DPUGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM DPUGenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM DPUGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM DPUGenSubtargetInfo.inc -gen-subtarget)
 
 add_public_tablegen_target(DPUCommonTableGen)
 
 add_llvm_target(DPUCodeGen
-        DPUTargetMachine.cpp
-        DPURegisterInfo.cpp
+        DPUAsmPrinter.cpp
         DPUFrameLowering.cpp
-        DPUTargetLowering.cpp
-        DPUMachineFunctionInfo.cpp
         DPUInstrInfo.cpp
-        DPUSubtarget.cpp
         DPUISelDAGToDAG.cpp
-        DPUAsmPrinter.cpp
-        DPUMCInstLower.cpp
+        DPUMachineFunctionInfo.cpp
+        DPUMacroFusion.cpp
         DPUMergeComboInstrPass.cpp
+        DPUMCInstLower.cpp
+        DPURegisterInfo.cpp
         DPUResolveMacroInstrPass.cpp
-        DPUMacroFusion.cpp
         DPUSelectionDAGInfo.cpp
+        DPUSubtarget.cpp
+        DPUTargetLowering.cpp
+        DPUTargetMachine.cpp
 
         DEPENDS
         intrinsics_gen

From f8ffa915f76d15cee0b17c3e7d583ee9e232b7ff Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Tue, 30 Jul 2024 08:04:56 +0200
Subject: [PATCH 5/8] dpu: llvm: fix ResolveMacroInstrPass

Properly add the registers used in the new MBBs when lowering the Jcc family of
pseudo instructions. Fix the machine CFG topology.

This is a WIP, it will be rearranged/rebased later.
---
 .../Target/DPU/DPUResolveMacroInstrPass.cpp   | 85 ++++++++++++++++++-
 1 file changed, 83 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
index bbfb4fec0d67e..9b417cd8f12e2 100644
--- a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
@@ -149,6 +149,11 @@ static void resolve64BitRegisterAluInstruction(
     MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter,
     const DPUInstrInfo &InstrInfo, unsigned int LsbOpcode,
     unsigned int MsbOpcode) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+    });
   MachineFunction *MF = MBB->getParent();
   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
 
@@ -173,6 +178,11 @@ static void resolve64BitRegisterAluInstruction(
           MSBDestReg)
       .addReg(MSBDOp1Reg)
       .addReg(MSBOp2Reg);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
 }
 
 static void resolveJeq64(MachineBasicBlock *MBB,
@@ -181,6 +191,25 @@ static void resolveJeq64(MachineBasicBlock *MBB,
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   MachineFunction::iterator I = ++MBB->getIterator();
   MachineFunction *F = MBB->getParent();
+
+  bool need_splice = std::next(MBBIter) != MBB->end();
+
+  MachineBasicBlock *FTMBB = MBB->getFallThrough();
+  MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB();
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** canFallThrough: " << MBB->canFallThrough() << "\n";
+      if (MBB->canFallThrough()) {
+        dbgs() << "** FTMBB: "; FTMBB->dump();
+      }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
+
   MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
   F->insert(I, trueMBB);
@@ -190,12 +219,13 @@ static void resolveJeq64(MachineBasicBlock *MBB,
   endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
   endMBB->transferSuccessorsAndUpdatePHIs(MBB);
   // Next, add the true and fallthrough blocks as its successors.
-  auto JumpMBB = MBBIter->getOperand(3).getMBB();
   MBB->addSuccessor(trueMBB);
   MBB->addSuccessor(endMBB);
   trueMBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(endMBB);
 
+  endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
+
   unsigned int Op1Reg = MBBIter->getOperand(1).getReg();
   unsigned int Op2Reg = MBBIter->getOperand(2).getReg();
 
@@ -215,6 +245,19 @@ static void resolveJeq64(MachineBasicBlock *MBB,
       .addReg(MsbOp1Reg)
       .addReg(MsbOp2Reg)
       .addMBB(JumpMBB);
+  trueMBB->addLiveIn(MsbOp1Reg);
+  trueMBB->addLiveIn(MsbOp2Reg);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** trueMBB: "; trueMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      if (FTMBB) { dbgs() << "** FTMBB: "; FTMBB->dump(); } // may be null
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
 }
 
 static void resolveJneq64(MachineBasicBlock *MBB,
@@ -223,6 +266,25 @@ static void resolveJneq64(MachineBasicBlock *MBB,
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   MachineFunction::iterator I = ++MBB->getIterator();
   MachineFunction *F = MBB->getParent();
+
+  bool need_splice = std::next(MBBIter) != MBB->end();
+  bool canFallThrough = MBB->canFallThrough();
+  MachineBasicBlock *FTMBB = MBB->getFallThrough();
+  MachineBasicBlock * JumpMBB = MBBIter->getOperand(3).getMBB();
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** canFallThrough: " << canFallThrough << "\n";
+      if (canFallThrough) {
+        dbgs() << "** FTMBB: "; FTMBB->dump();
+      }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
+
   MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
   F->insert(I, trueMBB);
@@ -232,12 +294,13 @@ static void resolveJneq64(MachineBasicBlock *MBB,
   endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
   endMBB->transferSuccessorsAndUpdatePHIs(MBB);
   // Next, add the true and fallthrough blocks as its successors.
-  auto JumpMBB = MBBIter->getOperand(3).getMBB();
   MBB->addSuccessor(trueMBB);
   MBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(endMBB);
 
+  endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
+
   unsigned int Op1Reg = MBBIter->getOperand(1).getReg();
   unsigned int Op2Reg = MBBIter->getOperand(2).getReg();
 
@@ -257,6 +320,21 @@ static void resolveJneq64(MachineBasicBlock *MBB,
       .addReg(MsbOp1Reg)
       .addReg(MsbOp2Reg)
       .addMBB(JumpMBB);
+  trueMBB->addLiveIn(MsbOp1Reg);
+  trueMBB->addLiveIn(MsbOp2Reg);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** trueMBB: "; trueMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      if (canFallThrough) {
+        dbgs() << "** FTMBB: "; FTMBB->dump();
+      }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
 }
 
 static void resolveJcc64AsSub64(MachineBasicBlock *MBB,
@@ -499,5 +577,8 @@ bool DPUResolveMacroInstrPass::runOnMachineFunction(MachineFunction &MF) {
     changeMade |= resolveMacroInstructionsInMBB(MBB, InstrInfo);
   }
 
+  LLVM_DEBUG(dbgs() << "********** DPU/ResolveMacroInstrPass: " << MF.getName()
+                    << " done: changeMade = " << changeMade << " **********\n\n");
+
   return changeMade;
 }

From 3cdd1568a318f8e8e7e0ea80175f5580a2588ab0 Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Tue, 30 Jul 2024 08:10:37 +0200
Subject: [PATCH 6/8] dpu: llvm: fixing EmitInstrWithCustomInserter
 arithmetic+comp+branch familly

This is a partial WIP fix.
I'm saving it now to keep it.

The idea is to emit simple but correct code first,
and work out how to fuse them back together later in the pipeline.

When known post-RA fusable sets of instructions are present, we try
to keep them together during pre-RA:
- DPUInstrInfo::shouldSink returns false
- DPUMacroFusion::shouldScheduleAdjacent returns true

TODO:
- the other instructions
- a new CFGOptimizer to clean up what split-critical-edge did, when it makes sense
-- some critical edges are broken down into a new MBB with a simple JUMPi
-- and are never optimized/cleaned up
-- this would cause a bigger code footprint and a performance regression
-- note that this situation is already present without this big fix ...
---
 llvm/lib/Target/DPU/CMakeLists.txt            |   2 +
 llvm/lib/Target/DPU/DPU.h                     |   1 +
 llvm/lib/Target/DPU/DPUHelper.cpp             |  45 +
 llvm/lib/Target/DPU/DPUHelper.h               |  20 +
 llvm/lib/Target/DPU/DPUInstrInfo.cpp          |  33 +-
 llvm/lib/Target/DPU/DPUInstrInfo.h            |   2 +
 llvm/lib/Target/DPU/DPUMCInstLower.cpp        |   1 +
 llvm/lib/Target/DPU/DPUMacroFusion.cpp        |  37 +-
 .../lib/Target/DPU/DPUMergeComboInstrPass.cpp |  22 +-
 llvm/lib/Target/DPU/DPUPostRAFusion.cpp       | 252 ++++++
 llvm/lib/Target/DPU/DPUTargetLowering.cpp     | 772 ++++++++++--------
 llvm/lib/Target/DPU/DPUTargetMachine.cpp      |  14 +-
 12 files changed, 835 insertions(+), 366 deletions(-)
 create mode 100644 llvm/lib/Target/DPU/DPUHelper.cpp
 create mode 100644 llvm/lib/Target/DPU/DPUHelper.h
 create mode 100644 llvm/lib/Target/DPU/DPUPostRAFusion.cpp

diff --git a/llvm/lib/Target/DPU/CMakeLists.txt b/llvm/lib/Target/DPU/CMakeLists.txt
index 203ebb9a4837c..b5a26c5647661 100644
--- a/llvm/lib/Target/DPU/CMakeLists.txt
+++ b/llvm/lib/Target/DPU/CMakeLists.txt
@@ -16,12 +16,14 @@ add_public_tablegen_target(DPUCommonTableGen)
 add_llvm_target(DPUCodeGen
         DPUAsmPrinter.cpp
         DPUFrameLowering.cpp
+        DPUHelper.cpp
         DPUInstrInfo.cpp
         DPUISelDAGToDAG.cpp
         DPUMachineFunctionInfo.cpp
         DPUMacroFusion.cpp
         DPUMergeComboInstrPass.cpp
         DPUMCInstLower.cpp
+        DPUPostRAFusion.cpp
         DPURegisterInfo.cpp
         DPUResolveMacroInstrPass.cpp
         DPUSelectionDAGInfo.cpp
diff --git a/llvm/lib/Target/DPU/DPU.h b/llvm/lib/Target/DPU/DPU.h
index 2ef567d9bc868..a308d3a40f8bc 100644
--- a/llvm/lib/Target/DPU/DPU.h
+++ b/llvm/lib/Target/DPU/DPU.h
@@ -20,6 +20,7 @@ class FunctionPass;
 class DPUTargetMachine;
 
 FunctionPass *createDPUMergeComboInstrPass(DPUTargetMachine &tm);
+FunctionPass *createDPUPostRAFusionPass(DPUTargetMachine &tm);
 FunctionPass *createDPUResolveMacroInstrPass(DPUTargetMachine &tm);
 
 } // namespace llvm
diff --git a/llvm/lib/Target/DPU/DPUHelper.cpp b/llvm/lib/Target/DPU/DPUHelper.cpp
new file mode 100644
index 0000000000000..957d43ffc3be1
--- /dev/null
+++ b/llvm/lib/Target/DPU/DPUHelper.cpp
@@ -0,0 +1,45 @@
+#include "DPUHelper.h"
+
+#include <llvm/CodeGen/MachineOperand.h>
+#include <llvm/IR/Function.h>
+
+namespace llvm {
+
+#define POSTRA_FUSION_METADATA_STRING "MySpecialMetadata"
+
+/// Build the MDNode wrapping the post-RA fusion tag string in MF's context.
+MDNode *getPostRAFusionMetadata(const MachineFunction *MF) {
+  LLVMContext &Ctx = MF->getFunction().getContext();
+  return MDNode::get(Ctx, MDString::get(Ctx, POSTRA_FUSION_METADATA_STRING));
+}
+
+/// Return true if \p MI has a metadata operand whose first operand is the
+/// post-RA fusion tag string.
+bool hasPostRAFusionMetadata(const MachineInstr *MI) {
+  for (const MachineOperand &Op : MI->operands()) {
+    // Operand-less nodes cannot hold the tag; skip before reading operand 0.
+    if (!Op.isMetadata() || Op.getMetadata()->getNumOperands() == 0)
+      continue;
+    const Metadata *First = Op.getMetadata()->getOperand(0).get();
+    const auto *Tag = dyn_cast_or_null<MDString>(First);
+    if (Tag && Tag->getString() == POSTRA_FUSION_METADATA_STRING)
+      return true;
+  }
+  return false;
+}
+
+/// Skip debug-value instructions starting at \p I (which is advanced in place)
+/// and return the first other instruction, or nullptr if \p REnd is reached.
+MachineInstr *
+getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I,
+                         MachineBasicBlock::reverse_iterator REnd) {
+  // isDebugValue() already covers the DBG_VALUE opcode.
+  while (I != REnd && I->isDebugValue())
+    ++I;
+  if (I == REnd)
+    return nullptr;
+
+  return &*I;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Target/DPU/DPUHelper.h b/llvm/lib/Target/DPU/DPUHelper.h
new file mode 100644
index 0000000000000..9b3436bcad68c
--- /dev/null
+++ b/llvm/lib/Target/DPU/DPUHelper.h
@@ -0,0 +1,20 @@
+#ifndef LLVM_LIB_TARGET_DPU_DPUHELPER_H
+#define LLVM_LIB_TARGET_DPU_DPUHELPER_H
+
+#include <llvm/CodeGen/MachineBasicBlock.h>
+#include <llvm/CodeGen/MachineFunction.h>
+#include <llvm/CodeGen/MachineInstr.h>
+#include <llvm/IR/Metadata.h>
+
+namespace llvm {
+
+MDNode *getPostRAFusionMetadata(const MachineFunction *MF);
+bool hasPostRAFusionMetadata(const MachineInstr *MI);
+
+MachineInstr *
+getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I,
+                         MachineBasicBlock::reverse_iterator REnd);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_DPU_DPUHELPER_H
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index dee077276fe58..f55e72108145c 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "DPUHelper.h"
 #include "DPUInstrInfo.h"
 #include "DPUTargetMachine.h"
 
@@ -300,10 +301,20 @@ static void fetchConditionalBranchInfo(MachineInstr *Inst,
       Cond.push_back(operand);
     }
   }
+
+  for (const MachineOperand &Op : Inst->operands()) {
+    if (Op.isMetadata()) {
+      Cond.push_back(Op);
+    }
+  }
 }
 
 static inline bool isAnalyzableBranch(MachineInstr *Inst) {
-  return Inst->isBranch() && !Inst->isIndirectBranch();
+  return (Inst->isBranch() && !Inst->isIndirectBranch()
+	  // We intentionally know that those will be optimized by us
+	  // during DPUPostRAFusion, don't let split the critical edge
+	  // && !hasPostRAFusionMetadata(Inst)
+	  );
 }
 
 bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
@@ -450,13 +461,22 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
   MIB = BuildMI(&MBB, DL, get(Opc));
 
   for (unsigned i = 1; i < Cond.size(); ++i) {
-    if (Cond[i].isReg() || Cond[i].isImm())
+    if (Cond[i].isReg() || Cond[i].isImm()) {
       MIB->addOperand(Cond[i]);
-    else
+    } else if (Cond[i].isMetadata()) {
+      // skip
+    } else {
       assert(false && "Cannot copy operand");
+    }
   }
 
   MIB.addMBB(TBB);
+
+  for (unsigned i = 0; i < Cond.size(); ++i) {
+     if (Cond[i].isMetadata()) {
+      MIB.addMetadata(Cond[i].getMetadata());
+     }
+  }
 }
 
 unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
@@ -491,3 +511,10 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
     *BytesAdded = nrOfInsertedMachineInstr;
   return nrOfInsertedMachineInstr;
 }
+
+// Instructions tagged for post-RA fusion are never sunk, so they stay with the
+// instructions they are meant to be fused with; everything else defers to the
+// generic TargetInstrInfo policy.
+bool DPUInstrInfo::shouldSink(const MachineInstr &MI) const {
+  return !hasPostRAFusionMetadata(&MI) && TargetInstrInfo::shouldSink(MI);
+}
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.h b/llvm/lib/Target/DPU/DPUInstrInfo.h
index e9c2a3b920a05..14c199c9160e8 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.h
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.h
@@ -65,6 +65,8 @@ class DPUInstrInfo : public DPUGenInstrInfo {
 
   void buildConditionalBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                               DebugLoc DL, ArrayRef<MachineOperand> Cond) const;
+
+  bool shouldSink(const MachineInstr &MI) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/DPU/DPUMCInstLower.cpp b/llvm/lib/Target/DPU/DPUMCInstLower.cpp
index 311c64f86b142..954f3834cc138 100644
--- a/llvm/lib/Target/DPU/DPUMCInstLower.cpp
+++ b/llvm/lib/Target/DPU/DPUMCInstLower.cpp
@@ -102,6 +102,7 @@ void DPUMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
       break;
 
     case MachineOperand::MO_RegisterMask:
+    case MachineOperand::MO_Metadata:
       continue;
 
     case MachineOperand::MO_GlobalAddress:
diff --git a/llvm/lib/Target/DPU/DPUMacroFusion.cpp b/llvm/lib/Target/DPU/DPUMacroFusion.cpp
index a606c017d7cfb..2cec6c8ea4ccd 100644
--- a/llvm/lib/Target/DPU/DPUMacroFusion.cpp
+++ b/llvm/lib/Target/DPU/DPUMacroFusion.cpp
@@ -7,8 +7,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "DPUHelper.h"
 #include "DPUMacroFusion.h"
 #include "DPUSubtarget.h"
+
 #include "llvm/CodeGen/MacroFusion.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
@@ -28,14 +30,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   // We are mainly interested in merging a simple operation with a simple
   // conditional/unconditional branch
   LLVM_DEBUG({
-    dbgs() << "DPU/Merge: checking macro fusion:\n\t";
-    if (!FirstMI)
-      dbgs() << "<NONE>";
-    else
-      FirstMI->dump();
-    dbgs() << "\n\t";
-    SecondMI.dump();
-    dbgs() << "\n";
+    dbgs() << "DPU/Merge: checking macro fusion:\n";
+    if (!FirstMI) {
+      dbgs() << "\t<NONE>\n";
+    } else {
+      dbgs() << "\t"; FirstMI->dump();
+    }
+    dbgs() << "\t"; SecondMI.dump();
   });
 
   if (!FirstMI) {
@@ -45,6 +46,26 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
     return true;
   }
 
+  // check if they are candidate for PostRAFusion
+  if (hasPostRAFusionMetadata(FirstMI)
+      && hasPostRAFusionMetadata(&SecondMI)) {
+    // and if they share operands
+    for (auto &FirstMIOperands : FirstMI->operands()) {
+      if (!FirstMIOperands.isReg())
+	continue;
+
+      for (auto &SecondMIOperands : SecondMI.operands()) {
+	if (!SecondMIOperands.isReg())
+	  continue;
+
+	if (FirstMIOperands.getReg() == SecondMIOperands.getReg()) {
+	  LLVM_DEBUG({ dbgs() << "DPU/Merge: the two instructions can be fused in PostRA\n"; });
+	  return true;
+	}
+      }
+    }
+  }
+
   unsigned firstOpc = FirstMI->getOpcode();
   unsigned secondOpc = SecondMI.getOpcode();
 
diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index 998d4f0d4bcc5..c774c236490ed 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -6,12 +6,14 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+#include "DPU.h"
+#include "DPUHelper.h"
 #include "DPUTargetMachine.h"
-#include <llvm/CodeGen/MachineInstrBuilder.h>
-#include <set>
 
-#include "DPU.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include <llvm/CodeGen/MachineInstrBuilder.h>
+
+#include <set>
 
 #define GET_INSTRINFO_ENUM
 
@@ -188,20 +190,6 @@ static const ISD::CondCode sourceConditions[] = {
     ISD::SETOEQ, ISD::SETOGE, ISD::SETOLT, ISD::SETONE, ISD::SETUEQ,
     ISD::SETEQ,  ISD::SETGE,  ISD::SETLT,  ISD::SETNE};
 
-static MachineInstr *
-getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I,
-                         MachineBasicBlock::reverse_iterator REnd) {
-  // Skip all the debug instructions.
-  while (I != REnd &&
-         (I->isDebugValue() || I->getOpcode() == TargetOpcode::DBG_VALUE)) {
-    ++I;
-  }
-  if (I == REnd) {
-    return NULL;
-  }
-  return &*I;
-}
-
 static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
                                         const DPUInstrInfo &InstrInfo) {
   MachineBasicBlock::reverse_iterator I = MBB->rbegin(), REnd = MBB->rend();
diff --git a/llvm/lib/Target/DPU/DPUPostRAFusion.cpp b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp
new file mode 100644
index 0000000000000..135cb730a443c
--- /dev/null
+++ b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp
@@ -0,0 +1,252 @@
+#include "DPU.h"
+#include "DPUHelper.h"
+#include "DPUTargetMachine.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+#define GET_INSTRINFO_ENUM
+
+#include "DPUCondCodes.h"
+#include "DPUGenInstrInfo.inc"
+#include "DPUISelLowering.h"
+#include "MCTargetDesc/DPUAsmCondition.h"
+
+#define GET_REGINFO_ENUM
+#include "DPUGenRegisterInfo.inc"
+
+#define DEBUG_TYPE "dpu-postra-fusion"
+
+using namespace llvm;
+
+namespace {
+/// Machine pass fusing metadata-tagged instruction sequences that end a block.
+class DPUPostRAFusionPass : public MachineFunctionPass {
+public:
+  /// Target instruction info; set by runOnMachineFunction() before any use.
+  const DPUInstrInfo *TII = nullptr;
+  static char ID;
+
+  explicit DPUPostRAFusionPass(DPUTargetMachine &TM)
+      : MachineFunctionPass(ID), TM(TM) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  llvm::StringRef getPassName() const override { return "DPU PostRA Fusion"; }
+
+private:
+  const DPUTargetMachine &TM;
+  /// Tries to fuse the last instructions of \p MBB; true if \p MBB changed.
+  bool runOnMachineBB(MachineBasicBlock &MBB);
+};
+
+char DPUPostRAFusionPass::ID = 0;
+} // namespace
+
+FunctionPass *llvm::createDPUPostRAFusionPass(DPUTargetMachine &TM) {
+  return new DPUPostRAFusionPass(TM); // heap-allocated pass instance bound to TM
+}
+
+/// Fuses a tagged "shift; and; jeq" or "clz_u; jneq/jeq" tail of \p MBB into
+/// one condition-and-jump instruction. Returns true if \p MBB was changed.
+bool DPUPostRAFusionPass::runOnMachineBB(MachineBasicBlock &MBB) {
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
+
+  MachineInstr *LastInst = getLastNonDebugInstrFrom(I, REnd);
+  if (LastInst == nullptr) {
+    return false;
+  }
+  ++I;
+  MachineInstr *SecondLastInst = getLastNonDebugInstrFrom(I, REnd);
+  if (SecondLastInst == nullptr) {
+    return false;
+  }
+
+  const unsigned int LastOpc = LastInst->getOpcode();
+  const unsigned int SecondLastOpc = SecondLastInst->getOpcode();
+  // Only instructions tagged with the post-RA fusion metadata are fused.
+  if (!hasPostRAFusionMetadata(LastInst)
+      || !hasPostRAFusionMetadata(SecondLastInst)) {
+    return false;
+  }
+
+  DebugLoc DL = SecondLastInst->getDebugLoc();
+
+  // Try to fuse a "shift (lsl/lsr variant); and XX 32; jeq XX 32" tail whose
+  // three instructions all carry the special fusion metadata.
+  // TODO: implement a more generic matcher that does not need the metadata
+  // TODO: split-critical-edge could break the BB and reverse cond+branch
+  if ((LastOpc == DPU::JEQrii // || LastOpc == DPU::JNEQrii
+       )
+      && SecondLastOpc == DPU::ANDrri) {
+    ++I;
+    MachineInstr *ThirdLastInst = getLastNonDebugInstrFrom(I, REnd);
+    if (ThirdLastInst == nullptr) {
+      // LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n");
+      return false;
+    }
+
+    if (!hasPostRAFusionMetadata(ThirdLastInst)) {
+      // Not expected when the two following instructions are tagged; be safe.
+      return false;
+    }
+
+    unsigned int ThirdLastOpc = ThirdLastInst->getOpcode();
+    if (ThirdLastOpc == DPU::LSLXrrr || ThirdLastOpc == DPU::LSRXrrr
+        || ThirdLastOpc == DPU::LSLrrr || ThirdLastOpc == DPU::LSRrrr) {
+
+      LLVM_DEBUG({
+        dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+        dbgs() << "before change: \n";
+        dbgs() << "** MBB "; MBB.dump();
+      });
+
+      unsigned int NewOpcode;
+      // Map the plain shift to its variant taking a condition and a target.
+      switch (ThirdLastOpc) {
+      default:
+        report_fatal_error("This should not happen. Please report to UPMEM.");
+        break;
+
+      case DPU::LSLXrrr:
+        NewOpcode = DPU::LSLXrrrci;
+        break;
+
+      case DPU::LSRXrrr:
+        NewOpcode = DPU::LSRXrrrci;
+        break;
+
+      case DPU::LSLrrr:
+        NewOpcode = DPU::LSLrrrci;
+        break;
+
+      case DPU::LSRrrr:
+        NewOpcode = DPU::LSRrrrci;
+        break;
+      }
+      // BuildMI on a block pointer appends the fused instruction at its end.
+      MachineInstrBuilder ComboInst = BuildMI(&MBB, ThirdLastInst->getDebugLoc(),
+                                              TII->get(NewOpcode),
+                                              ThirdLastInst->getOperand(0).getReg());
+      ComboInst.add(ThirdLastInst->getOperand(1));
+      ComboInst.add(ThirdLastInst->getOperand(2));
+      ComboInst.addImm(DPUAsmCondition::Condition::Shift32);
+      ComboInst.addMBB(LastInst->getOperand(2).getMBB());
+      // ComboInst.addMetadata(N);
+
+      LLVM_DEBUG({
+        dbgs() << "OK\n";
+        dbgs() << "del "; ThirdLastInst->dump();
+        dbgs() << "del "; SecondLastInst->dump();
+        dbgs() << "del "; LastInst->dump();
+        dbgs() << "fused to\n";
+        dbgs() << "add "; ComboInst->dump();
+      });
+
+      LastInst->eraseFromParent();
+      SecondLastInst->eraseFromParent();
+      ThirdLastInst->eraseFromParent();
+      LLVM_DEBUG({
+        dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+        dbgs() << "after change: \n";
+        dbgs() << "** MBB "; MBB.dump();
+      });
+      return true;
+    }
+  }
+
+  // Disabled: attempt to fuse MUL_UL_ULrrr + compare result with 256 + branch.
+  // Original code is JLTUrii, but JGEUrii could be introduced by analyzeBranch.
+  // if ((LastOpc == DPU::JLTUrii || LastOpc == DPU::JGEUrii)
+  //     && SecondLastOpc == DPU::MUL_UL_ULrrr) {
+
+  //   LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "before change: \n";
+  //     dbgs() << "** MBB "; MBB->dump();
+  //     });
+
+  //   MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
+  //                                           InstrInfo.get(DPU::MUL_UL_ULrrrci),
+  //                                           SecondLastInst->getOperand(0).getReg());
+  //   ComboInst.add(SecondLastInst->getOperand(1));
+  //   ComboInst.add(SecondLastInst->getOperand(1)); // NOTE(review): operand(2)?
+  //   ComboInst.addImm(DPUAsmCondition::Small);
+  //   ComboInst.addMBB(LastInst->getOperand(2).getMBB());
+  //   // ComboInst.addMetadata(N);
+
+  //   LLVM_DEBUG({
+  //     dbgs() << "OK\n";
+  //     dbgs() << "del "; SecondLastInst->dump();
+  //     dbgs() << "del "; LastInst->dump();
+  //     dbgs() << "fused to\n";
+  //     dbgs() << "add "; ComboInst->dump();
+  //     });
+  //   LastInst->eraseFromParent();
+  //   SecondLastInst->eraseFromParent();
+
+  //   LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "after change: \n";
+  //     dbgs() << "** MBB "; MBB->dump();
+  //     });
+
+  //   return true;
+  // }
+
+  // The original code is JNEQrii, but split-critical-edge may break the BB and
+  // reverse cond+branch, so JEQrii is matched too (with the other condition).
+  if ((LastOpc == DPU::JNEQrii || LastOpc == DPU::JEQrii)
+      && SecondLastOpc == DPU::CLZ_Urr) {
+
+    LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "before change: \n";
+      dbgs() << "** MBB "; MBB.dump();
+    });
+
+    MachineInstrBuilder ComboInst = BuildMI(&MBB, DL, TII->get(DPU::CLZ_Urrci),
+                                            SecondLastInst->getOperand(0).getReg());
+    ComboInst.add(SecondLastInst->getOperand(1));
+    ComboInst.addImm((LastOpc == DPU::JNEQrii) ?
+                     DPUAsmCondition::Condition::NotMaximum : DPUAsmCondition::Condition::Maximum);
+    ComboInst.addMBB(LastInst->getOperand(2).getMBB());
+    // ComboInst.addMetadata(N);
+
+    LLVM_DEBUG({
+      dbgs() << "OK\n";
+      dbgs() << "del "; SecondLastInst->dump();
+      dbgs() << "del "; LastInst->dump();
+      dbgs() << "fused to\n";
+      dbgs() << "add "; ComboInst->dump();
+    });
+
+    LastInst->eraseFromParent();
+    SecondLastInst->eraseFromParent();
+
+    LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "after change: \n";
+      dbgs() << "** MBB "; MBB.dump();
+    });
+
+    return true;
+  }
+
+  return false;
+}
+
+bool DPUPostRAFusionPass::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "********** DPU/DPUPostRAFusionPass: " << MF.getName()
+                    << " **********\n\n");
+
+  // runOnMachineBB() builds the fused instructions through TII.
+  TII = static_cast<const DPUInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  bool Changed = false;
+  for (MachineBasicBlock &Block : MF)
+    Changed = runOnMachineBB(Block) || Changed;
+
+  LLVM_DEBUG(dbgs() << "********** DPU/DPUPostRAFusionPass: " << MF.getName()
+                    << " done: Modified = " << Changed << " **********\n\n");
+  return Changed;
+}
diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
index beb7d532e2d00..4dd9f7b0fdfe1 100644
--- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
@@ -12,10 +12,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "DPUTargetLowering.h"
+#include "DPUHelper.h"
 #include "DPUISelLowering.h"
 #include "DPUMachineFunctionInfo.h"
+#include "DPUTargetLowering.h"
 #include "DPUTargetMachine.h"
+
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -24,8 +26,9 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/IntrinsicsDPU.h"
-#include <iostream>
-#include <llvm/MC/MCSymbol.h>
+#include "llvm/MC/MCSymbol.h"
+
+// #include <iostream>
 
 #define GET_REGINFO_ENUM
 
@@ -1971,18 +1974,18 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   BB->addSuccessor(fastMBB);
   slowMBB->addSuccessor(fastMBB);
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int Op1 = MI.getOperand(1).getReg();
-  unsigned int Op2 = MI.getOperand(2).getReg();
+  Register Dest = MI.getOperand(0).getReg();
+  Register Op1 = MI.getOperand(1).getReg();
+  Register Op2 = MI.getOperand(2).getReg();
 
   MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned int LLDest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int HLDest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int HL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int HHDest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int LSL1Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register LLDest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register HLDest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register HL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register HHDest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register LSL1Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
   BuildMI(BB, dl, TII.get(MulLL), LLDest)
       .addReg(Op1)
@@ -2039,10 +2042,10 @@ static MachineBasicBlock *EmitSelectWithCustomInserter(MachineInstr &MI,
   BB->addSuccessor(endMBB);
   trueMBB->addSuccessor(endMBB);
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int CondReg = MI.getOperand(1).getReg();
-  unsigned int TrueReg = MI.getOperand(2).getReg();
-  unsigned int FalseReg = MI.getOperand(3).getReg();
+  Register Dest = MI.getOperand(0).getReg();
+  Register CondReg = MI.getOperand(1).getReg();
+  Register TrueReg = MI.getOperand(2).getReg();
+  Register FalseReg = MI.getOperand(3).getReg();
 
   MachineRegisterInfo &RI = F->getRegInfo();
   unsigned FalseResultReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
@@ -2088,10 +2091,10 @@ EmitSelect64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   BB->addSuccessor(trueMBB);
   BB->addSuccessor(endMBB);
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int CondReg = MI.getOperand(1).getReg();
-  unsigned int TrueReg = MI.getOperand(2).getReg();
-  unsigned int FalseReg = MI.getOperand(3).getReg();
+  Register Dest = MI.getOperand(0).getReg();
+  Register CondReg = MI.getOperand(1).getReg();
+  Register TrueReg = MI.getOperand(2).getReg();
+  Register FalseReg = MI.getOperand(3).getReg();
 
   BuildMI(BB, dl, TII.get(DPU::Jcci))
       .addImm(ISD::CondCode::SETEQ)
@@ -2119,12 +2122,12 @@ EmitMramSubStoreWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   MachineFunction *F = BB->getParent();
 
   MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MramAddrMSBReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned ExactWramCacheAddrReg =
+  Register WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MramAddrMSBReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register ExactWramCacheAddrReg =
       RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int storeRegister = MI.getOperand(0).getReg();
+  Register storeRegister = MI.getOperand(0).getReg();
 
   // todo __sw_cache_buffer should have abstract representation
 
@@ -2175,8 +2178,8 @@ EmitMramStoreDoubleWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   MachineFunction *F = BB->getParent();
 
   MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
   // todo __sw_cache_buffer should have abstract representation
 
@@ -2214,10 +2217,10 @@ EmitMramSubLoadWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   MachineFunction *F = BB->getParent();
 
   MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MramAddrMSBReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned ExactWramCacheAddrReg =
+  Register WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MramAddrMSBReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register ExactWramCacheAddrReg =
       RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
   // todo __sw_cache_buffer should have abstract representation
@@ -2263,8 +2266,8 @@ EmitMramLoadDoubleWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   MachineFunction *F = BB->getParent();
 
   MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
   // todo __sw_cache_buffer should have abstract representation
 
@@ -2294,124 +2297,149 @@ EmitMramLoadDoubleWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   return BB;
 }
 
-static MachineBasicBlock *
-EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
+static MachineBasicBlock *emitLsl64RegisterWithCustomInserter(MachineInstr &MI,
+							      MachineBasicBlock *MBB) {
   /*
       What we want to generate (with dc.h != rb in that example):
-      lslx       __R0, da.l, rb, ?sh32 @+4
+      lslx       __R0, da.l, rb, ?sh32 @bigShift
+    smallShift:
       lsl        dc.h, da.h, rb
       or         dc.h, dc.h, __R0
-      lsl        dc.l, da.l, rb, ?true @+3
+      lsl        dc.l, da.l, rb, ?true @end
+    bigShift:
       lsl        dc.h, da.l, rb
       move       dc.l, 0
+    end:
    */
-  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
-  DebugLoc dl = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = ++BB->getIterator();
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *smallShiftMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *bigShiftMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, smallShiftMBB);
-  F->insert(I, bigShiftMBB);
-  F->insert(I, endMBB);
+  MachineFunction *MF = MBB->getParent();
+
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineBasicBlock *smallShiftMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *bigShiftMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *endMBB = MF->CreateMachineBasicBlock(BB);
+
+  MachineFunction::iterator I = ++MBB->getIterator();
+  MF->insert(I, smallShiftMBB);
+  MF->insert(I, bigShiftMBB);
+  MF->insert(I, endMBB);
+
+  // Move all instructions after the instruction to endMBB.
+  endMBB->splice(endMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+
   // Update machine-CFG edges by transferring all successors of the current
   // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), BB,
-                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(BB);
+  endMBB->transferSuccessorsAndUpdatePHIs(MBB);
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int Op1Reg = MI.getOperand(1).getReg();
-  unsigned int ShiftReg = MI.getOperand(2).getReg();
+  Register Dest = MI.getOperand(0).getReg();
+  Register Op1Reg = MI.getOperand(1).getReg();
+  Register ShiftReg = MI.getOperand(2).getReg();
 
-  MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned LsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  Register LsbToMsbPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MsbToMsbPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register LsbOp1Reg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MsbOp1Reg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register ShiftCheckReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+
+  Register BigShiftMsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register BigShiftLsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  unsigned BigShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned BigShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register SmallShiftMsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register SmallShiftLsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  unsigned SmallShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned SmallShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register BigShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register SmallShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
-  unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SmallShiftResultReg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register BigShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register SmallShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register Undef2Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
-  unsigned BigShiftResultPart0Reg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SmallShiftResultPart0Reg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned Undef2Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  MDNode *MDN = getPostRAFusionMetadata(MF);
 
-  BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg)
+  BuildMI(MBB, DL, TII.get(DPU::COPY), LsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
 
-  BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
+  // BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
+  //     .addReg(LsbOp1Reg)
+  //     .addReg(ShiftReg)
+  //     .addImm(DPUAsmCondition::Condition::Shift32)
+  //     .addMBB(bigShiftMBB);
+
+  BuildMI(MBB, DL, TII.get(DPU::LSLXrrr), LsbToMsbPartReg)
       .addReg(LsbOp1Reg)
       .addReg(ShiftReg)
-      .addImm(DPUAsmCondition::Condition::Shift32)
-      .addMBB(bigShiftMBB);
+      .addMetadata(MDN);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg)
+  BuildMI(MBB, DL, TII.get(DPU::ANDrri), ShiftCheckReg)
+      .addReg(ShiftReg)
+      .addImm(0x20)
+      .addMetadata(MDN);
+
+  BuildMI(MBB, DL, TII.get(DPU::JEQrii))
+      .addReg(ShiftCheckReg)
+      .addImm(0x20)
+      .addMBB(bigShiftMBB)
+      .addMetadata(MDN);
+
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::COPY), MsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), MsbToMsbPartReg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::LSLrrr), MsbToMsbPartReg)
       .addReg(MsbOp1Reg)
       .addReg(ShiftReg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftMsbReg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::ORrrr), SmallShiftMsbReg)
       .addReg(MsbToMsbPartReg)
       .addReg(LsbToMsbPartReg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), SmallShiftLsbReg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::LSLrrr), SmallShiftLsbReg)
       .addReg(LsbOp1Reg)
       .addReg(ShiftReg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), Undef2Reg);
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), Undef2Reg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG),
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG),
           SmallShiftResultPart0Reg)
       .addReg(Undef2Reg)
       .addReg(SmallShiftLsbReg)
       .addImm(DPU::sub_32bit);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg)
       .addReg(SmallShiftResultPart0Reg)
       .addReg(SmallShiftMsbReg)
       .addImm(DPU::sub_32bit_hi);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::JUMPi)).addMBB(endMBB);
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::JUMPi)).addMBB(endMBB);
 
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::LSLrrr), BigShiftMsbReg)
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::LSLrrr), BigShiftMsbReg)
       .addReg(LsbOp1Reg)
       .addReg(ShiftReg);
 
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::MOVEri), BigShiftLsbReg).addImm(0);
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::MOVEri), BigShiftLsbReg).addImm(0);
 
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
 
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), BigShiftResultPart0Reg)
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), BigShiftResultPart0Reg)
       .addReg(UndefReg)
       .addReg(BigShiftLsbReg)
       .addImm(DPU::sub_32bit);
 
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), BigShiftResultReg)
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), BigShiftResultReg)
       .addReg(BigShiftResultPart0Reg)
       .addReg(BigShiftMsbReg)
       .addImm(DPU::sub_32bit_hi);
 
-  BB->addSuccessor(smallShiftMBB);
-  BB->addSuccessor(bigShiftMBB);
+  MBB->addSuccessor(smallShiftMBB);
+  MBB->addSuccessor(bigShiftMBB);
   smallShiftMBB->addSuccessor(endMBB);
   bigShiftMBB->addSuccessor(endMBB);
 
-  BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
+  BuildMI(*endMBB, endMBB->begin(), DL, TII.get(DPU::PHI), Dest)
       .addReg(BigShiftResultReg)
       .addMBB(bigShiftMBB)
       .addReg(SmallShiftResultReg)
@@ -2428,8 +2456,8 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   MachineFunction *F = BB->getParent();
   MachineRegisterInfo &RI = F->getRegInfo();
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int Op1Reg = MI.getOperand(1).getReg();
+  Register Dest = MI.getOperand(0).getReg();
+  Register Op1Reg = MI.getOperand(1).getReg();
   int64_t ShiftImm = MI.getOperand(2).getImm();
 
   if (ShiftImm < 32) {
@@ -2439,13 +2467,13 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
           lsl_add dc.h __R0 da.h ShiftImm
           lsl     dc.l da.l ShiftImm
      */
-    unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultMsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-    unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+    Register Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultMsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+    Register ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
     BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb)
         .addReg(Op1Reg, 0, DPU::sub_32bit);
@@ -2484,10 +2512,10 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
           lsl dc.h da.l ${ShiftImm - 32}
           move dc.l 0
        */
-      unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-      unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-      unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-      unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+      Register ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+      Register ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+      Register UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+      Register ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
       BuildMI(*BB, MI, dl, TII.get(DPU::MOVEri), ResultLsb).addImm(0);
       BuildMI(*BB, MI, dl, TII.get(DPU::LSLrri), ResultMsb)
@@ -2512,10 +2540,10 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
         move dc.h da.l
         move dc.l 0
      */
-    unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-    unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+    Register ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+    Register ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
     BuildMI(*BB, MI, dl, TII.get(DPU::MOVEri), ResultLsb).addImm(0);
     BuildMI(*BB, MI, dl, TII.get(DPU::COPY), ResultMsb)
@@ -2538,101 +2566,128 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   return BB;
 }
 
-static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
-    MachineInstr &MI, MachineBasicBlock *BB, unsigned int shiftRight,
+static MachineBasicBlock *emitShiftRight64RegisterWithCustomInserter(
+    MachineInstr &MI, MachineBasicBlock *MBB, unsigned int shiftRight,
     unsigned int shiftRightExtended) {
   /*
       What we want to generate (with dc.l != rb in that example):
-      lsrx    __R0, da.h, rb, ?sh32 @+4
+      lsrx    __R0, da.h, rb, ?sh32 @bigShift
+    smallShift:
       lsr     dc.l, da.l, rb
       or      dc.l, dc.l, __R0
-      lsr     dc.h, da.h, rb, ?true @+2       // asr     dc.h, da.h, rb, ?true
-     @+2 lsr.u   dc, da.h, rb                    // asr.s   dc, da.h, rb
+      lsr     dc.h, da.h, rb, ?true @end   // asr     dc.h, da.h, rb, ?true @end
+     bigShift:
+      lsr.u   dc, da.h, rb                 // asr.s   dc, da.h, rb
+     end:
    */
-  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
-  DebugLoc dl = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = ++BB->getIterator();
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *smallShiftMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *bigShiftMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, smallShiftMBB);
-  F->insert(I, bigShiftMBB);
-  F->insert(I, endMBB);
+  MachineFunction *MF = MBB->getParent();
+
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineBasicBlock *smallShiftMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *bigShiftMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *endMBB = MF->CreateMachineBasicBlock(BB);
+
+  MachineFunction::iterator I = ++MBB->getIterator();
+  MF->insert(I, smallShiftMBB);
+  MF->insert(I, bigShiftMBB);
+  MF->insert(I, endMBB);
+
+  // Move all instructions that follow MI into endMBB.
+  endMBB->splice(endMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+
   // Update machine-CFG edges by transferring all successors of the current
   // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), BB,
-                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(BB);
+  endMBB->transferSuccessorsAndUpdatePHIs(MBB);
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int Op1Reg = MI.getOperand(1).getReg();
-  unsigned int ShiftReg = MI.getOperand(2).getReg();
+  Register Dest = MI.getOperand(0).getReg();
+  Register Op1Reg = MI.getOperand(1).getReg();
+  Register ShiftReg = MI.getOperand(2).getReg();
 
-  MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned LsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned SmallShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned SmallShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SmallShiftResultPart0Reg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SmallShiftResultReg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-
-  BuildMI(BB, dl, TII.get(DPU::COPY), MsbOp1Reg)
-      .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  Register LsbToLsbPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MsbToLsbPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register LsbOp1Reg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register MsbOp1Reg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register ShiftCheckReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
-      .addReg(MsbOp1Reg)
-      .addReg(ShiftReg)
-      .addImm(DPUAsmCondition::Condition::Shift32)
-      .addMBB(bigShiftMBB);
+  Register SmallShiftLsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register SmallShiftMsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+
+  Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register SmallShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register SmallShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register BigShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+
+  MDNode *MDN = getPostRAFusionMetadata(MF);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), LsbOp1Reg)
+  BuildMI(MBB, DL, TII.get(DPU::COPY), MsbOp1Reg)
+      .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
+
+  // BuildMI(MBB, DL, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
+  //     .addReg(MsbOp1Reg)
+  //     .addReg(ShiftReg)
+  //     .addImm(DPUAsmCondition::Condition::Shift32)
+  //     .addMBB(bigShiftMBB);
+
+  BuildMI(MBB, DL, TII.get(DPU::LSRXrrr), MsbToLsbPartReg)
+    .addReg(MsbOp1Reg)
+    .addReg(ShiftReg)
+    .addMetadata(MDN);
+
+  BuildMI(MBB, DL, TII.get(DPU::ANDrri), ShiftCheckReg)
+    .addReg(ShiftReg)
+    .addImm(0x20)
+    .addMetadata(MDN);
+
+  BuildMI(MBB, DL, TII.get(DPU::JEQrii))
+    .addReg(ShiftCheckReg)
+    .addImm(0x20)
+    .addMBB(bigShiftMBB)
+    .addMetadata(MDN);
+
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::COPY), LsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::LSRrrr), LsbToLsbPartReg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::LSRrrr), LsbToLsbPartReg)
       .addReg(LsbOp1Reg)
       .addReg(ShiftReg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftLsbReg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::ORrrr), SmallShiftLsbReg)
       .addReg(MsbToLsbPartReg)
       .addReg(LsbToLsbPartReg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(shiftRight), SmallShiftMsbReg)
+  BuildMI(smallShiftMBB, DL, TII.get(shiftRight), SmallShiftMsbReg)
       .addReg(MsbOp1Reg)
       .addReg(ShiftReg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG),
-          SmallShiftResultPart0Reg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultPart0Reg)
       .addReg(UndefReg)
       .addReg(SmallShiftLsbReg)
       .addImm(DPU::sub_32bit);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg)
       .addReg(SmallShiftResultPart0Reg)
       .addReg(SmallShiftMsbReg)
       .addImm(DPU::sub_32bit_hi);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::JUMPi)).addMBB(endMBB);
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::JUMPi)).addMBB(endMBB);
 
-  BuildMI(bigShiftMBB, dl, TII.get(shiftRightExtended), BigShiftResultReg)
+  BuildMI(bigShiftMBB, DL, TII.get(shiftRightExtended), BigShiftResultReg)
       .addReg(MsbOp1Reg)
       .addReg(ShiftReg);
 
-  BB->addSuccessor(smallShiftMBB);
-  BB->addSuccessor(bigShiftMBB);
+  MBB->addSuccessor(smallShiftMBB);
+  MBB->addSuccessor(bigShiftMBB);
   smallShiftMBB->addSuccessor(endMBB);
   bigShiftMBB->addSuccessor(endMBB);
 
-  BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
+  BuildMI(*endMBB, endMBB->begin(), DL, TII.get(DPU::PHI), Dest)
       .addReg(BigShiftResultReg)
       .addMBB(bigShiftMBB)
       .addReg(SmallShiftResultReg)
@@ -2650,8 +2705,8 @@ static MachineBasicBlock *EmitShiftRight64ImmediateWithCustomInserter(
   MachineFunction *F = BB->getParent();
   MachineRegisterInfo &RI = F->getRegInfo();
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int Op1Reg = MI.getOperand(1).getReg();
+  Register Dest = MI.getOperand(0).getReg();
+  Register Op1Reg = MI.getOperand(1).getReg();
   int64_t ShiftImm = MI.getOperand(2).getImm();
 
   if (ShiftImm < 32) {
@@ -2661,13 +2716,13 @@ static MachineBasicBlock *EmitShiftRight64ImmediateWithCustomInserter(
           lsr_add  dc.l __R0 da.l ShiftImm
           lsr      dc.h da.h ShiftImm       // asr      dc.h da.h ShiftImm
      */
-    unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultLsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-    unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+    Register Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultLsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+    Register ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
     BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb)
         .addReg(Op1Reg, 0, DPU::sub_32bit);
@@ -2723,7 +2778,7 @@ static MachineBasicBlock *EmitShiftRight64ImmediateWithCustomInserter(
 }
 
 static MachineBasicBlock *
-EmitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
+emitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB,
                                     unsigned int lsN, unsigned int lsNJump,
                                     unsigned int lsNx) {
   /*
@@ -2732,127 +2787,144 @@ EmitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
         lsNx    __R0, da.l, rb
         lsNx    __R1, da.h, rb
         lsN     dc.h, da.h, rb
-        lsN     __R2, da.l, rb  , ?sh32 @+3
+        lsN     __R2, da.l, rb  , ?sh32 @bigShift
         or      dc.h, dc.h, __R0
-        or      dc.l, __R2, __R1, ?true @+3
+        or      dc.l, __R2, __R1, ?true @end
+     bigShift:
         or      dc.l, dc.h, __R0
         or      dc.h, __R2, __R1
+     end:
    */
-  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
-  DebugLoc dl = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = ++BB->getIterator();
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *smallShiftMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *bigShiftMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, smallShiftMBB);
-  F->insert(I, bigShiftMBB);
-  F->insert(I, endMBB);
-  // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), BB,
-                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(BB);
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int Op1Reg = MI.getOperand(1).getReg();
-  unsigned int ShiftReg = MI.getOperand(2).getReg();
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineBasicBlock *smallShiftMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *bigShiftMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *endMBB = MF->CreateMachineBasicBlock(BB);
 
-  MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned Op1LsbShiftX = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned Op1MsbShiftX = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned Op1LsbShift = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned Op1MsbShift = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned SmallShiftLsbResultReg =
-      RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned SmallShiftMsbResultReg =
-      RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned BigShiftLsbResultReg =
-      RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned BigShiftMsbResultReg =
-      RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SmallShiftResultReg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-
-  unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned UndefReg1 = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SmallShiftResultPart0Reg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned BigShiftResultPart0Reg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-
-  BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb)
+  MachineFunction::iterator I = ++MBB->getIterator();
+  MF->insert(I, smallShiftMBB);
+  MF->insert(I, bigShiftMBB);
+  MF->insert(I, endMBB);
+
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  endMBB->splice(endMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  endMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  Register Dest = MI.getOperand(0).getReg();
+  Register Op1Reg = MI.getOperand(1).getReg();
+  Register ShiftReg = MI.getOperand(2).getReg();
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  Register Op1Lsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register Op1Msb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register Op1LsbShiftX = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register Op1MsbShiftX = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register Op1LsbShift = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register Op1MsbShift = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register SmallShiftLsbResultReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register SmallShiftMsbResultReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register BigShiftLsbResultReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register BigShiftMsbResultReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register BigShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register SmallShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+
+  Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register UndefReg1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register SmallShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register BigShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+
+  Register ShiftCheckReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  MDNode *MDN = getPostRAFusionMetadata(MF);
+
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Lsb)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
-  BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Msb)
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Msb)
       .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
 
-  BuildMI(*BB, MI, dl, TII.get(lsNx), Op1MsbShiftX)
+  BuildMI(*MBB, MI, DL, TII.get(lsNx), Op1MsbShiftX)
       .addReg(Op1Msb)
       .addReg(ShiftReg);
-  BuildMI(*BB, MI, dl, TII.get(lsNx), Op1LsbShiftX)
+  BuildMI(*MBB, MI, DL, TII.get(lsNx), Op1LsbShiftX)
       .addReg(Op1Lsb)
       .addReg(ShiftReg);
 
-  BuildMI(*BB, MI, dl, TII.get(lsN), Op1MsbShift)
+  BuildMI(*MBB, MI, DL, TII.get(lsN), Op1MsbShift)
       .addReg(Op1Msb)
       .addReg(ShiftReg);
-  BuildMI(*BB, MI, dl, TII.get(lsNJump), Op1LsbShift)
+
+  // BuildMI(*MBB, MI, DL, TII.get(lsNJump), Op1LsbShift)
+  //     .addReg(Op1Lsb)
+  //     .addReg(ShiftReg)
+  //     .addImm(DPUAsmCondition::Condition::Shift32)
+  //     .addMBB(bigShiftMBB);
+  BuildMI(*MBB, MI, DL, TII.get(lsN), Op1LsbShift)
       .addReg(Op1Lsb)
       .addReg(ShiftReg)
-      .addImm(DPUAsmCondition::Condition::Shift32)
-      .addMBB(bigShiftMBB);
-
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftMsbResultReg)
+      .addMetadata(MDN);
+  BuildMI(MBB, DL, TII.get(DPU::ANDrri), ShiftCheckReg)
+    .addReg(ShiftReg)
+    .addImm(0x20)
+    .addMetadata(MDN)
+    ;
+  BuildMI(MBB, DL, TII.get(DPU::JEQrii))
+    .addReg(ShiftCheckReg)
+    .addImm(0x20)
+    .addMBB(bigShiftMBB)
+    .addMetadata(MDN)
+    ;
+
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::ORrrr), SmallShiftMsbResultReg)
       .addReg(Op1MsbShift)
       .addReg(Op1LsbShiftX);
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftLsbResultReg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::ORrrr), SmallShiftLsbResultReg)
       .addReg(Op1LsbShift)
       .addReg(Op1MsbShiftX);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG),
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG),
           SmallShiftResultPart0Reg)
       .addReg(UndefReg)
       .addReg(SmallShiftLsbResultReg)
       .addImm(DPU::sub_32bit);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg)
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg)
       .addReg(SmallShiftResultPart0Reg)
       .addReg(SmallShiftMsbResultReg)
       .addImm(DPU::sub_32bit_hi);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::JUMPi)).addMBB(endMBB);
+  BuildMI(smallShiftMBB, DL, TII.get(DPU::JUMPi)).addMBB(endMBB);
 
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::ORrrr), BigShiftLsbResultReg)
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::ORrrr), BigShiftLsbResultReg)
       .addReg(Op1MsbShift)
       .addReg(Op1LsbShiftX);
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::ORrrr), BigShiftMsbResultReg)
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::ORrrr), BigShiftMsbResultReg)
       .addReg(Op1LsbShift)
       .addReg(Op1MsbShiftX);
 
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg1);
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg1);
 
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), BigShiftResultPart0Reg)
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), BigShiftResultPart0Reg)
       .addReg(UndefReg1)
       .addReg(BigShiftLsbResultReg)
       .addImm(DPU::sub_32bit);
 
-  BuildMI(bigShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), BigShiftResultReg)
+  BuildMI(bigShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), BigShiftResultReg)
       .addReg(BigShiftResultPart0Reg)
       .addReg(BigShiftMsbResultReg)
       .addImm(DPU::sub_32bit_hi);
 
-  BB->addSuccessor(smallShiftMBB);
-  BB->addSuccessor(bigShiftMBB);
+  MBB->addSuccessor(smallShiftMBB);
+  MBB->addSuccessor(bigShiftMBB);
   smallShiftMBB->addSuccessor(endMBB);
   bigShiftMBB->addSuccessor(endMBB);
 
-  BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
+  BuildMI(*endMBB, endMBB->begin(), DL, TII.get(DPU::PHI), Dest)
       .addReg(BigShiftResultReg)
       .addMBB(bigShiftMBB)
       .addReg(SmallShiftResultReg)
@@ -2863,15 +2935,15 @@ EmitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
 }
 
 static MachineBasicBlock *
-EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
+emitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB,
                                      unsigned int lsNx, unsigned int lsN_add) {
-  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
-  DebugLoc dl = MI.getDebugLoc();
-  MachineFunction *F = BB->getParent();
-  MachineRegisterInfo &RI = F->getRegInfo();
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int Op1Reg = MI.getOperand(1).getReg();
+  Register Dest = MI.getOperand(0).getReg();
+  Register Op1Reg = MI.getOperand(1).getReg();
   int64_t ShiftImm = MI.getOperand(2).getImm();
 
   ShiftImm = ShiftImm % 64;
@@ -2884,43 +2956,43 @@ EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
         lsN_add dc.l, __R1, da.l, imm
         lsN_add dc.h, __R0, da.h, imm
      */
-    unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultLsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultMsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-    unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-
-    BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb)
+    Register Op1Lsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register Op1Msb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultLsbPart = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultLsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultMsbPart = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultMsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+    Register ResultPart = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+
+    BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Lsb)
         .addReg(Op1Reg, 0, DPU::sub_32bit);
-    BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Msb)
+    BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Msb)
         .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
 
-    BuildMI(*BB, MI, dl, TII.get(lsNx), ResultLsbPart)
+    BuildMI(*MBB, MI, DL, TII.get(lsNx), ResultLsbPart)
         .addReg(Op1Msb)
         .addImm(ShiftImm);
-    BuildMI(*BB, MI, dl, TII.get(lsNx), ResultMsbPart)
+    BuildMI(*MBB, MI, DL, TII.get(lsNx), ResultMsbPart)
         .addReg(Op1Lsb)
         .addImm(ShiftImm);
-    BuildMI(*BB, MI, dl, TII.get(lsN_add), ResultLsb)
+    BuildMI(*MBB, MI, DL, TII.get(lsN_add), ResultLsb)
         .addReg(ResultLsbPart)
         .addReg(Op1Lsb)
         .addImm(ShiftImm);
-    BuildMI(*BB, MI, dl, TII.get(lsN_add), ResultMsb)
+    BuildMI(*MBB, MI, DL, TII.get(lsN_add), ResultMsb)
         .addReg(ResultMsbPart)
         .addReg(Op1Msb)
         .addImm(ShiftImm);
 
-    BuildMI(*BB, MI, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+    BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
 
-    BuildMI(*BB, MI, dl, TII.get(DPU::INSERT_SUBREG), ResultPart)
+    BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), ResultPart)
         .addReg(UndefReg)
         .addReg(ResultLsb)
         .addImm(DPU::sub_32bit);
 
-    BuildMI(*BB, MI, dl, TII.get(DPU::INSERT_SUBREG), Dest)
+    BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dest)
         .addReg(ResultPart)
         .addReg(ResultMsb)
         .addImm(DPU::sub_32bit_hi);
@@ -2932,43 +3004,43 @@ EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
         lsN_add dc.h, __R1, da.l, ${ShiftImm - 32}
         lsN_add dc.l, __R0, da.h, ${ShiftImm - 32}
      */
-    unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultLsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultMsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-    unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-    unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-
-    BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb)
+    Register Op1Lsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register Op1Msb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultLsbPart = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultLsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultMsbPart = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register ResultMsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+    Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+    Register ResultPart = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+
+    BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Lsb)
         .addReg(Op1Reg, 0, DPU::sub_32bit);
-    BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Msb)
+    BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Msb)
         .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
 
-    BuildMI(*BB, MI, dl, TII.get(lsNx), ResultLsbPart)
+    BuildMI(*MBB, MI, DL, TII.get(lsNx), ResultLsbPart)
         .addReg(Op1Lsb)
         .addImm(ShiftImm - 32);
-    BuildMI(*BB, MI, dl, TII.get(lsNx), ResultMsbPart)
+    BuildMI(*MBB, MI, DL, TII.get(lsNx), ResultMsbPart)
         .addReg(Op1Msb)
         .addImm(ShiftImm - 32);
-    BuildMI(*BB, MI, dl, TII.get(lsN_add), ResultLsb)
+    BuildMI(*MBB, MI, DL, TII.get(lsN_add), ResultLsb)
         .addReg(ResultLsbPart)
         .addReg(Op1Msb)
         .addImm(ShiftImm - 32);
-    BuildMI(*BB, MI, dl, TII.get(lsN_add), ResultMsb)
+    BuildMI(*MBB, MI, DL, TII.get(lsN_add), ResultMsb)
         .addReg(ResultMsbPart)
         .addReg(Op1Lsb)
         .addImm(ShiftImm - 32);
 
-    BuildMI(*BB, MI, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+    BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
 
-    BuildMI(*BB, MI, dl, TII.get(DPU::INSERT_SUBREG), ResultPart)
+    BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), ResultPart)
         .addReg(UndefReg)
         .addReg(ResultLsb)
         .addImm(DPU::sub_32bit);
 
-    BuildMI(*BB, MI, dl, TII.get(DPU::INSERT_SUBREG), Dest)
+    BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dest)
         .addReg(ResultPart)
         .addReg(ResultMsb)
         .addImm(DPU::sub_32bit_hi);
@@ -2977,82 +3049,108 @@ EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
     /*
         swapd dc da
      */
-    BuildMI(*BB, MI, dl, TII.get(DPU::SWAPDrr), Dest).addReg(Op1Reg);
+    BuildMI(*MBB, MI, DL, TII.get(DPU::SWAPDrr), Dest).addReg(Op1Reg);
   }
 
   MI.eraseFromParent(); // The pseudo instruction is gone now.
-  return BB;
+  return MBB;
 }
 
-static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI,
-                                                      MachineBasicBlock *BB) {
+static MachineBasicBlock *emitClz64WithCustomInserter(MachineInstr &MI,
+                                                      MachineBasicBlock *MBB) {
   /*
       What we want to generate (with dc != da in that example):
-      clz.u dc, da.h ?nmax @+3
+      clz.u dc, da.h ?nmax @end
       clz dc.l da.l
       add dc.l dc.l 32
+    end:
    */
-  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
-  DebugLoc dl = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = ++BB->getIterator();
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *msbAreZerosMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, msbAreZerosMBB);
-  F->insert(I, endMBB);
+  /*
+    A fused arithmetic+comparison+branch instruction is difficult to manage
+    here, so `clz.u dc, da.h ?nmax @end` is split now and fused back later.
+   */
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  const BasicBlock *BB = MBB->getBasicBlock();
+
+  MachineBasicBlock *msbAreZerosMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *endMBB = MF->CreateMachineBasicBlock(BB);
+
+  MachineFunction::iterator I = ++MBB->getIterator();
+  MF->insert(I, msbAreZerosMBB);
+  MF->insert(I, endMBB);
+
+  // Move every instruction that follows MI into endMBB.
+  endMBB->splice(endMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+
   // Update machine-CFG edges by transferring all successors of the current
   // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), BB,
-                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(BB);
+  endMBB->transferSuccessorsAndUpdatePHIs(MBB);
 
-  BB->addSuccessor(msbAreZerosMBB);
-  BB->addSuccessor(endMBB);
+  MBB->addSuccessor(msbAreZerosMBB);
+  MBB->addSuccessor(endMBB);
   msbAreZerosMBB->addSuccessor(endMBB);
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int Op1Reg = MI.getOperand(1).getReg();
+  Register Dest = MI.getOperand(0).getReg();
+  Register Op1Reg = MI.getOperand(1).getReg();
 
-  MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned FastResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SlowResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SlowResultPart1Reg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SlowResultPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-
-  BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  Register FastResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register SlowResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register SlowResultPart1Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  Register SlowResultPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  Register LsbClzReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+
+  MDNode *MDN = getPostRAFusionMetadata(MF);
+
+  MachineInstrBuilder cpt1 = BuildMI(MBB, DL, TII.get(DPU::CLZ_Urr), FastResultReg)
       .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
-      .addImm(DPUAsmCondition::Condition::NotMaximum)
-      .addMBB(endMBB);
+      .addMetadata(MDN);
+
+  MachineInstrBuilder cpt2 = BuildMI(MBB, DL, TII.get(DPU::JNEQrii))
+      .addReg(FastResultReg, 0, DPU::sub_32bit)
+      .addImm(32)
+      .addMBB(endMBB)
+      .addMetadata(MDN);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::CLZrr), LsbClzReg)
+  MachineInstrBuilder cpt3 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::CLZrr), LsbClzReg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), SlowResultPartReg)
+  MachineInstrBuilder cpt4 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::ADDrri), SlowResultPartReg)
       .addReg(LsbClzReg)
       .addImm(32);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  MachineInstrBuilder cpt5 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg)
+  MachineInstrBuilder cpt6 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg)
       .addReg(UndefReg)
       .addReg(SlowResultPartReg)
       .addImm(DPU::sub_32bit);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg)
+  MachineInstrBuilder cpt7 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::INSERT_SUBREG), SlowResultReg)
       .addReg(SlowResultPart1Reg)
       .addReg(FastResultReg, 0, DPU::sub_32bit_hi)
       .addImm(DPU::sub_32bit_hi);
 
-  BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
+  MachineInstrBuilder cpt8 = BuildMI(*endMBB, endMBB->begin(), DL, TII.get(DPU::PHI), Dest)
       .addReg(FastResultReg)
-      .addMBB(BB)
+      .addMBB(MBB)
       .addReg(SlowResultReg)
       .addMBB(msbAreZerosMBB);
 
+  if (MI.getOperand(1).isKill()) {
+    // cpt1->getOperand(1).setIsKill();
+    cpt3->getOperand(1).setIsKill();
+    cpt4->getOperand(1).setIsKill();
+
+    cpt6->getOperand(1).setIsKill();
+    cpt6->getOperand(2).setIsKill();
+    cpt7->getOperand(1).setIsKill();
+  }
+
   MI.eraseFromParent(); // The pseudo instruction is gone now.
   return endMBB;
 }
@@ -3119,34 +3217,34 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case DPU::MRAM_LOAD_DOUBLEmr:
     return EmitMramLoadDoubleWithCustomInserter(MI, BB);
   case DPU::LSL64rr:
-    return EmitLsl64RegisterWithCustomInserter(MI, BB);
+    return emitLsl64RegisterWithCustomInserter(MI, BB);
   case DPU::LSL64ri:
     return EmitLsl64ImmediateWithCustomInserter(MI, BB);
   case DPU::LSR64rr:
-    return EmitShiftRight64RegisterWithCustomInserter(MI, BB, DPU::LSRrrr,
+    return emitShiftRight64RegisterWithCustomInserter(MI, BB, DPU::LSRrrr,
                                                       DPU::LSR_Urrr);
   case DPU::LSR64ri:
     return EmitShiftRight64ImmediateWithCustomInserter(
         MI, BB, DPU::LSRrri, DPU::LSR_Urri, DPU::MOVE_Urr);
   case DPU::ASR64rr:
-    return EmitShiftRight64RegisterWithCustomInserter(MI, BB, DPU::ASRrrr,
+    return emitShiftRight64RegisterWithCustomInserter(MI, BB, DPU::ASRrrr,
                                                       DPU::ASR_Srrr);
   case DPU::ASR64ri:
     return EmitShiftRight64ImmediateWithCustomInserter(
         MI, BB, DPU::ASRrri, DPU::ASR_Srri, DPU::MOVE_Srr);
   case DPU::ROL64rr:
-    return EmitRot64RegisterWithCustomInserter(MI, BB, DPU::LSLrrr,
+    return emitRot64RegisterWithCustomInserter(MI, BB, DPU::LSLrrr,
                                                DPU::LSLrrrci, DPU::LSLXrrr);
   case DPU::ROR64rr:
-    return EmitRot64RegisterWithCustomInserter(MI, BB, DPU::LSRrrr,
+    return emitRot64RegisterWithCustomInserter(MI, BB, DPU::LSRrrr,
                                                DPU::LSRrrrci, DPU::LSRXrrr);
   case DPU::ROL64ri:
-    return EmitRot64ImmediateWithCustomInserter(MI, BB, DPU::LSLXrri,
+    return emitRot64ImmediateWithCustomInserter(MI, BB, DPU::LSLXrri,
                                                 DPU::LSL_ADDrrri);
   case DPU::ROR64ri:
-    return EmitRot64ImmediateWithCustomInserter(MI, BB, DPU::LSRXrri,
+    return emitRot64ImmediateWithCustomInserter(MI, BB, DPU::LSRXrri,
                                                 DPU::LSR_ADDrrri);
   case DPU::CLZ64r:
-    return EmitClz64WithCustomInserter(MI, BB);
+    return emitClz64WithCustomInserter(MI, BB);
   }
 }
diff --git a/llvm/lib/Target/DPU/DPUTargetMachine.cpp b/llvm/lib/Target/DPU/DPUTargetMachine.cpp
index 5815b161c6ce9..e42cab004d1d3 100644
--- a/llvm/lib/Target/DPU/DPUTargetMachine.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetMachine.cpp
@@ -7,12 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "DPUTargetMachine.h"
 #include "DPU.h"
 #include "DPUISelDAGToDAG.h"
 #include "DPUMacroFusion.h"
+#include "DPUTargetMachine.h"
 #include "DPUTargetTransformInfo.h"
 #include "MCTargetDesc/DPUMCAsmInfo.h"
+
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -84,6 +85,8 @@ class DPUPassConfig : public TargetPassConfig {
 
   bool addInstSelector() override;
 
+  void addPostRegAlloc() override;
+
   void addPreEmitPass() override;
   void addPreEmitPass2() override;
 };
@@ -103,6 +106,15 @@ bool DPUPassConfig::addInstSelector() {
   return false;
 }
 
+void DPUPassConfig::addPostRegAlloc() {
+  // TODO: add CFGOptimizer
+  // if (addPass(&TailDuplicateID))
+  //   printAndVerify("After Post-RegAlloc TailDuplicate");
+
+  DPUTargetMachine &TM = getDPUTargetMachine();
+  addPass(createDPUPostRAFusionPass(TM));
+}
+
 void DPUPassConfig::addPreEmitPass() {
   DPUTargetMachine &TM = getDPUTargetMachine();
   addPass(createDPUMergeComboInstrPass(TM));

From d2d6314263211b8b1f3b20fd5926311daece205f Mon Sep 17 00:00:00 2001
From: Willy Wolff <wwolff@upmem.com>
Date: Fri, 23 Aug 2024 09:45:36 +0200
Subject: [PATCH 7/8] dpu: compiler-rt: add DPU support

---
 compiler-rt/dpu/CMakeLists.txt                | 306 ++++++++++++++++++
 compiler-rt/dpu/Toolchain.cmake               |  12 +
 compiler-rt/dpu/compiler_rt_tests.sh          | 269 +++++++++++++++
 compiler-rt/dpu/lldb_python.py                |  42 +++
 compiler-rt/lib/builtins/dpu/div32.c          |  97 ++++++
 compiler-rt/lib/builtins/dpu/divdi3.c         |  31 ++
 compiler-rt/lib/builtins/dpu/divsi3.c         |  23 ++
 compiler-rt/lib/builtins/dpu/moddi3.c         |  31 ++
 compiler-rt/lib/builtins/dpu/modsi3.c         |  34 ++
 compiler-rt/lib/builtins/dpu/mul32.S          |  48 +++
 compiler-rt/lib/builtins/dpu/mul32.c          |  59 ++++
 compiler-rt/lib/builtins/dpu/muldi3.c         | 171 ++++++++++
 compiler-rt/lib/builtins/dpu/mulsi3.c         |   8 +
 compiler-rt/lib/builtins/dpu/udiv32.S         |  49 +++
 compiler-rt/lib/builtins/dpu/udiv32.c         |  63 ++++
 compiler-rt/lib/builtins/dpu/udiv64.c         |  59 ++++
 compiler-rt/lib/builtins/dpu/udivdi3.c        |  19 ++
 compiler-rt/lib/builtins/dpu/udivmodsi4.c     |  29 ++
 compiler-rt/lib/builtins/dpu/udivsi3.c        |  15 +
 compiler-rt/lib/builtins/dpu/umoddi3.c        |  19 ++
 compiler-rt/lib/builtins/dpu/umodsi3.c        |  27 ++
 .../test/builtins/Unit/comparedf2_test.c      |   2 +-
 .../test/builtins/Unit/comparesf2_test.c      |   2 +-
 23 files changed, 1413 insertions(+), 2 deletions(-)
 create mode 100644 compiler-rt/dpu/CMakeLists.txt
 create mode 100644 compiler-rt/dpu/Toolchain.cmake
 create mode 100644 compiler-rt/dpu/compiler_rt_tests.sh
 create mode 100644 compiler-rt/dpu/lldb_python.py
 create mode 100644 compiler-rt/lib/builtins/dpu/div32.c
 create mode 100644 compiler-rt/lib/builtins/dpu/divdi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/divsi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/moddi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/modsi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/mul32.S
 create mode 100644 compiler-rt/lib/builtins/dpu/mul32.c
 create mode 100644 compiler-rt/lib/builtins/dpu/muldi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/mulsi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udiv32.S
 create mode 100644 compiler-rt/lib/builtins/dpu/udiv32.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udiv64.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udivdi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udivmodsi4.c
 create mode 100644 compiler-rt/lib/builtins/dpu/udivsi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/umoddi3.c
 create mode 100644 compiler-rt/lib/builtins/dpu/umodsi3.c

diff --git a/compiler-rt/dpu/CMakeLists.txt b/compiler-rt/dpu/CMakeLists.txt
new file mode 100644
index 0000000000000..3a81885225652
--- /dev/null
+++ b/compiler-rt/dpu/CMakeLists.txt
@@ -0,0 +1,306 @@
+cmake_minimum_required(VERSION 3.13)
+
+project(librt C ASM)
+
+set(CMAKE_AR llvm-ar)
+set(CMAKE_LINKER llvm-ld)  # NOTE(review): current LLVM releases do not appear to ship an `llvm-ld` tool (the LLVM linker is `ld.lld`) -- confirm this is intended
+set(CMAKE_NM llvm-nm)
+set(CMAKE_OBJDUMP llvm-objdump)
+set(CMAKE_RANLIB llvm-ranlib)
+set(OBJCOPY llvm-objcopy)
+set(CLANGFORMAT clang-format)
+# The builtins sources are taken from the regular compiler-rt tree, relative to this directory.
+set(COMPILER_RT_BUILTINS_DIR ../lib/builtins)
+
+set(GENERIC_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/mul32.S
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/mulsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/muldi3.c
+  # DPU-specific 32-bit division / modulo
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.S
+  # ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.c optimized above
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/div32.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/divsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/modsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/umodsi3.c
+  # DPU-specific 64-bit division / modulo
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv64.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/divdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/moddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/umoddi3.c
+  # Generic builtins. NOTE(review): divdi3.c, divsi3.c, moddi3.c, modsi3.c, muldi3.c, udivdi3.c, udivmodsi4.c, udivsi3.c, umoddi3.c and umodsi3.c are listed here AND under dpu/ above -- confirm both copies are meant to be archived.
+  ${COMPILER_RT_BUILTINS_DIR}/absvdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/absvsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/adddf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashldi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashrdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/bswapdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/bswapsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/cmpdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparedf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparesf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/divdf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmoddi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendsfdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendhfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffsdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffssi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatdidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatdisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatundidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatundisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/fp_mode.c
+  ${COMPILER_RT_BUILTINS_DIR}/int_util.c
+  ${COMPILER_RT_BUILTINS_DIR}/lshrdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/moddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/modsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulodi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulosi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/negdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/paritydi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/paritysi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powidf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powisf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/subdf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncdfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncdfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncsfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ucmpdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmoddi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umoddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umodsi3.c
+  )
+
+set(GENERIC_TF_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/addtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/absvti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashrti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparetf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/cmpti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/divtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmodti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/extenddftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendhftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendsftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffsti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatditf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunditf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/lshrti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/modti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muloti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/multf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/multi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/negti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powitf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/subtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ucmpti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmodti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umodti3.c
+  )
+
+set(GENERIC_COMPLEX_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/divdc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsc3.c
+  )
+
+set(GENERIC_COMPLEX_TF_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/divdc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divtc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/multc3.c
+  )
+
+set(SOURCES ${GENERIC_SOURCES}
+  # ${GENERIC_TF_SOURCES}
+  # ${GENERIC_COMPLEX_SOURCES}
+  # ${GENERIC_COMPLEX_TF_SOURCES}
+  )
+
+# add_dpu_library(TARGET <name> [OPT_LEVEL <-Ox>] [LTO <-flto[=thin]>] [PROFILING] SOURCES <files>...)
+# Defines and installs one static archive named <name>_<opt>_<lto>[_pg], built with those flags.
+function(add_dpu_library)
+  set(options PROFILING)
+  set(oneValueArgs TARGET OPT_LEVEL LTO)
+  set(multiValueArgs SOURCES)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  message("ARGN: ${ARGN}")
+  message(${options})
+  message(${oneValueArgs})
+  message(${multiValueArgs})
+
+  message("TARGET: ${arg_TARGET}")
+  message("OPT_LEVEL: ${arg_OPT_LEVEL}")
+  message("PROFILING: ${arg_PROFILING}")
+  message("LTO: ${arg_LTO}")
+
+  set(LOCAL_TARGET ${arg_TARGET})
+
+  set(OTHER_FLAGS -Wall -Wextra)
+
+  # Each option contributes a compile flag and a name suffix with the flag punctuation stripped.
+  if (arg_OPT_LEVEL)
+    list(APPEND OTHER_FLAGS ${arg_OPT_LEVEL})
+    string(REPLACE "-" "" arg_OPT_LEVEL ${arg_OPT_LEVEL})
+    string(APPEND LOCAL_TARGET "_${arg_OPT_LEVEL}")
+  endif()
+  if (arg_LTO)
+    list(APPEND OTHER_FLAGS ${arg_LTO})
+    string(REPLACE "-f" "" arg_LTO ${arg_LTO})
+    string(REPLACE "=" "" arg_LTO ${arg_LTO})
+    string(APPEND LOCAL_TARGET "_${arg_LTO}")
+  else()
+    # Non-LTO archives keep an empty LTO field in their name, e.g. rt_O2_.
+    string(APPEND LOCAL_TARGET "_")
+  endif()
+  if (arg_PROFILING)
+    list(APPEND OTHER_FLAGS -pg)
+    string(APPEND LOCAL_TARGET "_pg")
+  endif()
+
+  list(APPEND OTHER_FLAGS -g0)
+  list(APPEND OTHER_FLAGS -mllvm -verify-machineinstrs)
+  # list(APPEND OTHER_FLAGS -mllvm -debug) --> the second -mllvm would be de-duplicated
+  # by target_compile_options(); it would have to be spelled "SHELL:-mllvm -debug".
+  message("LOCAL_TARGET: ${LOCAL_TARGET}")
+  message("OTHER_FLAGS: ${OTHER_FLAGS}")
+
+  add_library(${LOCAL_TARGET} STATIC "${arg_SOURCES}")
+
+  target_include_directories(${LOCAL_TARGET} PRIVATE
+    ${COMPILER_RT_BUILTINS_DIR}
+    ${COMPILER_RT_BUILTINS_DIR}/dpu)
+
+  target_compile_options(${LOCAL_TARGET}
+    PRIVATE ${NOSTDLIB_FLAGS} ${STRICT_FLAGS} ${COMPILER_TIMESTAMP_DEF} ${OTHER_FLAGS})
+
+  # set_target_properties(${LOCAL_TARGET} PROPERTIES OUTPUT_NAME "rt")
+
+  if (arg_LTO)
+    install(
+      TARGETS ${LOCAL_TARGET}
+      ARCHIVE
+      DESTINATION ${arg_OPT_LEVEL}/${arg_LTO}
+      )
+  else()
+    install(
+      TARGETS ${LOCAL_TARGET}
+      ARCHIVE
+      DESTINATION ${arg_OPT_LEVEL}/no_lto
+      )
+  endif()
+endfunction()
+
+# add_dpu_library(
+#     TARGET rt
+#     OPT_LEVEL -O3
+#     # LTO -flto
+#     # PROFILING
+#     SOURCES ${SOURCES}
+#     )
+
+# Build matrix: every optimisation level is built once without LTO and once
+# for each LTO mode.
+foreach(opt_flag IN ITEMS -O0 -O1 -O2 -O3 -Os)
+  # Plain (non-LTO) archive for this level.
+  add_dpu_library(TARGET rt OPT_LEVEL ${opt_flag} SOURCES ${SOURCES})
+
+  # Profiling variant, currently disabled:
+  # add_dpu_library(
+  #   TARGET rt
+  #   OPT_LEVEL ${opt_flag}
+  #   PROFILING
+  #   SOURCES ${SOURCES}
+  #   )
+
+  foreach(lto_flag IN ITEMS -flto -flto=thin)
+    # One archive per LTO mode for this level.
+    add_dpu_library(TARGET rt OPT_LEVEL ${opt_flag} LTO ${lto_flag} SOURCES ${SOURCES})
+
+    # Profiling variant, currently disabled:
+    # add_dpu_library(
+    #   TARGET rt
+    #   OPT_LEVEL ${opt_flag}
+    #   LTO ${lto_flag}
+    #   PROFILING
+    #   SOURCES ${SOURCES}
+    #   )
+  endforeach()
+endforeach()
diff --git a/compiler-rt/dpu/Toolchain.cmake b/compiler-rt/dpu/Toolchain.cmake
new file mode 100644
index 0000000000000..ae09a95e9b705
--- /dev/null
+++ b/compiler-rt/dpu/Toolchain.cmake
@@ -0,0 +1,12 @@
+include(CMakeForceCompiler)  # NOTE(review): this module is deprecated and no CMAKE_FORCE_* macro is called below -- confirm it is still needed
+# Toolchain settings: build C, C++ and assembly with dpu-clang for a bare "Generic" system.
+# set(CMAKE_ASM_SOURCE_FILE_EXTENSIONS s;S;asm)
+
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_CROSSCOMPILING 1)
+set(CMAKE_ASM_COMPILER dpu-clang)
+set(CMAKE_C_COMPILER dpu-clang)
+set(CMAKE_CXX_COMPILER dpu-clang)
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)  # configure-time checks build a static library instead of linking an executable
+set(CMAKE_C_COMPILER_WORKS 1)    # skip CMake's "does the C compiler work" probe
+set(CMAKE_CXX_COMPILER_WORKS 1)  # same for the C++ compiler
diff --git a/compiler-rt/dpu/compiler_rt_tests.sh b/compiler-rt/dpu/compiler_rt_tests.sh
new file mode 100644
index 0000000000000..83912da93aec8
--- /dev/null
+++ b/compiler-rt/dpu/compiler_rt_tests.sh
@@ -0,0 +1,269 @@
+#!/bin/bash
+
+set -eux
+
+# Locations; LLVM_SOURCE and COMPILER_RT_BUILD can be overridden from the environment.
+LLVM_SOURCE=${LLVM_SOURCE:-"${HOME}/work/dpu_tools_llvm_cleanup_20240710_2/llvm-project"}
+COMPILER_RT=${LLVM_SOURCE}/compiler-rt/lib/builtins
+COMPILER_RT_TESTS=${LLVM_SOURCE}/compiler-rt/test/builtins/Unit
+COMPILER_RT_BUILD=${COMPILER_RT_BUILD:-$(pwd)}
+
+# not supported
+# declare -a TESTS_=(
+    # absvti2_test.c
+    # adddf3vfp_test.c
+    # addsf3vfp_test.c
+    # addtf3_test.c
+    # addvti3_test.c
+    # ashlti3_test.c
+    # ashrti3_test.c
+    # clzti2_test.c
+    # cmpti2_test.c
+    # compiler_rt_logb_test.c
+    # compiler_rt_logbf_test.c
+    # compiler_rt_logbl_test.c
+    # ctzti2_test.c
+    # divdc3_test.c
+    # divdf3vfp_test.c
+    # divmodti4_test.c
+    # divsf3vfp_test.c
+    # divsc3_test.c
+    # divtc3_test.c
+    # divtf3_test.c
+    # divti3_test.c
+    # divxc3_test.c
+    # eqdf2vfp_test.c
+    # eqsf2vfp_test.c
+    # eqtf2_test.c
+    # extenddftf2_test.c
+    # extendhftf2_test.c
+    # extendsfdf2vfp_test.c
+    # extendsftf2_test.c
+    # ffsti2_test.c
+    # fixdfsivfp_test.c
+    # fixdfti_test.c
+    # fixsfsivfp_test.c
+    # fixsfti_test.c
+    # fixtfti_test.c
+    # fixunsdfsivfp_test.c
+    # fixunsdfti_test.c
+    # fixunssfsivfp_test.c
+    # fixunssfti_test.c
+    # floatditf_test.c
+    # floatsidfvfp_test.c
+    # floatsisfvfp_test.c
+    # floatunditf_test.c
+    # floatunssidfvfp_test.c
+    # floatunssisfvfp_test.c
+    # muldc3_test.c
+    # ltdf2vfp_test.c
+    # ltsf2vfp_test.c
+    # gedf2vfp_test.c
+    # gesf2vfp_test.c
+    # gtdf2vfp_test.c
+    # gtsf2vfp_test.c
+    # ledf2vfp_test.c
+    # lesf2vfp_test.c
+    # muldf3vfp_test.c
+    # mulsf3vfp_test.c
+    # nedf2vfp_test.c
+    # negdf2vfp_test.c
+    # negsf2vfp_test.c
+    # nesf2vfp_test.c
+    # subdf3vfp_test.c
+    # subsf3vfp_test.c
+    # truncdfsf2vfp_test.c
+    # unorddf2vfp_test.c
+    # unordsf2vfp_test.c
+    # mulsc3_test.c
+    # mulxc3_test.c
+    # powixf2_test.c
+    # subvti3_test.c
+    # ucmpti2_test.c
+    # udivmodti4_test.c
+    # udivti3_test.c
+    # umodti3_test.c
+    # subtf3_test.c
+    # powitf2_test.c
+    # negvti2_test.c
+    # modti3_test.c
+    # muloti4_test.c
+    # multc3_test.c
+    # multi3_test.c
+    # mulvti3_test.c
+    # negti2_test.c
+    # netf2_test.c
+    # parityti2_test.c
+    # popcountti2_test.c
+    # fixtfdi_test.c
+    # fixtfsi_test.c
+    # fixunstfdi_test.c
+    # fixunstfsi_test.c
+    # fixunstfti_test.c
+    # fixunsxfdi_test.c
+    # fixunsxfsi_test.c
+    # fixunsxfti_test.c
+    # fixxfti_test.c
+    # floatdixf_test.c
+    # floatsitf_test.c
+    # floattidf_test.c
+    # floattisf_test.c
+    # floattitf_test.c
+    # floattixf_test.c
+    # floatundixf_test.c
+    # floatunsitf_test.c
+    # floatuntidf_test.c
+    # floatuntisf_test.c
+    # floatuntitf_test.c
+    # floatuntixf_test.c
+    # getf2_test.c
+    # gttf2_test.c
+    # letf2_test.c
+    # lshrti3_test.c
+    # lttf2_test.c
+    # multf3_test.c
+    # unordtf2_test.c
+    # trunctfdf2_test.c
+    # trunctfhf2_test.c
+    # trunctfsf2_test.c
+    # fixxfdi_test.c
+    # udivmoddi4_test.c # too big :)
+# )
+
+declare -a TESTS=(
+    # test.c
+    absvdi2_test.c
+    absvsi2_test.c
+    addvdi3_test.c
+    addvsi3_test.c
+    ashldi3_test.c
+    ashrdi3_test.c
+    bswapdi2_test.c
+    bswapsi2_test.c
+    clzdi2_test.c
+    clzsi2_test.c
+    cmpdi2_test.c
+    comparedf2_test.c
+    comparesf2_test.c
+    ctzdi2_test.c
+    ctzsi2_test.c
+    divdf3_test.c
+    divdi3_test.c
+    divmodsi4_test.c
+    divsf3_test.c
+    divsi3_test.c
+    extendhfsf2_test.c
+    ffsdi2_test.c
+    ffssi2_test.c
+    fixdfdi_test.c
+    fixsfdi_test.c
+    fixunsdfdi_test.c
+    fixunsdfsi_test.c
+    fixunssfdi_test.c
+    fixunssfsi_test.c
+    floatdidf_test.c
+    floatdisf_test.c
+    floatundidf_test.c
+    floatundisf_test.c
+    lshrdi3_test.c
+    moddi3_test.c
+    modsi3_test.c
+    muldi3_test.c
+    mulodi4_test.c
+    mulosi4_test.c
+    mulsi3_test.c
+    mulvdi3_test.c
+    mulvsi3_test.c
+    negdi2_test.c
+    negvdi2_test.c
+    negvsi2_test.c
+    paritydi2_test.c
+    paritysi2_test.c
+    popcountdi2_test.c
+    popcountsi2_test.c
+    powidf2_test.c
+    powisf2_test.c
+    subvdi3_test.c
+    subvsi3_test.c
+    truncdfhf2_test.c
+    truncdfsf2_test.c
+    truncsfhf2_test.c
+    ucmpdi2_test.c
+    udivdi3_test.c
+    udivmodsi4_test.c
+    udivsi3_test.c
+    umoddi3_test.c
+    umodsi3_test.c
+)
+
+declare -a OPT_LEVELS=(
+    O0
+    O1
+    O2
+    O3
+    Os
+)
+# OPT_LEVELS entries are passed to clang as -<level>; COMPILER_OPTIONS names the LTO variants to test.
+declare -a COMPILER_OPTIONS=(
+    no_lto
+    lto
+    ltothin
+)
+
+MYPWD=`pwd`
+
+mkdir -p test
+cd test
+
+for COMPILER_OPTION in "${COMPILER_OPTIONS[@]}"
+do
+    mkdir -p ${COMPILER_OPTION}
+    cd ${COMPILER_OPTION}
+
+    case "$COMPILER_OPTION" in
+	"no_lto") COMPILER_OPTION_="";;
+	"lto") COMPILER_OPTION_="-flto";;
+	"ltothin") COMPILER_OPTION_="-flto=thin";;
+    esac
+
+    case "$COMPILER_OPTION" in
+	"no_lto") COMPILER_OPTION_LIB="";;
+	"lto") COMPILER_OPTION_LIB="lto";;
+	"ltothin") COMPILER_OPTION_LIB="ltothin";;
+    esac
+
+    for OPT_LEVEL in "${OPT_LEVELS[@]}"
+    do
+	mkdir -p ${OPT_LEVEL}
+	cd ${OPT_LEVEL}
+
+	for TEST in "${TESTS[@]}"
+	do
+	    clang --target=dpu-upmem-dpurte -mcpu=v1A \
+		  -I${COMPILER_RT} \
+		  -g0 \
+		  -v \
+		  -save-temps \
+		  -I ${MYPWD} \
+		  ${COMPILER_OPTION_} \
+		  -${OPT_LEVEL} \
+		  ${COMPILER_RT_TESTS}/${TEST} \
+		  -o $(basename "${TEST}" .c) \
+		  -L ${COMPILER_RT_BUILD} -lrt_${OPT_LEVEL}_${COMPILER_OPTION_LIB} \
+		  -mllvm -debug -mllvm -print-after-all -mllvm -verify-machineinstrs \
+		&> `basename ${TEST}`_compiler_log.txt
+
+	    # dpu-lldb --batch --one-line run -- $(basename "${TEST}" .c)
+	    python3 ${LLVM_SOURCE}/compiler-rt/dpu/lldb_python.py $(basename "${TEST}" .c)
+	done
+	cd ..
+    done
+
+    cd ..
+done
+cd ..
+
+		  # -L  ~/scratch/dpu_tools/share/upmem/include/built-in/v1A -lrt_v1A \
+		  # -save-temps \
+		  # -mllvm -debug -mllvm -print-after-all -mllvm -verify-machineinstrs \
+		  # --thinlto-jobs=1
diff --git a/compiler-rt/dpu/lldb_python.py b/compiler-rt/dpu/lldb_python.py
new file mode 100644
index 0000000000000..e333723af601e
--- /dev/null
+++ b/compiler-rt/dpu/lldb_python.py
@@ -0,0 +1,42 @@
+import sys
+import os
+import subprocess
+import dpu
+import lldb
+import tempfile
+
+binary = sys.argv[1]
+
+debugger = lldb.SBDebugger().Create()
+debugger.SetAsync(False)
+
+target = debugger.CreateTarget(binary)
+assert target.IsValid()
+
+launch_info = lldb.SBLaunchInfo(None)
+launch_info.SetWorkingDirectory(os.getcwd())
+
+with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+    stdout_path = tmp_file.name
+
+launch_info.AddOpenFileAction(1, stdout_path, False, True)
+
+# process = target.Launch(debugger.GetListener(), None, None, ".",
+#                         "stdout.txt", "stderr.txt", None, 0, False, error)
+process = target.Launch(launch_info, lldb.SBError())
+# process = target.LaunchSimple(None, None, ".")
+
+# print(process)
+
+assert process.IsValid()
+
+with open(stdout_path, 'r') as file:
+    stdout_data = file.read()
+
+os.remove(stdout_path)
+
+print(stdout_data)
+
+# Cleanup LLDB
+# lldb.SBDebugger.Terminate()
+sys.exit(process.exit_state)
diff --git a/compiler-rt/lib/builtins/dpu/div32.c b/compiler-rt/lib/builtins/dpu/div32.c
new file mode 100644
index 0000000000000..df25bbbdaf9d4
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/div32.c
@@ -0,0 +1,97 @@
+/* Copyright 2024 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+/*
+ * Signed 32-bit division with remainder.
+ *
+ * dividend, divider: the signed operands.
+ * p_q:   receives the quotient, negated when the operand signs differ.
+ * p_rem: receives the remainder, negated when the dividend is negative.
+ *
+ * The division itself is done on magnitudes by __udiv32, whose 64-bit result
+ * carries the quotient in its upper 32 bits and the remainder in its lower
+ * 32 bits. Magnitudes are formed with unsigned arithmetic, `0 - (uint32_t)x`,
+ * so that INT32_MIN is negated without signed overflow (undefined behaviour).
+ * No check for a zero divider is made here: it is forwarded to __udiv32 as is.
+ */
+void
+__div32(int32_t dividend, int32_t divider
+	, int32_t *p_q, int32_t *p_rem
+	)
+{
+    uint64_t res;
+    uint32_t q;
+    uint32_t rem;
+
+    /* Magnitudes of the operands that turned out to be negative. */
+    uint32_t abs_dividend;
+    uint32_t abs_divider;
+
+    /*
+     * Dispatch on the operand signs; the label names give the case each branch
+     * handles. NOTE(review): `clo zero, x, z, <label>` is presumably "count
+     * leading ones, branch if the count is zero", i.e. branch when x is not
+     * negative -- confirm against the DPU ISA. Falling through both tests means
+     * that both operands are negative.
+     */
+    __asm__ goto("clo zero, %[dividend], z, %l[__div32_pos_dividend]\n\t"
+                 "clo zero, %[divider], z, %l[__div32_neg_dividend_pos_divider]\n\t"
+                 :
+                 : [dividend] "r"(dividend), [divider] "r"(divider)
+                 :
+                 : __div32_pos_dividend, __div32_neg_dividend_pos_divider);
+
+    /* The quotient's sign depends on the signs of the dividend and divider. Branching */
+    /* per case turned out to be the quickest way to select the right negations. */
+
+    /* Negative dividend, negative divider: positive quotient, negative remainder. */
+    abs_dividend = 0 - (uint32_t)dividend;
+    abs_divider = 0 - (uint32_t)divider;
+    res = __udiv32(abs_dividend, abs_divider);
+    q = (uint32_t)(res >> 32);
+    rem = (uint32_t)res;
+    rem = 0 - rem;
+    goto recombine;
+
+__div32_neg_dividend_pos_divider:
+    /* Negative dividend, positive divider: negative quotient, negative remainder. */
+    abs_dividend = 0 - (uint32_t)dividend;
+    res = __udiv32(abs_dividend, divider);
+    q = (uint32_t)(res >> 32);
+    q = 0 - q;
+    rem = (uint32_t)res;
+    rem = 0 - rem;
+    goto recombine;
+
+__div32_pos_dividend:
+    __asm__ goto("clo zero, %[divider], z, %l[__div32_pos_dividend_pos_divider]"
+                 :
+                 : [divider] "r"(divider)
+                 :
+                 : __div32_pos_dividend_pos_divider);
+    /* Positive dividend, negative divider: negative quotient, positive remainder. */
+    abs_divider = 0 - (uint32_t)divider;
+    res = __udiv32(dividend, abs_divider);
+    q = (uint32_t)(res >> 32);
+    q = 0 - q;
+    rem = (uint32_t)res;
+    goto recombine;
+
+__div32_pos_dividend_pos_divider:
+    /* Neither operand is negative: the unsigned result needs no sign fix-up. */
+    res = __udiv32(dividend, divider);
+    q = (uint32_t) (res >> 32);
+    rem = (uint32_t) res;
+
+ recombine:
+    /* Common exit: hand both results back through the out parameters. */
+    *p_q = q;
+    *p_rem = rem;
+    return;
+}
diff --git a/compiler-rt/lib/builtins/dpu/divdi3.c b/compiler-rt/lib/builtins/dpu/divdi3.c
new file mode 100644
index 0000000000000..178cbf35fd2ee
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/divdi3.c
@@ -0,0 +1,31 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 signed division.
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+
+extern uint64_t __udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+int64_t
+__divdi3(int64_t dividend, int64_t divider)
+{
+    /*
+     * Divide the magnitudes and fix the sign afterwards. The negations are done
+     * on uint64_t so that INT64_MIN is handled without signed overflow, which is
+     * undefined behaviour.
+     */
+    const uint64_t abs_dividend = dividend < 0 ? 0 - (uint64_t)dividend : (uint64_t)dividend;
+    const uint64_t abs_divider = divider < 0 ? 0 - (uint64_t)divider : (uint64_t)divider;
+    const uint64_t quotient = __udiv64(abs_dividend, abs_divider, 0);
+
+    /* The quotient is negated exactly when the operands have opposite signs. */
+    if ((dividend < 0) != (divider < 0))
+        return (int64_t)(0 - quotient);
+    return (int64_t)quotient;
+}
diff --git a/compiler-rt/lib/builtins/dpu/divsi3.c b/compiler-rt/lib/builtins/dpu/divsi3.c
new file mode 100644
index 0000000000000..8ec97468aaf83
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/divsi3.c
@@ -0,0 +1,23 @@
+/* Copyright 2024 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+
+/* extern int64_t __div32(int32_t dividend, int32_t divider); */
+extern void __div32(int32_t dividend, int32_t divider, int32_t *q, int32_t *rem);
+
+#include "int_lib.h"
+
+COMPILER_RT_ABI si_int
+__divsi3(si_int a, si_int b)
+{
+  /* __div32 reports the quotient and the remainder through its two out
+   * parameters; only the quotient is returned to the caller. */
+  int32_t quotient;
+  int32_t remainder;
+
+  __div32(a, b, &quotient, &remainder);
+  return quotient;
+}
diff --git a/compiler-rt/lib/builtins/dpu/moddi3.c b/compiler-rt/lib/builtins/dpu/moddi3.c
new file mode 100644
index 0000000000000..dad11e699f87c
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/moddi3.c
@@ -0,0 +1,31 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 signed modulo (remainder of the signed division).
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+int64_t
+__moddi3(int64_t dividend, int64_t divider)
+{
+    /*
+     * Take the remainder of the magnitudes and give it the sign of the dividend
+     * (the divider's sign does not affect the result). The negations are done on
+     * uint64_t so that INT64_MIN is handled without signed overflow, which is
+     * undefined behaviour.
+     */
+    const uint64_t abs_dividend = dividend < 0 ? 0 - (uint64_t)dividend : (uint64_t)dividend;
+    const uint64_t abs_divider = divider < 0 ? 0 - (uint64_t)divider : (uint64_t)divider;
+    const uint64_t remainder = __udiv64(abs_dividend, abs_divider, 1);
+
+    if (dividend < 0)
+        return (int64_t)(0 - remainder);
+    return (int64_t)remainder;
+}
diff --git a/compiler-rt/lib/builtins/dpu/modsi3.c b/compiler-rt/lib/builtins/dpu/modsi3.c
new file mode 100644
index 0000000000000..c0cc59e8c92f9
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/modsi3.c
@@ -0,0 +1,34 @@
+/* ===-- modsi3.c - Implement __modsi3 -------------------------------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __modsi3 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include <stdint.h>
+
+/* extern int64_t __div32(int32_t dividend, int32_t divider); */
+extern void __div32(int32_t dividend, int32_t divider, int32_t *q, int32_t *rem);
+
+#include "int_lib.h"
+
+/* Returns: a % b */
+
+COMPILER_RT_ABI si_int
+__modsi3(si_int a, si_int b)
+{
+  /* __div32 reports the quotient and the remainder through its two out
+   * parameters; only the remainder is returned to the caller. */
+  int32_t quotient;
+  int32_t remainder;
+
+  __div32(a, b, &quotient, &remainder);
+  return remainder;
+}
diff --git a/compiler-rt/lib/builtins/dpu/mul32.S b/compiler-rt/lib/builtins/dpu/mul32.S
new file mode 100644
index 0000000000000..fe735ab5b328f
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mul32.S
@@ -0,0 +1,48 @@
+        .text
+        .globl  __mul32
+        .type   __mul32,@function
+__mul32:                                // operands are read from r0 and r1; the result is left in r0
+        jgtu r1, r0, .Ltmp0             // NOTE(review): presumably "jump if r1 > r0, unsigned" -- the two paths put the larger operand in r2 and the smaller in r0
+        move r2, r0                     // r1 <= r0: r2 = r0 ...
+        move r0, r1, true, .Ltmp1       // ... r0 = r1, then branch (condition `true`) over the other case
+.Ltmp0:
+        move r2, r1                     // r1 > r0: r2 = r1
+        // (r0 already holds the other operand, so no move is needed here)
+.Ltmp1:
+        move r1, zero                   // r1 starts at zero; it is what gets returned as the product below
+        mul_step d0, r2, d0, 0, z, .Ltmp2   // NOTE(review): 32 unrolled steps, index 0..31; presumably each leaves for .Ltmp2 as soon as its `z` condition holds -- confirm against the DPU ISA
+        mul_step d0, r2, d0, 1, z, .Ltmp2
+        mul_step d0, r2, d0, 2, z, .Ltmp2
+        mul_step d0, r2, d0, 3, z, .Ltmp2
+        mul_step d0, r2, d0, 4, z, .Ltmp2
+        mul_step d0, r2, d0, 5, z, .Ltmp2
+        mul_step d0, r2, d0, 6, z, .Ltmp2
+        mul_step d0, r2, d0, 7, z, .Ltmp2
+        mul_step d0, r2, d0, 8, z, .Ltmp2
+        mul_step d0, r2, d0, 9, z, .Ltmp2
+        mul_step d0, r2, d0, 10, z, .Ltmp2
+        mul_step d0, r2, d0, 11, z, .Ltmp2
+        mul_step d0, r2, d0, 12, z, .Ltmp2
+        mul_step d0, r2, d0, 13, z, .Ltmp2
+        mul_step d0, r2, d0, 14, z, .Ltmp2
+        mul_step d0, r2, d0, 15, z, .Ltmp2
+        mul_step d0, r2, d0, 16, z, .Ltmp2
+        mul_step d0, r2, d0, 17, z, .Ltmp2
+        mul_step d0, r2, d0, 18, z, .Ltmp2
+        mul_step d0, r2, d0, 19, z, .Ltmp2
+        mul_step d0, r2, d0, 20, z, .Ltmp2
+        mul_step d0, r2, d0, 21, z, .Ltmp2
+        mul_step d0, r2, d0, 22, z, .Ltmp2
+        mul_step d0, r2, d0, 23, z, .Ltmp2
+        mul_step d0, r2, d0, 24, z, .Ltmp2
+        mul_step d0, r2, d0, 25, z, .Ltmp2
+        mul_step d0, r2, d0, 26, z, .Ltmp2
+        mul_step d0, r2, d0, 27, z, .Ltmp2
+        mul_step d0, r2, d0, 28, z, .Ltmp2
+        mul_step d0, r2, d0, 29, z, .Ltmp2
+        mul_step d0, r2, d0, 30, z, .Ltmp2
+        mul_step d0, r2, d0, 31, z, .Ltmp2
+.Ltmp2:
+        move r0, r1                     // hand the product (held in r1) back in r0
+        // return to the caller through r23
+        jump r23
diff --git a/compiler-rt/lib/builtins/dpu/mul32.c b/compiler-rt/lib/builtins/dpu/mul32.c
new file mode 100644
index 0000000000000..cc6be09b64847
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mul32.c
@@ -0,0 +1,59 @@
+#include <stdint.h>
+
+int32_t __mulsi3(int32_t a, int32_t b)
+{
+    int32_t dest;
+    /* temp0: scratch copy of one operand; temp1: 64-bit scratch used by the mul_step chain. */
+    int32_t temp0;
+    uint64_t temp1;
+
+    /* FIXME: this is not working yet: the %[temp1.hi] / %[temp1.lo] operand
+     * syntax used below (access to the halves of a 64-bit operand) is not yet
+     * supported, so this function cannot be compiled as written. */
+    __asm__ volatile("  jgtu %[b], %[a], 1f\n"
+                     "  move %[temp0], %[a]\n"
+                     "  move %[temp1.hi], %[b], true, 2f\n"
+                     "1:\n"
+                     "  move %[temp0], %[b]\n"
+                     "  move %[temp1.hi], %[a]\n"
+                     "2:\n"
+                     "  move r1, zero\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 0 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 1 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 2 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 3 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 4 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 5 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 6 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 7 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 8 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 9 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 10, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 11, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 12, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 13, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 14, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 15, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 16, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 17, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 18, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 19, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 20, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 21, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 22, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 23, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 24, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 25, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 26, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 27, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 28, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 29, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 30, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 31, z, 3f\n"
+                     "3:\n"
+                     "  move %[dest], %[temp1.lo]\n"
+                     : [dest] "=&r"(dest), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1)
+                     : [a]"r"(a), [b]"r"(b)
+                     : );
+    return dest;
+}
diff --git a/compiler-rt/lib/builtins/dpu/muldi3.c b/compiler-rt/lib/builtins/dpu/muldi3.c
new file mode 100644
index 0000000000000..2d5a28b1dc260
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/muldi3.c
@@ -0,0 +1,171 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 multiplication emulation.
+ *
+ * A relatively fast emulation of 64x64 multiplication using byte multipliers.
+ * Basically, the two operands X and Y are seen as byte polynomials:
+ *  - X = X0.2^0 + X1.2^8 + X2.2^16 + X3.2^24 + X4.2^32 + X5.2^40 + X6.2^48 + X7.2^56
+ *  - Y = Y0.2^0 + Y1.2^8 + Y2.2^16 + Y3.2^24 + Y4.2^32 + Y5.2^40 + Y6.2^48 + Y7.2^56
+ *
+ * The product Z is expressed as a similar polynomial. Since the result is 64 bits,
+ * the function drops any coefficient for a power greater than 56, hence the following
+ * formula:
+ *  Z = (X0.Y0).2^0
+ *      + (X0.Y1 + X1.Y0).2^8
+ *      + (X0.Y2 + X2.Y0 + X1.Y1).2^16
+ *      + (X0.Y3 + X1.Y2 + X2.Y1 + X3.Y0).2^24
+ *      + (X0.Y4 + X1.Y3 + X2.Y2 + X3.Y1 + X4.Y0).2^32
+ *      etc.
+ *
+ * Each individual product is computed with the native built-in 8x8 multiply instructions.
+ * The resulting processing time is on the order of 150 instructions.
+ *
+ * The two operands are found in __D0 and the first kernel nano-stack entry.
+ * The result goes into __R0 (lsbits) and __R1 (msbits).
+ * Also, __R2 contains the return address register, instead of __RET__.
+ */
+#include <stdint.h>
+
+static inline __attribute__((always_inline)) uint16_t
+_mul00(uint32_t a, uint32_t b)
+{
+#ifndef DPU
+    return (a & 0xff) * (b & 0xff);
+#else
+    uint32_t r;
+    __asm__ volatile("mul_ul_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return r;
+#endif
+}
+
+static inline __attribute__((always_inline)) uint16_t
+_mul01(uint32_t a, uint32_t b)
+{
+#ifndef DPU
+    return (a & 0xff) * ((b >> 8) & 0xff);
+#else
+    uint32_t r;
+    __asm__ volatile("mul_ul_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return r;
+#endif
+}
+
+#define _mul02(a, b) _mul00(a, (b >> 16))
+#define _mul03(a, b) _mul01(a, (b >> 16))
+
+static inline __attribute__((always_inline)) uint16_t
+_mul11(uint32_t a, uint32_t b)
+{
+#ifndef DPU
+    return ((a >> 8) & 0xff) * ((b >> 8) & 0xff);
+#else
+    uint32_t r;
+    __asm__ volatile("mul_uh_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return r;
+#endif
+}
+
+static inline __attribute__((always_inline)) uint16_t
+_mul12(uint32_t a, uint32_t b)
+{
+#ifndef DPU
+    return ((a >> 8) & 0xff) * ((b >> 16) & 0xff);
+#else
+    uint32_t r = (b >> 16);
+    __asm__ volatile("mul_uh_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(r) :);
+    return r;
+#endif
+}
+
+#define _mul13(a, b) _mul11(a, (b >> 16))
+#define _mul22(a, b) _mul00((a >> 16), (b >> 16))
+#define _mul23(a, b) _mul01((a >> 16), (b >> 16))
+#define _mul33(a, b) _mul11((a >> 16), (b >> 16))
+
+#define mulx0y0(xl, yl) _mul00(xl, yl)
+#define mulx0y1(xl, yl) _mul01(xl, yl)
+#define mulx0y2(xl, yl) _mul02(xl, yl)
+#define mulx0y3(xl, yl) _mul03(xl, yl)
+#define mulx0y4(xl, yh) _mul00(xl, yh)
+#define mulx0y5(xl, yh) _mul01(xl, yh)
+#define mulx0y6(xl, yh) _mul02(xl, yh)
+#define mulx0y7(xl, yh) _mul03(xl, yh)
+
+#define mulx1y1(xl, yl) _mul11(xl, yl)
+#define mulx1y2(xl, yl) _mul12(xl, yl)
+#define mulx1y3(xl, yl) _mul13(xl, yl)
+#define mulx1y4(xl, yh) _mul01(yh, xl)
+#define mulx1y5(xl, yh) _mul11(xl, yh)
+#define mulx1y6(xl, yh) _mul12(xl, yh)
+
+#define mulx2y2(xl, yl) _mul22(xl, yl)
+#define mulx2y3(xl, yl) _mul23(xl, yl)
+#define mulx2y4(xl, yh) _mul02(yh, xl)
+#define mulx2y5(xl, yh) _mul12(yh, xl)
+
+#define mulx3y3(xl, yl) _mul33(xl, yl)
+#define mulx3y4(xl, yh) _mul03(yh, xl)
+
+// Symmetry...
+#define mulx1y0(xl, yl) mulx0y1(yl, xl)
+#define mulx2y0(xl, yl) mulx0y2(yl, xl)
+#define mulx2y1(xl, yl) mulx1y2(yl, xl)
+#define mulx3y0(xl, yl) mulx0y3(yl, xl)
+#define mulx3y1(xl, yl) mulx1y3(yl, xl)
+#define mulx3y2(xl, yl) mulx2y3(yl, xl)
+#define mulx4y0(xh, yl) mulx0y4(yl, xh)
+#define mulx4y1(xh, yl) mulx1y4(yl, xh)
+#define mulx4y2(xh, yl) mulx2y4(yl, xh)
+#define mulx4y3(xh, yl) mulx3y4(yl, xh)
+#define mulx5y0(xh, yl) mulx0y5(yl, xh)
+#define mulx5y1(xh, yl) mulx1y5(yl, xh)
+#define mulx5y2(xh, yl) mulx2y5(yl, xh)
+#define mulx6y0(xh, yl) mulx0y6(yl, xh)
+#define mulx6y1(xh, yl) mulx1y6(yl, xh)
+#define mulx7y0(xh, yl) mulx0y7(yl, xh)
+
+uint64_t
+__muldi3(uint64_t x, uint64_t y)
+{
+    uint32_t xl = x;
+    uint32_t xh = ((uint64_t)x >> 32);
+    uint32_t yl = y;
+    uint32_t yh = ((uint64_t)y >> 32);
+
+    // Each fragment of the product.
+    uint32_t p0, p1, p2, p3, p4, p5, p6, p7, rh;
+    uint64_t rl;
+
+    p0 = mulx0y0(xl, yl);
+    rl = (uint64_t)p0;
+
+    p1 = mulx0y1(xl, yl) + mulx1y0(xl, yl);
+    rl += ((uint64_t)p1 << 8);
+
+    p2 = mulx0y2(xl, yl) + mulx2y0(xl, yl) + mulx1y1(xl, yl);
+    rl += ((uint64_t)p2 << 16);
+
+    p3 = mulx0y3(xl, yl) + mulx3y0(xl, yl) + mulx1y2(xl, yl) + mulx2y1(xl, yl);
+    rl += ((uint64_t)p3 << 24);
+
+    p4 = mulx0y4(xl, yh) + mulx4y0(xh, yl) + mulx1y3(xl, yl) + mulx3y1(xl, yl) + mulx2y2(xl, yl);
+    rh = p4;
+    
+    p5 = (mulx0y5(xl, yh) + mulx5y0(xh, yl) + mulx1y4(xl, yh) + mulx4y1(xh, yl)
+	  + mulx2y3(xl, yl) + mulx3y2(xl, yl));
+    rh += p5 << 8;
+
+    p6 = (mulx0y6(xl, yh) + mulx6y0(xh, yl) + mulx1y5(xl, yh) + mulx5y1(xh, yl)
+	  + mulx2y4(xl, yh) + mulx4y2(xh, yl) + mulx3y3(xl, yl));
+    rh += p6 << 16;
+    
+    p7 = (mulx0y7(xl, yh) + mulx7y0(xh, yl) + mulx1y6(xl, yh) + mulx6y1(xh, yl)
+	  + mulx2y5(xl, yh) + mulx5y2(xh, yl) + mulx3y4(xl, yh) + mulx4y3(xh, yl));
+    rh += p7 << 24;
+
+    return rl + (((uint64_t)rh) << 32);
+}
diff --git a/compiler-rt/lib/builtins/dpu/mulsi3.c b/compiler-rt/lib/builtins/dpu/mulsi3.c
new file mode 100644
index 0000000000000..f41210acd79cd
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mulsi3.c
@@ -0,0 +1,8 @@
+#include <stdint.h>
+
+extern int32_t __mul32(int32_t a, int32_t b);
+
+int32_t __mulsi3(int32_t a, int32_t b)
+{
+  return __mul32(a, b);
+}
diff --git a/compiler-rt/lib/builtins/dpu/udiv32.S b/compiler-rt/lib/builtins/dpu/udiv32.S
new file mode 100644
index 0000000000000..8298d37dd8a0e
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv32.S
@@ -0,0 +1,49 @@
+        .text
+        .globl  __udiv32
+        .type   __udiv32,@function
+__udiv32:
+	clz r2, r1, max, 1f // r2 = by how many the divider can be shifted on 32-bit
+	clz r3, r0         // r3 = number of useless bits of the dividend
+	sub r2, r3, r2, gtu, 2f// r2 = the maximal shift to be done
+	move r3, r1
+	move.u d0, r0
+	jump r2, 3f                 // As we will jump backward relatively to label 3 forward
+	div_step d0, r3, d0, 31
+	div_step d0, r3, d0, 30
+	div_step d0, r3, d0, 29
+	div_step d0, r3, d0, 28
+	div_step d0, r3, d0, 27
+	div_step d0, r3, d0, 26
+	div_step d0, r3, d0, 25
+	div_step d0, r3, d0, 24
+	div_step d0, r3, d0, 23
+	div_step d0, r3, d0, 22
+	div_step d0, r3, d0, 21
+	div_step d0, r3, d0, 20
+	div_step d0, r3, d0, 19
+	div_step d0, r3, d0, 18
+	div_step d0, r3, d0, 17
+	div_step d0, r3, d0, 16
+	div_step d0, r3, d0, 15
+	div_step d0, r3, d0, 14
+	div_step d0, r3, d0, 13
+	div_step d0, r3, d0, 12
+	div_step d0, r3, d0, 11
+	div_step d0, r3, d0, 10
+	div_step d0, r3, d0, 9
+	div_step d0, r3, d0, 8
+	div_step d0, r3, d0, 7
+	div_step d0, r3, d0, 6
+	div_step d0, r3, d0, 5
+	div_step d0, r3, d0, 4
+	div_step d0, r3, d0, 3
+	div_step d0, r3, d0, 2
+	div_step d0, r3, d0, 1
+3:
+	div_step d0, r3, d0, 0
+4:	
+	jump r23
+2:
+	move.u d0, r0, true, 4b
+1:
+	fault 2
diff --git a/compiler-rt/lib/builtins/dpu/udiv32.c b/compiler-rt/lib/builtins/dpu/udiv32.c
new file mode 100644
index 0000000000000..22f617e14fd71
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv32.c
@@ -0,0 +1,63 @@
+#include <stdint.h>
+
+uint64_t
+__udiv32(uint32_t dividend, uint32_t divider)
+{
+    uint64_t dest;
+
+    uint32_t temp0;
+    uint32_t temp1;
+
+    /* clang-format off */
+    __asm__ volatile("  clz %[temp0], %[divider], max, 1f\n" // %[temp0] = by how many the divider can be shifted on 32-bit
+                     "  clz %[temp1], %[dividend]\n" // %[temp1] = number of useless bits of the dividend
+                     "  sub %[temp0], %[temp1], %[temp0], gtu, 2f\n" // %[temp0] = the maximal shift to be done
+                     "  move %[temp1], %[divider]\n"
+                     "  move.u %[dest], %[dividend]\n"
+                     "  jump %[temp0], 3f\n" // As we will jump backward relatively to label 3 forward
+                     "  div_step %[dest], %[temp1], %[dest], 31\n"
+                     "  div_step %[dest], %[temp1], %[dest], 30\n"
+                     "  div_step %[dest], %[temp1], %[dest], 29\n"
+                     "  div_step %[dest], %[temp1], %[dest], 28\n"
+                     "  div_step %[dest], %[temp1], %[dest], 27\n"
+                     "  div_step %[dest], %[temp1], %[dest], 26\n"
+                     "  div_step %[dest], %[temp1], %[dest], 25\n"
+                     "  div_step %[dest], %[temp1], %[dest], 24\n"
+                     "  div_step %[dest], %[temp1], %[dest], 23\n"
+                     "  div_step %[dest], %[temp1], %[dest], 22\n"
+                     "  div_step %[dest], %[temp1], %[dest], 21\n"
+                     "  div_step %[dest], %[temp1], %[dest], 20\n"
+                     "  div_step %[dest], %[temp1], %[dest], 19\n"
+                     "  div_step %[dest], %[temp1], %[dest], 18\n"
+                     "  div_step %[dest], %[temp1], %[dest], 17\n"
+                     "  div_step %[dest], %[temp1], %[dest], 16\n"
+                     "  div_step %[dest], %[temp1], %[dest], 15\n"
+                     "  div_step %[dest], %[temp1], %[dest], 14\n"
+                     "  div_step %[dest], %[temp1], %[dest], 13\n"
+                     "  div_step %[dest], %[temp1], %[dest], 12\n"
+                     "  div_step %[dest], %[temp1], %[dest], 11\n"
+                     "  div_step %[dest], %[temp1], %[dest], 10\n"
+                     "  div_step %[dest], %[temp1], %[dest], 9\n"
+                     "  div_step %[dest], %[temp1], %[dest], 8\n"
+                     "  div_step %[dest], %[temp1], %[dest], 7\n"
+                     "  div_step %[dest], %[temp1], %[dest], 6\n"
+                     "  div_step %[dest], %[temp1], %[dest], 5\n"
+                     "  div_step %[dest], %[temp1], %[dest], 4\n"
+                     "  div_step %[dest], %[temp1], %[dest], 3\n"
+                     "  div_step %[dest], %[temp1], %[dest], 2\n"
+                     "  div_step %[dest], %[temp1], %[dest], 1\n"
+                     "3:\n"
+                     "  div_step %[dest], %[temp1], %[dest], 0\n"
+                     "4:\n"
+                     "  jump 5f\n"
+                     "2:\n"
+                     "  move.u %[dest], %[dividend], true, 4b\n"
+                     "1:\n"
+                     "  fault 2\n"
+                     "5:\n"
+                     : [dest] "=r"(dest), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1)
+                     : [dividend] "r"(dividend), [divider] "r"(divider));
+    /* clang-format on */
+
+    return dest;
+}
diff --git a/compiler-rt/lib/builtins/dpu/udiv64.c b/compiler-rt/lib/builtins/dpu/udiv64.c
new file mode 100644
index 0000000000000..e55b3ffe9904c
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv64.c
@@ -0,0 +1,59 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 unsigned division and remainder.
+ */
+#include <stdint.h>
+
+static unsigned int
+__clz__(uint64_t x)
+{
+    return __builtin_clzl(x);
+}
+
+uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder)
+{
+    uint64_t dxo = dividend, dxe = 0;
+
+    if (divider == 0) {
+      __asm__ volatile("fault 2");
+      /* unreachable(); */
+      __builtin_unreachable();
+    }
+    if (divider > dividend) {
+        if (ask_remainder == 0)
+            return 0;
+        else
+            return dividend;
+    }
+
+    // Mimic the div_step instruction.
+    // div_step functionality:
+    //   if (Dxo >= (Ra<< #u5)) {
+    //     Dxo = Dxo - (Ra<< #u5);
+    //     Dxe = (Dxe << 1) | 1;
+    //   } else {
+    //     Dxe =  Dxe << 1;
+    //   }
+    int dividerl0 = __clz__(divider), dividendl0 = __clz__(dividend);
+
+    int i = dividerl0 - dividendl0;
+
+    for (; i >= 0; i--) {
+        uint64_t pivot = ((uint64_t)divider << i);
+        if (dxo >= pivot) {
+            dxo = dxo - pivot;
+            dxe = ((uint64_t)dxe << 1) | 1L;
+        } else {
+            dxe = (uint64_t)dxe << 1;
+        }
+    }
+    if (ask_remainder == 1)
+        return dxo;
+    else
+        return dxe;
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivdi3.c b/compiler-rt/lib/builtins/dpu/udivdi3.c
new file mode 100644
index 0000000000000..1b60b934b85f4
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivdi3.c
@@ -0,0 +1,19 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 unsigned division.
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+uint64_t
+__udivdi3(uint64_t dividend, uint64_t divider)
+{
+    return __udiv64(dividend, divider, 0);
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivmodsi4.c b/compiler-rt/lib/builtins/dpu/udivmodsi4.c
new file mode 100644
index 0000000000000..3a3f3902b6f61
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivmodsi4.c
@@ -0,0 +1,29 @@
+/*===-- udivmodsi4.c - Implement __udivmodsi4 ------------------------------===
+ *
+ *                    The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __udivmodsi4 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+#include "int_lib.h"
+
+/* Returns: a / b, *rem = a % b  */
+
+COMPILER_RT_ABI su_int
+__udivmodsi4(su_int a, su_int b, su_int *rem)
+{
+    uint64_t res = __udiv32(a, b);
+    *rem = (su_int)res;
+    return (su_int) (res >> 32);
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivsi3.c b/compiler-rt/lib/builtins/dpu/udivsi3.c
new file mode 100644
index 0000000000000..dcc1d9fcf672f
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivsi3.c
@@ -0,0 +1,15 @@
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+#include "../int_lib.h"
+
+typedef su_int fixuint_t;
+typedef si_int fixint_t;
+
+// Returns: a / b
+
+COMPILER_RT_ABI su_int __udivsi3(su_int a, su_int b) {
+  uint64_t res = __udiv32(a, b);
+  return (su_int) (res >> 32);
+}
diff --git a/compiler-rt/lib/builtins/dpu/umoddi3.c b/compiler-rt/lib/builtins/dpu/umoddi3.c
new file mode 100644
index 0000000000000..4b3a82b01eb98
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/umoddi3.c
@@ -0,0 +1,19 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 unsigned remainder.
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+uint64_t
+__umoddi3(uint64_t dividend, uint64_t divider)
+{
+    return __udiv64(dividend, divider, 1);
+}
diff --git a/compiler-rt/lib/builtins/dpu/umodsi3.c b/compiler-rt/lib/builtins/dpu/umodsi3.c
new file mode 100644
index 0000000000000..c85cd8a4d9aed
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/umodsi3.c
@@ -0,0 +1,27 @@
+/* ===-- umodsi3.c - Implement __umodsi3 -----------------------------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __umodsi3 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include "int_lib.h"
+
+/* Returns: a % b */
+
+extern unsigned long
+__udiv32(unsigned int, unsigned int);
+
+COMPILER_RT_ABI su_int
+__umodsi3(su_int a, su_int b)
+{
+    unsigned long res = __udiv32(a, b);
+    return (unsigned int)res;
+}
diff --git a/compiler-rt/test/builtins/Unit/comparedf2_test.c b/compiler-rt/test/builtins/Unit/comparedf2_test.c
index 27666e2ad689b..d606ae7eff6ca 100644
--- a/compiler-rt/test/builtins/Unit/comparedf2_test.c
+++ b/compiler-rt/test/builtins/Unit/comparedf2_test.c
@@ -458,7 +458,7 @@ static const struct TestVector vectors[] = {
     {__builtin_inf(),__builtin_inf(),0,0,0,0,0,0,0},
 };    
 
-int main(int argc, char *argv[]) {
+int main() {
     const int numVectors = sizeof vectors / sizeof vectors[0];
     int i;
     for (i = 0; i<numVectors; ++i) {
diff --git a/compiler-rt/test/builtins/Unit/comparesf2_test.c b/compiler-rt/test/builtins/Unit/comparesf2_test.c
index b6a52b74633aa..f129bece62364 100644
--- a/compiler-rt/test/builtins/Unit/comparesf2_test.c
+++ b/compiler-rt/test/builtins/Unit/comparesf2_test.c
@@ -458,7 +458,7 @@ static const struct TestVector vectors[] = {
     {__builtin_inff(),__builtin_inff(),0,0,0,0,0,0,0},
 };    
 
-int main(int argc, char *argv[]) {
+int main() {
     const int numVectors = sizeof vectors / sizeof vectors[0];
     int i;
     for (i = 0; i<numVectors; ++i) {

From e802fe623fd65b79c37f1d7808f3642c8e6ad885 Mon Sep 17 00:00:00 2001
From: Reshabh Sharma <Reshabhkumar.Sharma@amd.com>
Date: Thu, 29 Apr 2021 11:04:23 +0530
Subject: [PATCH 8/8] [ASAN] NFC: Use addrspace cast for pointers in non-zero
 addrspace

Pointers in non-zero address spaces need to be address-space
cast before appending to the used list.

Reviewed by: vitalybuka

Differential Revision: https://reviews.llvm.org/D101363
---
 llvm/lib/Transforms/Utils/ModuleUtils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index ef9f18a2289e9..90675143a8c84 100644
--- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -87,7 +87,7 @@ static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *>
 
   Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
   for (auto *V : Values) {
-    Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
+    Constant *C = ConstantExpr::getPointerBitCastOrAddrSpaceCast(V, Int8PtrTy);
     if (InitAsSet.insert(C).second)
       Init.push_back(C);
   }