diff --git a/compiler-rt/dpu/CMakeLists.txt b/compiler-rt/dpu/CMakeLists.txt
new file mode 100644
index 0000000000000..19e3c2790baf1
--- /dev/null
+++ b/compiler-rt/dpu/CMakeLists.txt
@@ -0,0 +1,306 @@
+cmake_minimum_required(VERSION 3.13)
+project(librt LANGUAGES C ASM)
+# LLVM binutils, given as bare command names (no absolute paths).
+set(CMAKE_AR "llvm-ar")
+set(CMAKE_RANLIB "llvm-ranlib")
+set(CMAKE_NM "llvm-nm")
+set(CMAKE_OBJDUMP "llvm-objdump")
+set(CMAKE_LINKER "llvm-ld")
+# Plain helper variables; nothing else in this CMakeLists.txt reads them.
+set(OBJCOPY "llvm-objcopy")
+set(CLANGFORMAT "clang-format")
+# compiler-rt builtins sources, relative to this directory.
+set(COMPILER_RT_BUILTINS_DIR "../lib/builtins")
+# Sources of the DPU runtime library: DPU-specific builtins first, generic compiler-rt builtins after.
+set(GENERIC_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/mul32.S
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/mulsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/muldi3.c
+  # 32-bit division and remainder.
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.S
+  # ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.c is not built: udiv32.S above is the optimized version.
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/div32.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/divsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/modsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/umodsi3.c
+  # 64-bit division and remainder.
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv64.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/divdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/moddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/umoddi3.c
+  # Generic compiler-rt builtins. NOTE(review): basenames such as divsi3.c, modsi3.c, divdi3.c and moddi3.c are also listed from dpu/ above, so presumably two objects define the same symbol -- confirm which one is meant to be linked.
+  ${COMPILER_RT_BUILTINS_DIR}/absvdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/absvsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/adddf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashldi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashrdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/bswapdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/bswapsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/cmpdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparedf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparesf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/divdf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmoddi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendsfdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendhfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffsdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffssi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatdidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatdisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatundidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatundisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/fp_mode.c
+  ${COMPILER_RT_BUILTINS_DIR}/int_util.c
+  ${COMPILER_RT_BUILTINS_DIR}/lshrdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/moddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/modsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulodi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulosi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/negdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/paritydi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/paritysi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powidf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powisf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/subdf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncdfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncdfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncsfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ucmpdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmoddi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umoddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umodsi3.c
+  )
+# The *ti*/*tf* builtins (by compiler-rt naming: 128-bit integer and long-double helpers); this list is currently not added to SOURCES.
+set(GENERIC_TF_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/addtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/absvti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashrti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparetf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/cmpti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/divtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmodti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/extenddftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendhftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendsftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffsti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatditf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunditf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/lshrti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/modti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muloti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/multf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/multi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/negti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powitf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/subtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ucmpti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmodti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umodti3.c
+  )
+# Complex multiply/divide helpers (the *sc3 and *dc3 files); this list is currently not added to SOURCES.
+set(GENERIC_COMPLEX_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/divdc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsc3.c
+  )
+# Same four files plus the *tc3 variants; it repeats GENERIC_COMPLEX_SOURCES, so enable at most one of the two lists.
+set(GENERIC_COMPLEX_TF_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/divdc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divtc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/multc3.c
+  )
+# Files that are actually built; the optional groups stay commented out until they are supported.
+set(SOURCES ${GENERIC_SOURCES}
+  # ${GENERIC_TF_SOURCES}
+  # ${GENERIC_COMPLEX_SOURCES}
+  # ${GENERIC_COMPLEX_TF_SOURCES}
+  )
+
+# add_dpu_library(TARGET <name> SOURCES <file>...
+#                 [OPT_LEVEL <-Ox>] [LTO <-flto|-flto=thin>] [PROFILING])
+#
+# Builds and installs one static-library flavour of the DPU builtins.
+# The target is named <name>[_<opt>]_[<lto>][_pg], with "-", "-f" and "="
+# stripped from the flags (e.g. rt_O2_, rt_O2_ltothin). PROFILING adds -pg.
+# The archive is installed to <opt>/<lto>, or <opt>/no_lto without LTO.
+function(add_dpu_library)
+  set(options PROFILING)
+  set(oneValueArgs TARGET OPT_LEVEL LTO)
+  set(multiValueArgs SOURCES)
+  cmake_parse_arguments(PARSE_ARGV 0 arg "${options}" "${oneValueArgs}" "${multiValueArgs}")
+
+  # Fail early on typos or missing mandatory arguments instead of producing
+  # a confusing add_library() error further down.
+  if (arg_UNPARSED_ARGUMENTS)
+    message(FATAL_ERROR "add_dpu_library: unknown arguments: ${arg_UNPARSED_ARGUMENTS}")
+  endif()
+  if (NOT arg_TARGET OR NOT arg_SOURCES)
+    message(FATAL_ERROR "add_dpu_library: TARGET and SOURCES are required")
+  endif()
+
+  set(LOCAL_TARGET ${arg_TARGET})
+  set(INSTALL_SUBDIR "")
+  set(OTHER_FLAGS -Wall -Wextra)
+
+  if (arg_OPT_LEVEL)
+    list(APPEND OTHER_FLAGS ${arg_OPT_LEVEL})
+    # "-O2" -> "O2": used as target-name suffix and as install directory.
+    string(REPLACE "-" "" arg_OPT_LEVEL "${arg_OPT_LEVEL}")
+    string(APPEND LOCAL_TARGET "_${arg_OPT_LEVEL}")
+    string(APPEND INSTALL_SUBDIR "${arg_OPT_LEVEL}/")
+  endif()
+  if (arg_LTO)
+    list(APPEND OTHER_FLAGS ${arg_LTO})
+    # "-flto" -> "lto", "-flto=thin" -> "ltothin".
+    string(REPLACE "-f" "" arg_LTO "${arg_LTO}")
+    string(REPLACE "=" "" arg_LTO "${arg_LTO}")
+    string(APPEND LOCAL_TARGET "_${arg_LTO}")
+    string(APPEND INSTALL_SUBDIR "${arg_LTO}")
+  else()
+    # The trailing "_" is intentional: the non-LTO target is <name>_<opt>_.
+    string(APPEND LOCAL_TARGET "_")
+    string(APPEND INSTALL_SUBDIR "no_lto")
+  endif()
+  if (arg_PROFILING)
+    list(APPEND OTHER_FLAGS -pg)
+    string(APPEND LOCAL_TARGET "_pg")
+  endif()
+
+  list(APPEND OTHER_FLAGS -g0)
+  # CMake de-duplicates compile options, so a second "-mllvm <opt>" pair
+  # would lose its "-mllvm"; use the "SHELL:" prefix if more are needed.
+  list(APPEND OTHER_FLAGS -mllvm -verify-machineinstrs)
+
+  message(STATUS "add_dpu_library: ${LOCAL_TARGET} flags=[${OTHER_FLAGS}] install=[${INSTALL_SUBDIR}]")
+
+  add_library(${LOCAL_TARGET} STATIC ${arg_SOURCES})
+
+  target_include_directories(${LOCAL_TARGET} PRIVATE
+    ${COMPILER_RT_BUILTINS_DIR}
+    ${COMPILER_RT_BUILTINS_DIR}/dpu)
+
+  # NOSTDLIB_FLAGS, STRICT_FLAGS and COMPILER_TIMESTAMP_DEF are not set in this
+  # file; they expand to nothing unless the caller defines them (e.g. with -D).
+  target_compile_options(${LOCAL_TARGET}
+    PRIVATE ${NOSTDLIB_FLAGS} ${STRICT_FLAGS} ${COMPILER_TIMESTAMP_DEF} ${OTHER_FLAGS})
+
+  install(
+    TARGETS ${LOCAL_TARGET}
+    ARCHIVE
+    DESTINATION ${INSTALL_SUBDIR}
+    )
+endfunction()
+
+# Build matrix. For each optimisation level three archives are produced:
+#
+#   target            extra flag     install directory
+#   rt_<opt>_         (none)         <opt>/no_lto
+#   rt_<opt>_lto      -flto          <opt>/lto
+#   rt_<opt>_ltothin  -flto=thin     <opt>/ltothin
+#
+# Install directories are relative to CMAKE_INSTALL_PREFIX.
+#
+# where <opt> is the level without its dash (O0, O1, O2, O3, Os): 5 levels
+# x 3 flavours give 15 archives in total.
+#
+# compiler_rt_tests.sh links against these names (-lrt_<opt>_<lto>), so
+# keep both in sync when changing them.
+#
+# Profiling (-pg) flavours are not built. add_dpu_library() accepts a
+# PROFILING option for that; it appends "_pg" to the target name.
+
+foreach(opt_flag IN ITEMS -O0 -O1 -O2 -O3 -Os)
+  # Flavour without link-time optimisation.
+  add_dpu_library(
+    TARGET rt
+    OPT_LEVEL ${opt_flag}
+    SOURCES ${SOURCES}
+    )
+
+  # Full LTO and ThinLTO flavours.
+  foreach(lto_flag IN ITEMS -flto -flto=thin)
+    add_dpu_library(
+      TARGET rt
+      OPT_LEVEL ${opt_flag}
+      LTO ${lto_flag}
+      SOURCES ${SOURCES}
+      )
+  endforeach()
+endforeach()
diff --git a/compiler-rt/dpu/Toolchain.cmake b/compiler-rt/dpu/Toolchain.cmake
new file mode 100644
index 0000000000000..ae09a95e9b705
--- /dev/null
+++ b/compiler-rt/dpu/Toolchain.cmake
@@ -0,0 +1,12 @@
+# CMake toolchain file: cross-compile the DPU builtins with dpu-clang.
+# (The deprecated CMakeForceCompiler module is no longer included: none of
+# its CMAKE_FORCE_*_COMPILER macros are used here.)
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_CROSSCOMPILING 1)
+set(CMAKE_ASM_COMPILER dpu-clang)
+set(CMAKE_C_COMPILER dpu-clang)
+set(CMAKE_CXX_COMPILER dpu-clang)
+# try_compile() builds a static library (no link step) and the compiler sanity checks are skipped.
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+set(CMAKE_C_COMPILER_WORKS 1)
+set(CMAKE_CXX_COMPILER_WORKS 1)
diff --git a/compiler-rt/dpu/compiler_rt_tests.sh b/compiler-rt/dpu/compiler_rt_tests.sh
new file mode 100644
index 0000000000000..350c9ce6fb7e1
--- /dev/null
+++ b/compiler-rt/dpu/compiler_rt_tests.sh
@@ -0,0 +1,265 @@
+#!/bin/bash
+
+set -eux
+# Source locations; export COMPILER_RT / COMPILER_RT_TESTS to use another checkout.
+COMPILER_RT=${COMPILER_RT:-${HOME}/work/dpu_tools_fix_64bit_reg/llvm-project/compiler-rt/lib/builtins}
+COMPILER_RT_TESTS=${COMPILER_RT_TESTS:-${HOME}/work/dpu_tools_fix_64bit_reg/llvm-project/compiler-rt/test/builtins/Unit}
+
+# not supported
+# declare -a TESTS_=(
+    # absvti2_test.c
+    # adddf3vfp_test.c
+    # addsf3vfp_test.c
+    # addtf3_test.c
+    # addvti3_test.c
+    # ashlti3_test.c
+    # ashrti3_test.c
+    # clzti2_test.c
+    # cmpti2_test.c
+    # compiler_rt_logb_test.c
+    # compiler_rt_logbf_test.c
+    # compiler_rt_logbl_test.c
+    # ctzti2_test.c
+    # divdc3_test.c
+    # divdf3vfp_test.c
+    # divmodti4_test.c
+    # divsf3vfp_test.c
+    # divsc3_test.c
+    # divtc3_test.c
+    # divtf3_test.c
+    # divti3_test.c
+    # divxc3_test.c
+    # eqdf2vfp_test.c
+    # eqsf2vfp_test.c
+    # eqtf2_test.c
+    # extenddftf2_test.c
+    # extendhftf2_test.c
+    # extendsfdf2vfp_test.c
+    # extendsftf2_test.c
+    # ffsti2_test.c
+    # fixdfsivfp_test.c
+    # fixdfti_test.c
+    # fixsfsivfp_test.c
+    # fixsfti_test.c
+    # fixtfti_test.c
+    # fixunsdfsivfp_test.c
+    # fixunsdfti_test.c
+    # fixunssfsivfp_test.c
+    # fixunssfti_test.c
+    # floatditf_test.c
+    # floatsidfvfp_test.c
+    # floatsisfvfp_test.c
+    # floatunditf_test.c
+    # floatunssidfvfp_test.c
+    # floatunssisfvfp_test.c
+    # muldc3_test.c
+    # ltdf2vfp_test.c
+    # ltsf2vfp_test.c
+    # gedf2vfp_test.c
+    # gesf2vfp_test.c
+    # gtdf2vfp_test.c
+    # gtsf2vfp_test.c
+    # ledf2vfp_test.c
+    # lesf2vfp_test.c
+    # muldf3vfp_test.c
+    # mulsf3vfp_test.c
+    # nedf2vfp_test.c
+    # negdf2vfp_test.c
+    # negsf2vfp_test.c
+    # nesf2vfp_test.c
+    # subdf3vfp_test.c
+    # subsf3vfp_test.c
+    # truncdfsf2vfp_test.c
+    # unorddf2vfp_test.c
+    # unordsf2vfp_test.c
+    # mulsc3_test.c
+    # mulxc3_test.c
+    # powixf2_test.c
+    # subvti3_test.c
+    # ucmpti2_test.c
+    # udivmodti4_test.c
+    # udivti3_test.c
+    # umodti3_test.c
+    # subtf3_test.c
+    # powitf2_test.c
+    # negvti2_test.c
+    # modti3_test.c
+    # muloti4_test.c
+    # multc3_test.c
+    # multi3_test.c
+    # mulvti3_test.c
+    # negti2_test.c
+    # netf2_test.c
+    # parityti2_test.c
+    # popcountti2_test.c
+    # fixtfdi_test.c
+    # fixtfsi_test.c
+    # fixunstfdi_test.c
+    # fixunstfsi_test.c
+    # fixunstfti_test.c
+    # fixunsxfdi_test.c
+    # fixunsxfsi_test.c
+    # fixunsxfti_test.c
+    # fixxfti_test.c
+    # floatdixf_test.c
+    # floatsitf_test.c
+    # floattidf_test.c
+    # floattisf_test.c
+    # floattitf_test.c
+    # floattixf_test.c
+    # floatundixf_test.c
+    # floatunsitf_test.c
+    # floatuntidf_test.c
+    # floatuntisf_test.c
+    # floatuntitf_test.c
+    # floatuntixf_test.c
+    # getf2_test.c
+    # gttf2_test.c
+    # letf2_test.c
+    # lshrti3_test.c
+    # lttf2_test.c
+    # multf3_test.c
+    # unordtf2_test.c
+    # trunctfdf2_test.c
+    # trunctfhf2_test.c
+    # trunctfsf2_test.c
+    # fixxfdi_test.c
+    # udivmoddi4_test.c # too big :)
+# )
+# Unit tests that are built and run; each entry is a file name below ${COMPILER_RT_TESTS}.
+declare -a TESTS=(
+    test.c  # NOTE(review): not a *_test.c file like the others -- confirm it exists in ${COMPILER_RT_TESTS}
+    absvdi2_test.c
+    absvsi2_test.c
+    addvdi3_test.c
+    addvsi3_test.c
+    ashldi3_test.c
+    ashrdi3_test.c
+    bswapdi2_test.c
+    bswapsi2_test.c
+    clzdi2_test.c
+    clzsi2_test.c
+    cmpdi2_test.c
+    comparedf2_test.c
+    comparesf2_test.c
+    ctzdi2_test.c
+    ctzsi2_test.c
+    divdf3_test.c
+    divdi3_test.c
+    divmodsi4_test.c
+    divsf3_test.c
+    divsi3_test.c
+    extendhfsf2_test.c
+    ffsdi2_test.c
+    ffssi2_test.c
+    fixdfdi_test.c
+    fixsfdi_test.c
+    fixunsdfdi_test.c
+    fixunsdfsi_test.c
+    fixunssfdi_test.c
+    fixunssfsi_test.c
+    floatdidf_test.c
+    floatdisf_test.c
+    floatundidf_test.c
+    floatundisf_test.c
+    lshrdi3_test.c
+    moddi3_test.c
+    modsi3_test.c
+    muldi3_test.c
+    mulodi4_test.c
+    mulosi4_test.c
+    mulsi3_test.c
+    mulvdi3_test.c
+    mulvsi3_test.c
+    negdi2_test.c
+    negvdi2_test.c
+    negvsi2_test.c
+    paritydi2_test.c
+    paritysi2_test.c
+    popcountdi2_test.c
+    popcountsi2_test.c
+    powidf2_test.c
+    powisf2_test.c
+    subvdi3_test.c
+    subvsi3_test.c
+    truncdfhf2_test.c
+    truncdfsf2_test.c
+    truncsfhf2_test.c
+    ucmpdi2_test.c
+    udivdi3_test.c
+    udivmodsi4_test.c
+    udivsi3_test.c
+    umoddi3_test.c
+    umodsi3_test.c
+)
+# Optimisation levels to test, written without the leading dash; uncomment entries to widen the run.
+declare -a OPT_LEVELS=(
+    O0
+    # O1
+    # O2
+    # O3
+    # Os
+)
+# Library flavours to test; each name is also the install sub-directory that is searched with -L.
+declare -a COMPILER_OPTIONS=(
+    no_lto
+    # lto
+    # ltothin
+)
+
+# Directory the script is started from; libraries are read from ${MYPWD}/install/.
+MYPWD=$(pwd)
+# LLDB driver that runs one test binary; export LLDB_RUNNER to use another copy.
+LLDB_RUNNER=${LLDB_RUNNER:-${HOME}/work/simple_examples/lldb_python.py}
+
+mkdir -p test
+cd test
+
+for COMPILER_OPTION in "${COMPILER_OPTIONS[@]}"
+do
+    # Map the flavour name to the clang LTO flag and the library-name suffix; unknown names are rejected.
+    case "${COMPILER_OPTION}" in
+        "no_lto")  COMPILER_OPTION_="";           COMPILER_OPTION_LIB="";;
+        "lto")     COMPILER_OPTION_="-flto";      COMPILER_OPTION_LIB="lto";;
+        "ltothin") COMPILER_OPTION_="-flto=thin"; COMPILER_OPTION_LIB="ltothin";;
+        *) echo "unknown compiler option: ${COMPILER_OPTION}" >&2; exit 1;;
+    esac
+
+    mkdir -p "${COMPILER_OPTION}"
+    cd "${COMPILER_OPTION}"
+
+    for OPT_LEVEL in "${OPT_LEVELS[@]}"
+    do
+        mkdir -p "${OPT_LEVEL}"
+        cd "${OPT_LEVEL}"
+
+        for TEST in "${TESTS[@]}"
+        do
+            TEST_BIN=$(basename "${TEST}" .c)
+            TEST_LOG="$(basename "${TEST}")_compiler_log.txt"
+            # ${COMPILER_OPTION_} is left unquoted on purpose: when empty it must
+            # expand to no argument at all. On failure the compiler log is shown.
+            clang --target=dpu-upmem-dpurte -mcpu=v1A \
+                  -I"${COMPILER_RT}" \
+                  -g0 \
+                  -v \
+                  -save-temps \
+                  -I "${MYPWD}" \
+                  ${COMPILER_OPTION_} \
+                  "-${OPT_LEVEL}" \
+                  "${COMPILER_RT_TESTS}/${TEST}" \
+                  -o "${TEST_BIN}" \
+                  -L "${MYPWD}/install/${OPT_LEVEL}/${COMPILER_OPTION}/" \
+                  "-lrt_${OPT_LEVEL}_${COMPILER_OPTION_LIB}" \
+                  &> "${TEST_LOG}" || { cat "${TEST_LOG}" >&2; exit 1; }
+
+            # Run the binary under LLDB; a non-zero exit status stops the whole run (set -e).
+            python3 "${LLDB_RUNNER}" "${TEST_BIN}"
+        done
+        cd ..
+    done
+
+    cd ..
+done
+cd ..
+# Handy extra clang flags when debugging: -mllvm -debug -mllvm -print-after-all -mllvm -verify-machineinstrs --thinlto-jobs=1
diff --git a/compiler-rt/dpu/lldb_python.py b/compiler-rt/dpu/lldb_python.py
new file mode 100644
index 0000000000000..e333723af601e
--- /dev/null
+++ b/compiler-rt/dpu/lldb_python.py
@@ -0,0 +1,42 @@
+import sys
+import os
+import subprocess
+import dpu
+import lldb
+import tempfile
+
+binary = sys.argv[1]
+
+debugger = lldb.SBDebugger().Create()
+debugger.SetAsync(False)
+
+target = debugger.CreateTarget(binary)
+assert target.IsValid()
+
+launch_info = lldb.SBLaunchInfo(None)
+launch_info.SetWorkingDirectory(os.getcwd())
+
+with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+    stdout_path = tmp_file.name
+
+launch_info.AddOpenFileAction(1, stdout_path, False, True)
+
+# process = target.Launch(debugger.GetListener(), None, None, ".",
+#                         "stdout.txt", "stderr.txt", None, 0, False, error)
+process = target.Launch(launch_info, lldb.SBError())
+# process = target.LaunchSimple(None, None, ".")
+
+# print(process)
+
+assert process.IsValid()
+
+with open(stdout_path, 'r') as file:
+    stdout_data = file.read()
+
+os.remove(stdout_path)
+
+print(stdout_data)
+
+# Cleanup LLDB
+# lldb.SBDebugger.Terminate()
+sys.exit(process.exit_state)
diff --git a/compiler-rt/lib/builtins/dpu/div32.c b/compiler-rt/lib/builtins/dpu/div32.c
new file mode 100644
index 0000000000000..df25bbbdaf9d4
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/div32.c
@@ -0,0 +1,97 @@
+/* Copyright 2024 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+/*
+ * Signed 32-bit division producing both the quotient and the remainder.
+ *
+ * The operands are reduced to their magnitudes, divided with __udiv32()
+ * (quotient in the upper 32 bits of its result, remainder in the lower
+ * 32 bits) and the signs are then re-applied:
+ *
+ *   dividend  divider  |  quotient  remainder
+ *      +         +     |     +          +
+ *      +         -     |     -          +
+ *      -         +     |     -          -
+ *      -         -     |     +          -
+ *
+ * dividend, divider: the signed operands.
+ * p_q:   receives the quotient.
+ * p_rem: receives the remainder.
+ *
+ * Magnitudes are computed in unsigned arithmetic (0 - (uint32_t)x) so that
+ * negating INT32_MIN wraps instead of being undefined behaviour.
+ */
+void
+__div32(int32_t dividend, int32_t divider
+	, int32_t *p_q, int32_t *p_rem
+	)
+{
+    uint64_t res;
+    uint32_t q;
+    uint32_t rem;
+
+    /*
+     * Dispatch on the operand signs.
+     * NOTE(review): "clo zero, x, z, label" is assumed to branch to label when
+     * x has no leading one bit, i.e. when x >= 0 -- confirm with the DPU ISA.
+     */
+    __asm__ goto("clo zero, %[dividend], z, %l[__div32_pos_dividend]\n\t"
+                 "clo zero, %[divider], z, %l[__div32_neg_dividend_pos_divider]\n\t"
+                 :
+                 : [dividend] "r"(dividend), [divider] "r"(divider)
+                 :
+                 : __div32_pos_dividend, __div32_neg_dividend_pos_divider);
+
+    /* The quotient's sign depends on the signs of both operands. Branching */
+    /* per case was found (after a few tries) to be the quickest way to */
+    /* select the sign fix-ups. */
+
+    /* Fall-through case: negative dividend, negative divider. */
+    /* The quotient is positive and the remainder negative. */
+    res = __udiv32(0 - (uint32_t)dividend, 0 - (uint32_t)divider);
+    q = (uint32_t)(res >> 32);
+    rem = (uint32_t)res;
+    rem = 0 - rem;
+    goto recombine;
+
+__div32_neg_dividend_pos_divider:
+    /* The quotient is negative and the remainder negative. */
+    res = __udiv32(0 - (uint32_t)dividend, (uint32_t)divider);
+    q = (uint32_t)(res >> 32);
+    q = 0 - q;
+    rem = (uint32_t)res;
+    rem = 0 - rem;
+    goto recombine;
+
+__div32_pos_dividend:
+    __asm__ goto("clo zero, %[divider], z, %l[__div32_pos_dividend_pos_divider]"
+                 :
+                 : [divider] "r"(divider)
+                 :
+                 : __div32_pos_dividend_pos_divider);
+    /* Positive dividend, negative divider: */
+    /* the quotient is negative and the remainder positive. */
+    res = __udiv32((uint32_t)dividend, 0 - (uint32_t)divider);
+    q = (uint32_t)(res >> 32);
+    q = 0 - q;
+    rem = (uint32_t)res;
+    goto recombine;
+
+__div32_pos_dividend_pos_divider:
+    /* Both operands are positive: no sign fix-up is needed. */
+    res = __udiv32((uint32_t)dividend, (uint32_t)divider);
+    q = (uint32_t)(res >> 32);
+    rem = (uint32_t)res;
+
+    /* Common exit: hand both results back to the caller. */
+ recombine:
+    *p_q = q;
+    *p_rem = rem;
+    return;
+}
diff --git a/compiler-rt/lib/builtins/dpu/divdi3.c b/compiler-rt/lib/builtins/dpu/divdi3.c
new file mode 100644
index 0000000000000..178cbf35fd2ee
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/divdi3.c
@@ -0,0 +1,31 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 signed division.
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+
+extern uint64_t __udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+/* Returns dividend / divider (ask_remainder = 0 presumably selects the quotient of __udiv64). */
+int64_t
+__divdi3(int64_t dividend, int64_t divider)
+{
+    if (dividend >= 0) {
+        if (divider >= 0) {
+            return __udiv64(dividend, divider, 0);
+        } else {
+            return -__udiv64(dividend, -(uint64_t)divider, 0);
+        }
+    } else if (divider >= 0) {
+        /* Negate as uint64_t: -INT64_MIN would overflow in signed arithmetic. */
+        return -__udiv64(-(uint64_t)dividend, divider, 0);
+    } else {
+        /* Both operands negative: the signs cancel, the quotient is positive. */
+        return __udiv64(-(uint64_t)dividend, -(uint64_t)divider, 0);
+    }
+}
diff --git a/compiler-rt/lib/builtins/dpu/divsi3.c b/compiler-rt/lib/builtins/dpu/divsi3.c
new file mode 100644
index 0000000000000..8ec97468aaf83
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/divsi3.c
@@ -0,0 +1,23 @@
+/* Copyright 2024 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+
+/* extern int64_t __div32(int32_t dividend, int32_t divider); */
+extern void __div32(int32_t dividend, int32_t divider, int32_t *q, int32_t *rem);
+
+#include "int_lib.h"
+
+/* Returns: a / b */
+COMPILER_RT_ABI si_int
+__divsi3(si_int a, si_int b)
+{
+  /* __div32 computes quotient and remainder together; only the
+   * quotient is of interest here. */
+  int32_t quotient, remainder_unused;
+
+  __div32(a, b, &quotient, &remainder_unused);
+  return quotient;
+}
diff --git a/compiler-rt/lib/builtins/dpu/moddi3.c b/compiler-rt/lib/builtins/dpu/moddi3.c
new file mode 100644
index 0000000000000..dad11e699f87c
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/moddi3.c
@@ -0,0 +1,31 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 signed remainder (modulo).
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+/* Returns dividend % divider; the result takes the sign of the dividend. */
+int64_t
+__moddi3(int64_t dividend, int64_t divider)
+{
+    if (dividend >= 0) {
+        if (divider >= 0) {
+            return __udiv64(dividend, divider, 1);
+        } else {
+            return __udiv64(dividend, -(uint64_t)divider, 1);
+        }
+    } else if (divider >= 0) {
+        /* Negate as uint64_t: -INT64_MIN would overflow in signed arithmetic. */
+        return -__udiv64(-(uint64_t)dividend, divider, 1);
+    } else {
+        /* Negative dividend, negative divider: the remainder stays negative. */
+        return -__udiv64(-(uint64_t)dividend, -(uint64_t)divider, 1);
+    }
+}
diff --git a/compiler-rt/lib/builtins/dpu/modsi3.c b/compiler-rt/lib/builtins/dpu/modsi3.c
new file mode 100644
index 0000000000000..c0cc59e8c92f9
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/modsi3.c
@@ -0,0 +1,34 @@
+/* ===-- modsi3.c - Implement __modsi3 -------------------------------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __modsi3 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include <stdint.h>
+
+/* extern int64_t __div32(int32_t dividend, int32_t divider); */
+extern void __div32(int32_t dividend, int32_t divider, int32_t *q, int32_t *rem);
+
+#include "int_lib.h"
+
+/* Returns: a % b */
+
+COMPILER_RT_ABI si_int
+__modsi3(si_int a, si_int b)
+{
+  /* __div32 computes quotient and remainder together; only the
+   * remainder is of interest here, the quotient is computed but
+   * not used. */
+  int32_t quotient_unused, remainder;
+
+  __div32(a, b, &quotient_unused, &remainder);
+  return remainder;
+}
diff --git a/compiler-rt/lib/builtins/dpu/mul32.S b/compiler-rt/lib/builtins/dpu/mul32.S
new file mode 100644
index 0000000000000..fe735ab5b328f
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mul32.S
@@ -0,0 +1,48 @@
+        .text
+        .globl  __mul32
+        .type   __mul32,@function
+__mul32:                                  // 32-bit multiply helper; operands presumably arrive in r0 and r1
+        jgtu r1, r0, .Ltmp0               // r1 > r0 (unsigned)? then go to .Ltmp0
+        move r2, r0                       // otherwise: r2 = r0 ...
+        move r0, r1, true, .Ltmp1         // ... r0 = r1, and jump to .Ltmp1 ("true" is presumably an always-taken condition)
+.Ltmp0:
+        move r2, r1                       // r1 > r0: r2 = r1
+        // r0 already holds the smaller operand here, so no move is needed
+.Ltmp1:                                   // at this point r0 = smaller operand, r2 = larger operand
+        move r1, zero                     // clear r1 before the multiply steps
+        mul_step d0, r2, d0, 0, z, .Ltmp2 // NOTE(review): presumably one shift-and-add step per bit index (0..31) on the register pair d0, leaving early via .Ltmp2 when the z condition is met -- confirm in the DPU ISA
+        mul_step d0, r2, d0, 1, z, .Ltmp2
+        mul_step d0, r2, d0, 2, z, .Ltmp2
+        mul_step d0, r2, d0, 3, z, .Ltmp2
+        mul_step d0, r2, d0, 4, z, .Ltmp2
+        mul_step d0, r2, d0, 5, z, .Ltmp2
+        mul_step d0, r2, d0, 6, z, .Ltmp2
+        mul_step d0, r2, d0, 7, z, .Ltmp2
+        mul_step d0, r2, d0, 8, z, .Ltmp2
+        mul_step d0, r2, d0, 9, z, .Ltmp2
+        mul_step d0, r2, d0, 10, z, .Ltmp2
+        mul_step d0, r2, d0, 11, z, .Ltmp2
+        mul_step d0, r2, d0, 12, z, .Ltmp2
+        mul_step d0, r2, d0, 13, z, .Ltmp2
+        mul_step d0, r2, d0, 14, z, .Ltmp2
+        mul_step d0, r2, d0, 15, z, .Ltmp2
+        mul_step d0, r2, d0, 16, z, .Ltmp2
+        mul_step d0, r2, d0, 17, z, .Ltmp2
+        mul_step d0, r2, d0, 18, z, .Ltmp2
+        mul_step d0, r2, d0, 19, z, .Ltmp2
+        mul_step d0, r2, d0, 20, z, .Ltmp2
+        mul_step d0, r2, d0, 21, z, .Ltmp2
+        mul_step d0, r2, d0, 22, z, .Ltmp2
+        mul_step d0, r2, d0, 23, z, .Ltmp2
+        mul_step d0, r2, d0, 24, z, .Ltmp2
+        mul_step d0, r2, d0, 25, z, .Ltmp2
+        mul_step d0, r2, d0, 26, z, .Ltmp2
+        mul_step d0, r2, d0, 27, z, .Ltmp2
+        mul_step d0, r2, d0, 28, z, .Ltmp2
+        mul_step d0, r2, d0, 29, z, .Ltmp2
+        mul_step d0, r2, d0, 30, z, .Ltmp2
+        mul_step d0, r2, d0, 31, z, .Ltmp2
+.Ltmp2:
+        move r0, r1                       // the value built up in r1 is returned in r0
+        // return to the caller (r23 presumably holds the return address)
+        jump r23
diff --git a/compiler-rt/lib/builtins/dpu/mul32.c b/compiler-rt/lib/builtins/dpu/mul32.c
new file mode 100644
index 0000000000000..cc6be09b64847
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mul32.c
@@ -0,0 +1,59 @@
+#include <stdint.h>
+
+int32_t __mulsi3(int32_t a, int32_t b)
+{
+    int32_t dest;
+
+    int32_t temp0;
+    uint64_t temp1;
+
+    this is not working yet ...
+      temp1.hi/temp1.lo is not yet supported
+      
+    __asm__ volatile("  jgtu %[b], %[a], 1f\n"
+                     "  move %[temp0], %[a]\n"
+                     "  move %[temp1.hi], %[b], true, 2f\n"
+                     "1:\n"
+                     "  move %[temp0], %[b]\n"
+                     "  move %[temp1.hi], %[a]\n"
+                     "2:\n"
+                     "  move r1, zero\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 0 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 1 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 2 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 3 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 4 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 5 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 6 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 7 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 8 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 9 , z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 10, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 11, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 12, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 13, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 14, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 15, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 16, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 17, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 18, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 19, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 20, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 21, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 22, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 23, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 24, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 25, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 26, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 27, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 28, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 29, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 30, z, 3f\n"
+                     "  mul_step %[temp1], %[temp0], %[temp1], 31, z, 3f\n"
+                     "3:\n"
+                     "  move %[dest], %[temp1.lo]\n"
+                     : [dest] "=&r"(dest), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1)
+                     : [a]"r"(a), [b]"r"(b)
+                     : );
+    return dest;
+}
diff --git a/compiler-rt/lib/builtins/dpu/muldi3.c b/compiler-rt/lib/builtins/dpu/muldi3.c
new file mode 100644
index 0000000000000..2d5a28b1dc260
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/muldi3.c
@@ -0,0 +1,171 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 multiplication emulation.
+ *
+ * A relatively fast emulation of 64x64 multiplication using byte multipliers.
+ * Basically, the two operands X and Y are seen as byte polynomials:
+ *  - X = X0.2^0 + X1.2^8 + X2.2^16 + X3.2^24 + X4.2^32 + X5.2^40 + X6.2^48 + X7.2^56
+ *  - Y = Y0.2^0 + Y1.2^8 + Y2.2^16 + Y3.2^24 + Y4.2^32 + Y5.2^40 + Y6.2^48 + Y7.2^56
+ *
+ * The product Z is expressed as a similar polynomial. Since the result is 64 bits,
+ * the function drops any coefficient for a power greater than 56, hence the following
+ * formula:
+ *  Z = (X0.Y0).2^0
+ *      + (X0.Y1 + X1.Y0).2^8
+ *      + (X0.Y2 + X2.Y0 + X1.Y1).2^16
+ *      + (X0.Y3 + X1.Y2 + X2.Y1 + X3.Y0).2^24
+ *      + (X0.Y4 + X1.Y3 + X2.Y2 + X3.Y1 + X4.Y0).2^32
+ *      etc.
+ *
+ * Each individual product is computed with the native built-in 8x8 instructions.
+ * Resulting processing time is in the magnitude of 150 instructions.
+ *
+ * The two operands are found in __D0 and the first kernel nano-stack entry.
+ * The result goes into __R0 (lsbits) and __R1 (msbits).
+ * Also, __R2 contains the return address register, instead of __RET__.
+ */
+#include <stdint.h>
+
+/* Byte 0 of a times byte 0 of b: plain C on the host, the mul_ul_ul instruction on DPU builds. */
+static inline __attribute__((always_inline)) uint16_t _mul00(uint32_t a, uint32_t b)
+{
+#ifdef DPU
+    uint32_t product;
+    __asm__ volatile("mul_ul_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(product) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return product;
+#else
+    return (uint16_t)((a & 0xffu) * (b & 0xffu));
+#endif
+}
+
+/* Byte 0 of a times byte 1 (bits 15..8) of b: plain C on the host, mul_ul_uh on DPU builds. */
+static inline __attribute__((always_inline)) uint16_t _mul01(uint32_t a, uint32_t b)
+{
+#ifdef DPU
+    uint32_t product;
+    __asm__ volatile("mul_ul_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(product) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return product;
+#else
+    return (uint16_t)((a & 0xffu) * ((b >> 8) & 0xffu));
+#endif
+}
+
+#define _mul02(a, b) _mul00((a), ((b) >> 16)) // a byte 0 x b byte 2; parameters parenthesized so any expression expands safely
+#define _mul03(a, b) _mul01((a), ((b) >> 16)) // a byte 0 x b byte 3
+
+/* Byte 1 (bits 15..8) of a times byte 1 of b: plain C on the host, mul_uh_uh on DPU builds. */
+static inline __attribute__((always_inline)) uint16_t _mul11(uint32_t a, uint32_t b)
+{
+#ifdef DPU
+    uint32_t product;
+    __asm__ volatile("mul_uh_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(product) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return product;
+#else
+    return (uint16_t)(((a >> 8) & 0xffu) * ((b >> 8) & 0xffu));
+#endif
+}
+
+/* Byte 1 (bits 15..8) of a times byte 2 (bits 23..16) of b: plain C on the host, mul_uh_ul on DPU builds. */
+static inline __attribute__((always_inline)) uint16_t _mul12(uint32_t a, uint32_t b)
+{
+#ifdef DPU
+    uint32_t b_upper = (b >> 16), product; // shift first so that byte 2 of b sits in bits 7..0
+    __asm__ volatile("mul_uh_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(product) : [ra_r32] "r"(a), [rb_wr32] "r"(b_upper) :);
+    return product;
+#else
+    return (uint16_t)(((a >> 8) & 0xffu) * ((b >> 16) & 0xffu));
+#endif
+}
+
+#define _mul13(a, b) _mul11((a), ((b) >> 16)) // a byte 1 x b byte 3; parameters parenthesized so any expression expands safely
+#define _mul22(a, b) _mul00(((a) >> 16), ((b) >> 16)) // a byte 2 x b byte 2
+#define _mul23(a, b) _mul01(((a) >> 16), ((b) >> 16)) // a byte 2 x b byte 3
+#define _mul33(a, b) _mul11(((a) >> 16), ((b) >> 16)) // a byte 3 x b byte 3
+
+#define mulx0y0(xl, yl) _mul00(xl, yl) // mulxIyJ(...) = byte I of X times byte J of Y
+#define mulx0y1(xl, yl) _mul01(xl, yl)
+#define mulx0y2(xl, yl) _mul02(xl, yl)
+#define mulx0y3(xl, yl) _mul03(xl, yl)
+#define mulx0y4(xl, yh) _mul00(xl, yh) // Y4..Y7 are bytes 0..3 of the upper word yh
+#define mulx0y5(xl, yh) _mul01(xl, yh)
+#define mulx0y6(xl, yh) _mul02(xl, yh)
+#define mulx0y7(xl, yh) _mul03(xl, yh)
+
+#define mulx1y1(xl, yl) _mul11(xl, yl)
+#define mulx1y2(xl, yl) _mul12(xl, yl)
+#define mulx1y3(xl, yl) _mul13(xl, yl)
+#define mulx1y4(xl, yh) _mul01(yh, xl) // operands swapped: _mul01 takes byte 0 of yh (Y4) and byte 1 of xl (X1)
+#define mulx1y5(xl, yh) _mul11(xl, yh)
+#define mulx1y6(xl, yh) _mul12(xl, yh)
+
+#define mulx2y2(xl, yl) _mul22(xl, yl)
+#define mulx2y3(xl, yl) _mul23(xl, yl)
+#define mulx2y4(xl, yh) _mul02(yh, xl) // operands swapped: byte 0 of yh (Y4) times byte 2 of xl (X2)
+#define mulx2y5(xl, yh) _mul12(yh, xl) // operands swapped: byte 1 of yh (Y5) times byte 2 of xl (X2)
+
+#define mulx3y3(xl, yl) _mul33(xl, yl)
+#define mulx3y4(xl, yh) _mul03(yh, xl) // operands swapped: byte 0 of yh (Y4) times byte 3 of xl (X3)
+
+// Symmetry: every product with I > J reuses the J/I macro with its two operands exchanged.
+#define mulx1y0(xl, yl) mulx0y1(yl, xl)
+#define mulx2y0(xl, yl) mulx0y2(yl, xl)
+#define mulx2y1(xl, yl) mulx1y2(yl, xl)
+#define mulx3y0(xl, yl) mulx0y3(yl, xl)
+#define mulx3y1(xl, yl) mulx1y3(yl, xl)
+#define mulx3y2(xl, yl) mulx2y3(yl, xl)
+#define mulx4y0(xh, yl) mulx0y4(yl, xh) // X4..X7 are bytes 0..3 of the upper word xh
+#define mulx4y1(xh, yl) mulx1y4(yl, xh)
+#define mulx4y2(xh, yl) mulx2y4(yl, xh)
+#define mulx4y3(xh, yl) mulx3y4(yl, xh)
+#define mulx5y0(xh, yl) mulx0y5(yl, xh)
+#define mulx5y1(xh, yl) mulx1y5(yl, xh)
+#define mulx5y2(xh, yl) mulx2y5(yl, xh)
+#define mulx6y0(xh, yl) mulx0y6(yl, xh)
+#define mulx6y1(xh, yl) mulx1y6(yl, xh)
+#define mulx7y0(xh, yl) mulx0y7(yl, xh)
+
+/* 64x64 -> 64 multiplication libcall, assembled from 8x8 byte products. */
+uint64_t __muldi3(uint64_t x, uint64_t y)
+{
+    const uint32_t xl = (uint32_t)x, xh = (uint32_t)(x >> 32);
+    const uint32_t yl = (uint32_t)y, yh = (uint32_t)(y >> 32);
+
+    // coef: sum of the byte products that share one power of 2^8.
+    uint32_t coef;
+    uint64_t low;  // coefficients 0..3, kept on 64 bits so that carries reach the upper word
+    uint32_t high; // coefficients 4..7, relative to bit 32; bits shifted past 32 bits here are dropped
+
+    coef = mulx0y0(xl, yl);
+    low = (uint64_t)coef;
+
+    coef = mulx0y1(xl, yl) + mulx1y0(xl, yl);
+    low += ((uint64_t)coef << 8);
+
+    coef = mulx0y2(xl, yl) + mulx2y0(xl, yl) + mulx1y1(xl, yl);
+    low += ((uint64_t)coef << 16);
+
+    coef = mulx0y3(xl, yl) + mulx3y0(xl, yl) + mulx1y2(xl, yl) + mulx2y1(xl, yl);
+    low += ((uint64_t)coef << 24);
+
+    coef = mulx0y4(xl, yh) + mulx4y0(xh, yl) + mulx1y3(xl, yl) + mulx3y1(xl, yl) + mulx2y2(xl, yl);
+    high = coef;
+
+    coef = (mulx0y5(xl, yh) + mulx5y0(xh, yl) + mulx1y4(xl, yh) + mulx4y1(xh, yl)
+            + mulx2y3(xl, yl) + mulx3y2(xl, yl));
+    high += coef << 8;
+
+    coef = (mulx0y6(xl, yh) + mulx6y0(xh, yl) + mulx1y5(xl, yh) + mulx5y1(xh, yl)
+            + mulx2y4(xl, yh) + mulx4y2(xh, yl) + mulx3y3(xl, yl));
+    high += coef << 16;
+
+    coef = (mulx0y7(xl, yh) + mulx7y0(xh, yl) + mulx1y6(xl, yh) + mulx6y1(xh, yl)
+            + mulx2y5(xl, yh) + mulx5y2(xh, yl) + mulx3y4(xl, yh) + mulx4y3(xh, yl));
+    high += coef << 24;
+
+    // The upper accumulator is moved to bits 63..32 and added to the lower one.
+    return low + ((uint64_t)high << 32);
+}
diff --git a/compiler-rt/lib/builtins/dpu/mulsi3.c b/compiler-rt/lib/builtins/dpu/mulsi3.c
new file mode 100644
index 0000000000000..f41210acd79cd
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mulsi3.c
@@ -0,0 +1,8 @@
+#include <stdint.h>
+
+extern int32_t __mul32(int32_t a, int32_t b);
+
+int32_t __mulsi3(int32_t a, int32_t b) {
+  const int32_t product = __mul32(a, b); // libcall entry point; the work is done by __mul32
+  return product;
+}
diff --git a/compiler-rt/lib/builtins/dpu/udiv32.S b/compiler-rt/lib/builtins/dpu/udiv32.S
new file mode 100644
index 0000000000000..8298d37dd8a0e
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv32.S
@@ -0,0 +1,49 @@
+        .text
+        .globl  __udiv32
+        .type   __udiv32,@function
+__udiv32:	// in: r0 = dividend, r1 = divider; the result is accumulated in d0
+	clz r2, r1, max, 1f // r2 = by how many the divider can be shifted on 32-bit; the max branch leads to the fault at label 1
+	clz r3, r0         // r3 = number of useless (leading zero) bits of the dividend
+	sub r2, r3, r2, gtu, 2f// r2 = the maximal shift to be done; the gtu branch leads to label 2
+	move r3, r1	// r3 = divider, the operand used by every div_step below
+	move.u d0, r0	// d0 = dividend (presumably zero-extended to 64 bits - TODO confirm)
+	jump r2, 3f                 // As we will jump backward relatively to label 3 forward
+	div_step d0, r3, d0, 31	// unrolled steps for shifts 31 down to 1; presumably entered part-way by the jump above
+	div_step d0, r3, d0, 30
+	div_step d0, r3, d0, 29
+	div_step d0, r3, d0, 28
+	div_step d0, r3, d0, 27
+	div_step d0, r3, d0, 26
+	div_step d0, r3, d0, 25
+	div_step d0, r3, d0, 24
+	div_step d0, r3, d0, 23
+	div_step d0, r3, d0, 22
+	div_step d0, r3, d0, 21
+	div_step d0, r3, d0, 20
+	div_step d0, r3, d0, 19
+	div_step d0, r3, d0, 18
+	div_step d0, r3, d0, 17
+	div_step d0, r3, d0, 16
+	div_step d0, r3, d0, 15
+	div_step d0, r3, d0, 14
+	div_step d0, r3, d0, 13
+	div_step d0, r3, d0, 12
+	div_step d0, r3, d0, 11
+	div_step d0, r3, d0, 10
+	div_step d0, r3, d0, 9
+	div_step d0, r3, d0, 8
+	div_step d0, r3, d0, 7
+	div_step d0, r3, d0, 6
+	div_step d0, r3, d0, 5
+	div_step d0, r3, d0, 4
+	div_step d0, r3, d0, 3
+	div_step d0, r3, d0, 2
+	div_step d0, r3, d0, 1
+3:
+	div_step d0, r3, d0, 0	// the final step (shift 0) runs on every path that reaches label 3
+4:	// common exit
+	jump r23	// presumably the return: r23 is assumed to hold the return address - TODO confirm
+2:
+	move.u d0, r0, true, 4b	// target of the gtu branch of the sub above: d0 = dividend, then exit through label 4
+1:
+	fault 2	// target of the max branch of the first clz
diff --git a/compiler-rt/lib/builtins/dpu/udiv32.c b/compiler-rt/lib/builtins/dpu/udiv32.c
new file mode 100644
index 0000000000000..22f617e14fd71
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv32.c
@@ -0,0 +1,63 @@
+#include <stdint.h>
+
+uint64_t
+__udiv32(uint32_t dividend, uint32_t divider)
+{
+    uint64_t dest; // 64-bit result: callers read the quotient from bits 63..32 and the remainder from bits 31..0
+
+    uint32_t temp0; // scratch: first clz result for the divider, then the shift used as jump offset
+    uint32_t temp1; // scratch: first clz result for the dividend, then a copy of the divider
+
+    /* clang-format off */
+    __asm__ volatile("  clz %[temp0], %[divider], max, 1f\n" // %[temp0] = by how many the divider can be shifted on 32-bit
+                     "  clz %[temp1], %[dividend]\n" // %[temp1] = number of useless bits of the dividend
+                     "  sub %[temp0], %[temp1], %[temp0], gtu, 2f\n" // %[temp0] = the maximal shift to be done
+                     "  move %[temp1], %[divider]\n" // the divider is the operand used by every div_step below
+                     "  move.u %[dest], %[dividend]\n"
+                     "  jump %[temp0], 3f\n" // As we will jump backward relatively to label 3 forward
+                     "  div_step %[dest], %[temp1], %[dest], 31\n"
+                     "  div_step %[dest], %[temp1], %[dest], 30\n"
+                     "  div_step %[dest], %[temp1], %[dest], 29\n"
+                     "  div_step %[dest], %[temp1], %[dest], 28\n"
+                     "  div_step %[dest], %[temp1], %[dest], 27\n"
+                     "  div_step %[dest], %[temp1], %[dest], 26\n"
+                     "  div_step %[dest], %[temp1], %[dest], 25\n"
+                     "  div_step %[dest], %[temp1], %[dest], 24\n"
+                     "  div_step %[dest], %[temp1], %[dest], 23\n"
+                     "  div_step %[dest], %[temp1], %[dest], 22\n"
+                     "  div_step %[dest], %[temp1], %[dest], 21\n"
+                     "  div_step %[dest], %[temp1], %[dest], 20\n"
+                     "  div_step %[dest], %[temp1], %[dest], 19\n"
+                     "  div_step %[dest], %[temp1], %[dest], 18\n"
+                     "  div_step %[dest], %[temp1], %[dest], 17\n"
+                     "  div_step %[dest], %[temp1], %[dest], 16\n"
+                     "  div_step %[dest], %[temp1], %[dest], 15\n"
+                     "  div_step %[dest], %[temp1], %[dest], 14\n"
+                     "  div_step %[dest], %[temp1], %[dest], 13\n"
+                     "  div_step %[dest], %[temp1], %[dest], 12\n"
+                     "  div_step %[dest], %[temp1], %[dest], 11\n"
+                     "  div_step %[dest], %[temp1], %[dest], 10\n"
+                     "  div_step %[dest], %[temp1], %[dest], 9\n"
+                     "  div_step %[dest], %[temp1], %[dest], 8\n"
+                     "  div_step %[dest], %[temp1], %[dest], 7\n"
+                     "  div_step %[dest], %[temp1], %[dest], 6\n"
+                     "  div_step %[dest], %[temp1], %[dest], 5\n"
+                     "  div_step %[dest], %[temp1], %[dest], 4\n"
+                     "  div_step %[dest], %[temp1], %[dest], 3\n"
+                     "  div_step %[dest], %[temp1], %[dest], 2\n"
+                     "  div_step %[dest], %[temp1], %[dest], 1\n"
+                     "3:\n"
+                     "  div_step %[dest], %[temp1], %[dest], 0\n" // the final step (shift 0) runs on every path that reaches label 3
+                     "4:\n"
+                     "  jump 5f\n" // normal exit: skip the two special cases below
+                     "2:\n"
+                     "  move.u %[dest], %[dividend], true, 4b\n" // target of the gtu branch of the sub: dest = dividend, then exit through label 4
+                     "1:\n"
+                     "  fault 2\n" // target of the max branch of the first clz
+                     "5:\n"
+                     : [dest] "=r"(dest), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1)
+                     : [dividend] "r"(dividend), [divider] "r"(divider));
+    /* clang-format on */
+
+    return dest;
+}
diff --git a/compiler-rt/lib/builtins/dpu/udiv64.c b/compiler-rt/lib/builtins/dpu/udiv64.c
new file mode 100644
index 0000000000000..e55b3ffe9904c
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv64.c
@@ -0,0 +1,59 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 unsigned division (quotient or remainder).
+ */
+#include <stdint.h>
+
+/* Leading-zero count of a non-zero 64-bit value (the builtin's result is undefined for 0). */
+static unsigned int __clz__(uint64_t x)
+{
+    return (unsigned int)__builtin_clzll(x); // clzll operates on unsigned long long, so no bits are lost where long is 32-bit
+}
+
+/* Unsigned 64-bit division: returns the remainder if ask_remainder is non-zero, the quotient otherwise. */
+uint64_t __udiv64(uint64_t dividend, uint64_t divider, int ask_remainder)
+{
+    uint64_t dxo = dividend, dxe = 0; // dxo: running remainder, dxe: quotient being built
+
+    if (divider == 0) {
+      // Division by zero: raise fault 2; execution is not expected to continue past it.
+      __asm__ volatile("fault 2");
+      __builtin_unreachable();
+    }
+    if (divider > dividend) {
+        // The quotient is 0 and the dividend is the remainder. This also covers dividend == 0,
+        // so neither __clz__ call below ever receives 0.
+        return (ask_remainder != 0) ? dividend : 0;
+    }
+
+    // Mimic the div_step instruction, whose functionality is:
+    //   if (Dxo >= (Ra << #u5)) {
+    //     Dxo = Dxo - (Ra << #u5);
+    //     Dxe = (Dxe << 1) | 1;
+    //   } else {
+    //     Dxe = Dxe << 1;
+    //   }
+    int dividerl0 = __clz__(divider), dividendl0 = __clz__(dividend);
+
+    // divider <= dividend here, so the divider has at least as many leading zeros: i >= 0.
+    int i = dividerl0 - dividendl0;
+
+    for (; i >= 0; i--) {
+        uint64_t pivot = divider << i;
+        if (dxo >= pivot) {
+            dxo = dxo - pivot;
+            dxe = (dxe << 1) | 1;
+        } else {
+            dxe = dxe << 1;
+        }
+    }
+
+    // Same convention as the early return above: any non-zero ask_remainder selects the remainder.
+    if (ask_remainder != 0)
+        return dxo;
+    return dxe;
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivdi3.c b/compiler-rt/lib/builtins/dpu/udivdi3.c
new file mode 100644
index 0000000000000..1b60b934b85f4
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivdi3.c
@@ -0,0 +1,19 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 unsigned division.
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+uint64_t __udivdi3(uint64_t dividend, uint64_t divider)
+{
+    const int ask_remainder = 0; // 0 requests the quotient from __udiv64
+    return __udiv64(dividend, divider, ask_remainder);
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivmodsi4.c b/compiler-rt/lib/builtins/dpu/udivmodsi4.c
new file mode 100644
index 0000000000000..3a3f3902b6f61
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivmodsi4.c
@@ -0,0 +1,29 @@
+/*===-- udivmodsi4.c - Implement __udivmodsi4 ------------------------------===
+ *
+ *                    The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __udivmodsi4 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+#include "int_lib.h"
+
+/* Returns: a / b, *rem = a % b  */
+
+COMPILER_RT_ABI su_int __udivmodsi4(su_int a, su_int b, su_int *rem)
+{
+    const uint64_t packed = __udiv32(a, b); // read below as quotient in bits 63..32, remainder in bits 31..0
+    const su_int quotient = (su_int)(packed >> 32);
+    *rem = (su_int)packed;
+    return quotient;
+}
diff --git a/compiler-rt/lib/builtins/dpu/udivsi3.c b/compiler-rt/lib/builtins/dpu/udivsi3.c
new file mode 100644
index 0000000000000..dcc1d9fcf672f
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udivsi3.c
@@ -0,0 +1,15 @@
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+#include "../int_lib.h"
+
+typedef su_int fixuint_t;
+typedef si_int fixint_t;
+
+// Returns: a / b
+
+COMPILER_RT_ABI su_int __udivsi3(su_int a, su_int b) {
+  // The quotient is taken from the upper 32 bits of the 64-bit __udiv32 result.
+  return (su_int)(__udiv32(a, b) >> 32);
+}
diff --git a/compiler-rt/lib/builtins/dpu/umoddi3.c b/compiler-rt/lib/builtins/dpu/umoddi3.c
new file mode 100644
index 0000000000000..4b3a82b01eb98
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/umoddi3.c
@@ -0,0 +1,19 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 unsigned remainder.
+ *
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+uint64_t __umoddi3(uint64_t dividend, uint64_t divider)
+{
+    const int ask_remainder = 1; // 1 requests the remainder from __udiv64
+    return __udiv64(dividend, divider, ask_remainder);
+}
diff --git a/compiler-rt/lib/builtins/dpu/umodsi3.c b/compiler-rt/lib/builtins/dpu/umodsi3.c
new file mode 100644
index 0000000000000..c85cd8a4d9aed
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/umodsi3.c
@@ -0,0 +1,27 @@
+/* ===-- umodsi3.c - Implement __umodsi3 -----------------------------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __umodsi3 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include "int_lib.h"
+
+/* Returns: a % b */
+
+/* The 64-bit result of __udiv32 is declared as unsigned long long so it does not depend on the width of long. */
+extern unsigned long long __udiv32(unsigned int, unsigned int);
+
+/* The remainder is the low 32 bits of the __udiv32 result. */
+COMPILER_RT_ABI su_int __umodsi3(su_int a, su_int b)
+{
+    unsigned long long res = __udiv32(a, b);
+    return (unsigned int)res;
+}
diff --git a/compiler-rt/test/builtins/Unit/comparedf2_test.c b/compiler-rt/test/builtins/Unit/comparedf2_test.c
index 27666e2ad689b..d606ae7eff6ca 100644
--- a/compiler-rt/test/builtins/Unit/comparedf2_test.c
+++ b/compiler-rt/test/builtins/Unit/comparedf2_test.c
@@ -458,7 +458,7 @@ static const struct TestVector vectors[] = {
     {__builtin_inf(),__builtin_inf(),0,0,0,0,0,0,0},
 };    
 
-int main(int argc, char *argv[]) {
+int main() {
     const int numVectors = sizeof vectors / sizeof vectors[0];
     int i;
     for (i = 0; i<numVectors; ++i) {
diff --git a/compiler-rt/test/builtins/Unit/comparesf2_test.c b/compiler-rt/test/builtins/Unit/comparesf2_test.c
index b6a52b74633aa..f129bece62364 100644
--- a/compiler-rt/test/builtins/Unit/comparesf2_test.c
+++ b/compiler-rt/test/builtins/Unit/comparesf2_test.c
@@ -458,7 +458,7 @@ static const struct TestVector vectors[] = {
     {__builtin_inff(),__builtin_inff(),0,0,0,0,0,0,0},
 };    
 
-int main(int argc, char *argv[]) {
+int main() {
     const int numVectors = sizeof vectors / sizeof vectors[0];
     int i;
     for (i = 0; i<numVectors; ++i) {
diff --git a/compiler-rt/test/builtins/Unit/test.c b/compiler-rt/test/builtins/Unit/test.c
new file mode 100644
index 0000000000000..bad88690c884f
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/test.c
@@ -0,0 +1,13 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int main()
+{
+  /* Smoke test: one line on each standard stream, then ten draws from a fixed seed. */
+  fputs("hello err\n", stderr);
+  fputs("hello out\n", stdout);
+  srand(42);
+  for (int draw = 0; draw != 10; ++draw)
+    printf("%d %d\n", draw, rand());
+  return 0;
+}
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 378df1b75e25d..15ed3e94bff5b 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -1103,23 +1103,51 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
 bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
                                      AllSuccsCache &AllSuccessors) {
   // Don't sink instructions that the target prefers not to sink.
-  if (!TII->shouldSink(MI))
+  if (!TII->shouldSink(MI)) {
+    // LLVM_DEBUG({
+    // 	dbgs() << "shouldSink false "; MI.dump();
+    //   });
     return false;
-
+  }
+  
   // Check if it's safe to move the instruction.
-  if (!MI.isSafeToMove(AA, SawStore))
+  if (!MI.isSafeToMove(AA, SawStore)) {
+    // LLVM_DEBUG({
+    // 	dbgs() << "not safe "; MI.dump();
+    // 	dbgs() << "mayStore(): " << MI.mayStore() << "\n";
+    // 	dbgs() << "mayLoad(): " << MI.mayLoad() << "\n";
+    // 	dbgs() << "isCall(): " << MI.isCall() << "\n";
+    // 	dbgs() << "isPHI(): " << MI.isPHI() << "\n";
+    // 	dbgs() << "hasOrderedMemoryRef(): " << MI.hasOrderedMemoryRef() << "\n";
+    // 	dbgs() << "isPosition(): " << MI.isPosition() << "\n";
+    // 	dbgs() << "isDebugInstr(): " << MI.isDebugInstr() << "\n";
+    // 	dbgs() << "isTerminator(): " << MI.isTerminator() << "\n";
+    // 	dbgs() << "mayRaiseFPException(): " << MI.mayRaiseFPException() << "\n";
+    // 	dbgs() << "hasUnmodeledSideEffects(): " << MI.hasUnmodeledSideEffects() << "\n";
+    // 	dbgs() << "isDereferenceableInvariantLoad(AA): " << MI.isDereferenceableInvariantLoad(AA) << "\n";
+    // 	dbgs() << "SawStore: " << SawStore << "\n";
+    //   });
     return false;
-
+  }
+  
   // Convergent operations may not be made control-dependent on additional
   // values.
-  if (MI.isConvergent())
+  if (MI.isConvergent()) {
+    // LLVM_DEBUG({
+    // 	dbgs() << "isconvergent "; MI.dump();
+    //   });
     return false;
-
+  }
+  
   // Don't break implicit null checks.  This is a performance heuristic, and not
   // required for correctness.
-  if (SinkingPreventsImplicitNullCheck(MI, TII, TRI))
+  if (SinkingPreventsImplicitNullCheck(MI, TII, TRI)) {
+    LLVM_DEBUG({
+	dbgs() << "nullcheck "; MI.dump();
+      });
     return false;
-
+  }
+  
   // FIXME: This should include support for sinking instructions within the
   // block they are currently in to shorten the live ranges.  We often get
   // instructions sunk into the top of a large block, but it would be better to
@@ -1134,9 +1162,12 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
       FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge, AllSuccessors);
 
   // If there are no outputs, it must have side-effects.
-  if (!SuccToSinkTo)
+  if (!SuccToSinkTo) {
+    // LLVM_DEBUG({
+    // 	dbgs() << "no succ "; MI.dump();
+    //   });
     return false;
-
+  }
   // If the instruction to move defines a dead physical register which is live
   // when leaving the basic block, don't move it because it could turn into a
   // "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>)
@@ -1146,8 +1177,12 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
     Register Reg = MO.getReg();
     if (Reg == 0 || !Register::isPhysicalRegister(Reg))
       continue;
-    if (SuccToSinkTo->isLiveIn(Reg))
+    if (SuccToSinkTo->isLiveIn(Reg)) {
+      // LLVM_DEBUG({
+      // 	  dbgs() << "zombie "; MI.dump();
+      // 	});
       return false;
+    }
   }
 
   LLVM_DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccToSinkTo);
diff --git a/llvm/lib/Target/DPU/CMakeLists.txt b/llvm/lib/Target/DPU/CMakeLists.txt
index 7a887b71ee3aa..9e216ef08cb39 100644
--- a/llvm/lib/Target/DPU/CMakeLists.txt
+++ b/llvm/lib/Target/DPU/CMakeLists.txt
@@ -28,7 +28,7 @@ add_llvm_target(DPUCodeGen
         DPUResolveMacroInstrPass.cpp
         DPUMacroFusion.cpp
         DPUSelectionDAGInfo.cpp
-
+	DPUPostRAFusion.cpp
         DEPENDS
         intrinsics_gen
 
diff --git a/llvm/lib/Target/DPU/DPU.h b/llvm/lib/Target/DPU/DPU.h
index 2ef567d9bc868..7f84823cb9ae0 100644
--- a/llvm/lib/Target/DPU/DPU.h
+++ b/llvm/lib/Target/DPU/DPU.h
@@ -19,6 +19,7 @@ namespace llvm {
 class FunctionPass;
 class DPUTargetMachine;
 
+FunctionPass *createDPUPostRAFusionPass(DPUTargetMachine &tm);
 FunctionPass *createDPUMergeComboInstrPass(DPUTargetMachine &tm);
 FunctionPass *createDPUResolveMacroInstrPass(DPUTargetMachine &tm);
 
diff --git a/llvm/lib/Target/DPU/DPU.td b/llvm/lib/Target/DPU/DPU.td
index 65f22ee7312f9..e262860b24780 100644
--- a/llvm/lib/Target/DPU/DPU.td
+++ b/llvm/lib/Target/DPU/DPU.td
@@ -71,4 +71,5 @@ def DPU : Target {
   let AssemblyParsers = [DPUAsmParser];
   let AssemblyParserVariants = [DPUAsmParserVariant];
   let AssemblyWriters = [DPUInstPrinter];
+  let AllowRegisterRenaming = 1;
 }
diff --git a/llvm/lib/Target/DPU/DPUFrameLowering.cpp b/llvm/lib/Target/DPU/DPUFrameLowering.cpp
index 8bf3c6c06650b..026354d10e304 100644
--- a/llvm/lib/Target/DPU/DPUFrameLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUFrameLowering.cpp
@@ -85,7 +85,8 @@ void DPUFrameLowering::emitPrologue(MachineFunction &MF,
         .addCFIIndex(CFIIndex)
         .setMIFlag(MachineInstr::FrameSetup);
 
-    BuildMI(MBB, MBBI, DL, DPUII.get(DPU::SDrir), DPU::R22)
+    BuildMI(MBB, MBBI, DL, DPUII.get(DPU::SDrir))
+        .addReg(DPU::R22)
         .addImm(StackSize - STACK_SIZE_FOR_D22)
         .addReg(DPU::D22);
     BuildMI(MBB, MBBI, DL, DPUII.get(DPU::ADDrri), DPU::R22)
diff --git a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
index c501d43ed7a89..539056aeb055b 100644
--- a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp
@@ -100,25 +100,52 @@ class DPUDAGToDAGISel : public SelectionDAGISel {
 StringRef DPUDAGToDAGISel::getPassName() const { return "DPUDAGToDAGISel"; }
 
 bool DPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+  
   bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
 
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+
   processFunctionAfterISel(MF);
 
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+
   return Ret;
 }
 
 void DPUDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+
   MachineRegisterInfo *MRI = &MF.getRegInfo();
 
   auto &SubTarget = static_cast<const DPUSubtarget &>(MF.getSubtarget());
   auto InstrInfo = SubTarget.getInstrInfo();
   auto RegInfo = SubTarget.getRegisterInfo();
 
-  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
-       ++MFI)
+  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE; ++MFI) {
+    LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+
     for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
-      replaceUsesWithConstantReg(MRI, InstrInfo, RegInfo, *I);
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	});
+
+      bool res = replaceUsesWithConstantReg(MRI, InstrInfo, RegInfo, *I);
+      if (res) { // trace only under LLVM_DEBUG: a bare dbgs() would print for every compilation
+	LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " YES did something.\n");
+      }
     }
+  }
 }
 
 static inline bool canCommuteOperation(MachineInstr *MI, unsigned opNo,
@@ -149,6 +176,10 @@ bool DPUDAGToDAGISel::replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
                                                  const DPUInstrInfo *DII,
                                                  const TargetRegisterInfo *TRI,
                                                  const MachineInstr &MI) {
+  // This function seems to do manual coalescing
+  //    probably we should use the proper one that probably knows better
+  //    maybe prob with MI operand constraint ... ?
+  //    probably better to educate the coalescer, or better define register class
   unsigned DstReg = 0, CstReg = 0;
 
   if (MI.getOpcode() == DPU::COPY) {
@@ -220,6 +251,8 @@ bool DPUDAGToDAGISel::replaceUsesWithConstantReg(MachineRegisterInfo *MRI,
              UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(OtherReg))) {
           UMI->getOperand(newOpNo).setReg(CstReg);
           UMI->getOperand(OpNo).setReg(OtherReg);
+
+	  LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " YES did something.\n"); // trace only under LLVM_DEBUG
         }
       }
 
@@ -387,10 +420,70 @@ void DPUDAGToDAGISel::Select(SDNode *Node) {
     return;
   }
 
+  EVT VT = Node->getValueType(0);
+  SDLoc DL(Node);
+
+  MachineFunction &MF = CurDAG->getMachineFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  
   switch (Opcode) {
+  case ISD::Constant: {
+    LLVM_DEBUG({dbgs() << "a constant: "; Node->dump();});
+    if (VT == MVT::i32) {
+      // Materialize some constants as copies from constant register.
+      // This allows the coalescer to propagate these into other instructions.
+      ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+      if (ConstNode->isNullValue()) {
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::ZERO, MVT::i32);
+	ReplaceNode(Node, New.getNode());
+	return;
+      } else if (ConstNode->isOne()) {
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::ONE, MVT::i32);
+	ReplaceNode(Node, New.getNode());
+	return;
+      } else if (ConstNode->isAllOnesValue()) {
+	SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::LNEG, MVT::i32);
+	ReplaceNode(Node, New.getNode());
+	return;
+      } else {
+	const ConstantInt *Cst = ConstNode->getConstantIntValue();
+	if (Cst->isMinValue(/* signed = */ true)) {
+	  SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::MNEG, MVT::i32);
+	  ReplaceNode(Node, New.getNode());
+	  return;
+	}
+      }
+    } else if (VT == MVT::i64) {
+      ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+      if (ConstNode->isNullValue()) {
+	// // Create a new virtual register of type i64
+	// SDValue ImpDef = SDValue(CurDAG->getMachineNode(DPU::IMPLICIT_DEF, DL, MVT::i64), 0);
+	// // Insert the low part into the virtual register
+	// SDValue InsertLo = CurDAG->getTargetInsertSubreg(DPU::sub_32bit, DL, MVT::i64, 
+	// 						 ImpDef,
+	// 						 CurDAG->getRegister(DPU::ZERO, MVT::i32));
+	// // Insert the high part into the virtual register
+	// SDValue InsertHi = CurDAG->getTargetInsertSubreg(DPU::sub_32bit_hi, DL, MVT::i64, 
+	// 						 InsertLo,
+	// 						 CurDAG->getRegister(DPU::ZERO, MVT::i32));
+	// // Replace the old node with the new virtual register value
+	// ReplaceNode(Node, InsertHi.getNode());
+
+	SDValue truc = SDValue(CurDAG->getMachineNode(DPU::MOVE_Srr, DL, MVT::i64,
+						      CurDAG->getRegister(DPU::ZERO, MVT::i32)), 0);
+	ReplaceNode(Node, truc.getNode());
+	return;
+      } else if (ConstNode->isOne()) {
+	SDValue truc = SDValue(CurDAG->getMachineNode(DPU::MOVE_Srr, DL, MVT::i64,
+						      CurDAG->getRegister(DPU::ONE, MVT::i32)), 0);
+	ReplaceNode(Node, truc.getNode());
+	return;
+      }
+    }
+    break;
+  }
   case ISD::FrameIndex: {
     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
-    EVT VT = Node->getValueType(0);
     SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
     unsigned Opc = DPU::ADDrri;
     if (Node->hasOneUse()) {
diff --git a/llvm/lib/Target/DPU/DPUISelLowering.h b/llvm/lib/Target/DPU/DPUISelLowering.h
index 87d963121a70b..91eadd89e9489 100644
--- a/llvm/lib/Target/DPU/DPUISelLowering.h
+++ b/llvm/lib/Target/DPU/DPUISelLowering.h
@@ -32,12 +32,12 @@ enum {
   SetCC,    // SET to a condition
   BrCC,     // Jump and branch with condition
   BrCCi,    // Jump and branch with condition
-  BrCCZero, // Jump and branch with condition and one operand equal to zero
-  OrJCCZero,
-  AndJCCZero,
-  XorJCCZero,
-  AddJCCZero,
-  SubJCCZero,
+  // BrCCZero, // Jump and branch with condition and one operand equal to zero
+  // OrJCCZero,
+  // AndJCCZero,
+  // XorJCCZero,
+  // AddJCCZero,
+  // SubJCCZero,
   Wrapper,    // Global addresses, externals...
   TRUNC64,    // Keep the LSBits register,
   LSL64_32,   // Shift 32 positions to the left
@@ -62,9 +62,9 @@ enum {
   MUL16_SU,
   MUL16_SS,
 
-  Addc,
-  Subc,
-  Rsubc,
+  // Addc,
+  // Subc,
+  // Rsubc,
 
   Clo,
   Cls,
@@ -77,80 +77,80 @@ enum {
 
   LslAdd,
 
-  AddJcc,
-  AddNullJcc,
-  AddcJcc,
-  AddcNullJcc,
-  AndJcc,
-  AndNullJcc,
-  OrJcc,
-  OrNullJcc,
-  XorJcc,
-  XorNullJcc,
-  NandJcc,
-  NandNullJcc,
-  NorJcc,
-  NorNullJcc,
-  NxorJcc,
-  NxorNullJcc,
-  AndnJcc,
-  AndnNullJcc,
-  OrnJcc,
-  OrnNullJcc,
-  LslJcc,
-  LslNullJcc,
-  LslxJcc,
-  LslxNullJcc,
-  Lsl1Jcc,
-  Lsl1NullJcc,
-  Lsl1xJcc,
-  Lsl1xNullJcc,
-  LsrJcc,
-  LsrNullJcc,
-  LsrxJcc,
-  LsrxNullJcc,
-  Lsr1Jcc,
-  Lsr1NullJcc,
-  Lsr1xJcc,
-  Lsr1xNullJcc,
-  AsrJcc,
-  AsrNullJcc,
-  RolJcc,
-  RolNullJcc,
-  RorJcc,
-  RorNullJcc,
-  MUL8_UUJcc,
-  MUL8_UUNullJcc,
-  MUL8_SUJcc,
-  MUL8_SUNullJcc,
-  MUL8_SSJcc,
-  MUL8_SSNullJcc,
-  SubJcc,
-  SubNullJcc,
-  RsubJcc,
-  RsubNullJcc,
-  SubcJcc,
-  SubcNullJcc,
-  RsubcJcc,
-  RsubcNullJcc,
-  CaoJcc,
-  CaoNullJcc,
-  ClzJcc,
-  ClzNullJcc,
-  CloJcc,
-  CloNullJcc,
-  ClsJcc,
-  ClsNullJcc,
-  MoveJcc,
-  MoveNullJcc,
-  RolAddJcc,
-  RolAddNullJcc,
-  LsrAddJcc,
-  LsrAddNullJcc,
-  LslAddJcc,
-  LslAddNullJcc,
-  LslSubJcc,
-  LslSubNullJcc,
+  // AddJcc,
+  // AddNullJcc,
+  // AddcJcc,
+  // AddcNullJcc,
+  // AndJcc,
+  // AndNullJcc,
+  // OrJcc,
+  // OrNullJcc,
+  // XorJcc,
+  // XorNullJcc,
+  // NandJcc,
+  // NandNullJcc,
+  // NorJcc,
+  // NorNullJcc,
+  // NxorJcc,
+  // NxorNullJcc,
+  // AndnJcc,
+  // AndnNullJcc,
+  // OrnJcc,
+  // OrnNullJcc,
+  // LslJcc,
+  // LslNullJcc,
+  // LslxJcc,
+  // LslxNullJcc,
+  // Lsl1Jcc,
+  // Lsl1NullJcc,
+  // Lsl1xJcc,
+  // Lsl1xNullJcc,
+  // LsrJcc,
+  // LsrNullJcc,
+  // LsrxJcc,
+  // LsrxNullJcc,
+  // Lsr1Jcc,
+  // Lsr1NullJcc,
+  // Lsr1xJcc,
+  // Lsr1xNullJcc,
+  // AsrJcc,
+  // AsrNullJcc,
+  // RolJcc,
+  // RolNullJcc,
+  // RorJcc,
+  // RorNullJcc,
+  // MUL8_UUJcc,
+  // MUL8_UUNullJcc,
+  // MUL8_SUJcc,
+  // MUL8_SUNullJcc,
+  // MUL8_SSJcc,
+  // MUL8_SSNullJcc,
+  // SubJcc,
+  // SubNullJcc,
+  // RsubJcc,
+  // RsubNullJcc,
+  // SubcJcc,
+  // SubcNullJcc,
+  // RsubcJcc,
+  // RsubcNullJcc,
+  // CaoJcc,
+  // CaoNullJcc,
+  // ClzJcc,
+  // ClzNullJcc,
+  // CloJcc,
+  // CloNullJcc,
+  // ClsJcc,
+  // ClsNullJcc,
+  // MoveJcc,
+  // MoveNullJcc,
+  // RolAddJcc,
+  // RolAddNullJcc,
+  // LsrAddJcc,
+  // LsrAddNullJcc,
+  // LslAddJcc,
+  // LslAddNullJcc,
+  // LslSubJcc,
+  // LslSubNullJcc,
 
   ADD_VASTART,
 
diff --git a/llvm/lib/Target/DPU/DPUInstrFormats.td b/llvm/lib/Target/DPU/DPUInstrFormats.td
index 66116ab29b153..a4e80392af3b6 100644
--- a/llvm/lib/Target/DPU/DPUInstrFormats.td
+++ b/llvm/lib/Target/DPU/DPUInstrFormats.td
@@ -97,6 +97,7 @@ def u5_imm  : UImmOperand< 5, i32>;
 def u8_imm  : UImmOperand< 8, i32>;
 
 def s8_i64_imm : SImmOperand<8, i64>;
+def s11_i64_imm : SImmOperand<11, i64>;
 def s16_i64_imm : SImmOperand<16, i64>;
 def s32_i64_imm : SImmOperand<32, i64>;
 def u32_i64_imm : UImmOperand<32, i64>;
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
index db957f97bcaa9..eb10d5bdbcf0e 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp
@@ -53,7 +53,9 @@ void DPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
   DebugLoc DL = (I != MBB.end()) ? I->getDebugLoc() : DebugLoc();
-  unsigned Opcode = (RC == &DPU::GP_REGRegClass) ? DPU::SWrir : DPU::SDrir;
+  unsigned Opcode = (RC == &DPU::GP_REGRegClass
+		     || RC == &DPU::GPZ_REGRegClass
+		     ) ? DPU::SWrir : DPU::SDrir;
 
   LLVM_DEBUG({
     dbgs() << "DPU/Instr - storeRegToStackSlot DestReg="
@@ -82,7 +84,9 @@ void DPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   DebugLoc DL;
   if (I != MBB.end())
     DL = I->getDebugLoc();
-  unsigned Opcode = (RC == &DPU::GP_REGRegClass) ? DPU::LWrri : DPU::LDrri;
+  unsigned Opcode = (RC == &DPU::GP_REGRegClass
+		     || RC == &DPU::GPZ_REGRegClass
+		     ) ? DPU::LWrri : DPU::LDrri;
   LLVM_DEBUG({
     dbgs() << "DPU/Instr - loadRegFromStackSlot DestReg="
            << std::to_string(DestReg) << " Opcode= " << std::to_string(Opcode)
@@ -94,26 +98,148 @@ void DPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   BuildMI(MBB, I, DL, get(Opcode), DestReg).addFrameIndex(FI).addImm(0);
 }
 
+// Expands a 64-bit register/register ALU pseudo into two 32-bit instructions
+// inserted in front of MI: LsbOpcode on the sub_32bit halves of operands
+// 0..2, then MsbOpcode on their sub_32bit_hi halves. MI is not erased here.
+void DPUInstrInfo::expand64BitRegisterAluInstruction(
+    MachineInstr &MI, MachineBasicBlock &MBB, unsigned int LsbOpcode,
+    unsigned int MsbOpcode) const {
+  const TargetRegisterInfo *TRI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+
+  // Returns the SubIdx half of the 64-bit register held by operand OpIdx.
+  auto halfOf = [&](unsigned OpIdx, unsigned SubIdx) -> unsigned int {
+    unsigned int WideReg = MI.getOperand(OpIdx).getReg();
+    return TRI->getSubReg(WideReg, SubIdx);
+  };
+
+  // Builds one 32-bit half of the operation right before MI.
+  auto emitHalf = [&](unsigned int Opcode, unsigned SubIdx) {
+    unsigned int Dst = halfOf(0, SubIdx);
+    unsigned int Lhs = halfOf(1, SubIdx);
+    unsigned int Rhs = halfOf(2, SubIdx);
+    return BuildMI(MBB, MI, MI.getDebugLoc(), get(Opcode), Dst)
+        .addReg(Lhs)
+        .addReg(Rhs);
+  };
+
+  // The low half is emitted first and the high half immediately after it.
+  MachineInstrBuilder LowHalf = emitHalf(LsbOpcode, DPU::sub_32bit);
+  MachineInstrBuilder HighHalf = emitHalf(MsbOpcode, DPU::sub_32bit_hi);
+
+  // Mirror the renamable/kill flags of each original operand onto the
+  // operand at the same index of both halves.
+  for (unsigned OpIdx = 0; OpIdx != 3; ++OpIdx) {
+    const MachineOperand &Wide = MI.getOperand(OpIdx);
+    const bool Renamable = Wide.isRenamable();
+    const bool Killed = Wide.isKill();
+    for (MachineInstr *Half : {LowHalf.getInstr(), HighHalf.getInstr()}) {
+      if (Renamable)
+        Half->getOperand(OpIdx).setIsRenamable();
+      if (Killed)
+        Half->getOperand(OpIdx).setIsKill();
+    }
+  }
+}
+
+void DPUInstrInfo::expand64BitImmediateAluInstruction(MachineInstr &MI,
+						      MachineBasicBlock &MBB,
+						      unsigned int LsbOpcode,
+						      unsigned int MsbOpcode) const {
+  MachineFunction *MF = MBB.getParent();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  // Expands a 64-bit reg/imm ALU pseudo into LsbOpcode + MsbOpcode on 32-bit halves.
+  unsigned int DestReg = MI.getOperand(0).getReg();
+  unsigned int Op1Reg = MI.getOperand(1).getReg();
+  int64_t Op2Imm = MI.getOperand(2).getImm();
+  // sub_32bit selects the low (LSB) half, sub_32bit_hi the high (MSB) half.
+  unsigned int LSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit);
+  unsigned int MSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit_hi);
+
+  unsigned int LSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit);
+  unsigned int MSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit_hi);
+  // Low and high 32 bits of the immediate, each kept zero-extended in an int64_t.
+  int64_t LSBOp2Imm = Op2Imm & 0xFFFFFFFFl;
+  int64_t MSBOp2Imm = (Op2Imm >> 32) & 0xFFFFFFFFl;
+
+  // The two switches below emit nothing: they only log (under LLVM_DEBUG) when
+  // a half-immediate is 0, 1, 0xffffffff or 0x80000000, i.e. could be optimized.
+  switch (LSBOp2Imm) {
+  case 0:
+  case 1:
+  case 0xffffffff:
+  case 0x80000000:
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "LSBOp2Imm = " << LSBOp2Imm << " could be optimized\n";
+      });
+  }
+
+  switch (MSBOp2Imm) {
+  case 0:
+  case 1:
+  case 0xffffffff:
+  case 0x80000000:
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "MSBOp2Imm = " << MSBOp2Imm << " could be optimized\n";
+      });
+  }
+  // Emit the low half first, then the high half, both right before MI.
+  MachineInstrBuilder MIBDestLsb;
+  MIBDestLsb = BuildMI(MBB, MI, MI.getDebugLoc(), get(LsbOpcode),
+		       LSBDestReg)
+    .addReg(LSBDOp1Reg)
+    .addImm(LSBOp2Imm);
+
+  MachineInstrBuilder MIBDestMsb;
+  MIBDestMsb = BuildMI(MBB, MI, MI.getDebugLoc(), get(MsbOpcode),
+		       MSBDestReg)
+    .addReg(MSBDOp1Reg)
+    .addImm(MSBOp2Imm);
+  // Copy renamable/kill flags of operands 0 and 1 (operand 2 is the immediate).
+  for (unsigned i = 0; i < 2; i++) {
+    if (MI.getOperand(i).isRenamable()) {
+      MIBDestLsb->getOperand(i).setIsRenamable();
+      MIBDestMsb->getOperand(i).setIsRenamable();
+    }
+    if (MI.getOperand(i).isKill()) {
+      MIBDestLsb->getOperand(i).setIsKill();
+      MIBDestMsb->getOperand(i).setIsKill();
+    }
+  }
+}
+
 bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
 
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to expand: "; MI.dump();
+      dbgs() << "** MBB: "; MBB.dump();
+      dbgs() << "****** \n";
+    });
   switch (MI.getDesc().getOpcode()) {
   default:
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "Don't know how to expand.\n";
+      });
     return false;
   case DPU::RETi:
     BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::JUMPr)).addReg(DPU::R23);
     break;
   case DPU::CALLi:
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri))
-        .addReg(DPU::R23)
-        .add(MI.getOperand(0));
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri), DPU::R23)
+        .add(MI.getOperand(0))
+        .copyImplicitOps(MI);
     break;
   case DPU::CALLr:
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr))
-        .addReg(DPU::R23)
-        .add(MI.getOperand(0));
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr), DPU::R23)
+        .add(MI.getOperand(0))
+        .copyImplicitOps(MI);
     break;
   case DPU::ADD_VAStart: { // Get the first index in stack where the first
                            // vaargs is stored
@@ -122,16 +248,57 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       StackSize = MF->getFrameInfo().getStackSize();
     }
     unsigned int ResultReg = MI.getOperand(0).getReg();
-    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif))
-        .addReg(ResultReg)
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif), ResultReg)
         .addReg(DPU::R22)
         .addImm(StackSize + STACK_SIZE_FOR_D22)
         .addImm(DPUAsmCondition::Condition::False);
     break;
   }
+
+  case DPU::ADD64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::ADDrrr, DPU::ADDCrrr);
+    break;
+  case DPU::AND64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::ANDrrr, DPU::ANDrrr);
+    break;
+  case DPU::OR64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::ORrrr, DPU::ORrrr);
+    break;
+  case DPU::SUB64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::SUBrrr, DPU::SUBCrrr);
+    break;
+  case DPU::XOR64rr:
+    expand64BitRegisterAluInstruction(MI, MBB, DPU::XORrrr, DPU::XORrrr);
+    break;
+    
+  case DPU::ADD64ri:
+    expand64BitImmediateAluInstruction(MI, MBB, DPU::ADDrri, DPU::ADDCrri);
+    break;
+  case DPU::AND64ri:
+    expand64BitImmediateAluInstruction(MI, MBB, DPU::ANDrri, DPU::ANDrri);
+    break;
+  case DPU::OR64ri:
+    expand64BitImmediateAluInstruction(MI, MBB, DPU::ORrri, DPU::ORrri);
+    break;
+  case DPU::XOR64ri:
+    expand64BitImmediateAluInstruction(MI, MBB, DPU::XORrri, DPU::XORrri);
+    break;
+    
+  // case DPU::Jcci:
+  // case DPU::TmpJcci:
+  // case DPU::Jcc: {
+  //   // don't expand yet as they are used for late optimization
+  //   // these late optimization should be reworked and placed earlier in the pipeline
+  //   // so we could treat more cases of optim
+  //   break;
+  // }
   }
 
   MBB.erase(MI);
+
+  LLVM_DEBUG({
+      dbgs() << "** MBB: "; MBB.dump();
+    });
   return true;
 }
 
@@ -139,18 +306,24 @@ void DPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
                                const DebugLoc &DL, MCRegister DestReg,
                                MCRegister SrcReg, bool KillSrc) const {
+  // I is just an insertion point (maybe MBB.end() or a non-COPY): guard its use.
+  const bool is_copy = I != MBB.end() && I->isCopy();
+  LLVM_DEBUG({ dbgs() << "DPU/Instr - copyPhysReg "; if (is_copy) I->dump(); else dbgs() << "\n"; });
+  bool is_dest_renamable = is_copy && I->getOperand(0).isRenamable();
+  bool is_src_renamable = is_copy && I->getOperand(1).isRenamable();
+  MachineInstrBuilder MIB;
   if (DPU::GP_REGRegClass.contains(DestReg) &&
       DPU::OP_REGRegClass.contains(SrcReg)) {
     LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg
                       << " kill= " << KillSrc << " to dest=" << DestReg
                       << "\n");
-    BuildMI(MBB, I, DL, get(DPU::MOVErr), DestReg)
+    MIB = BuildMI(MBB, I, DL, get(DPU::MOVErr), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc));
   } else if (DPU::GP64_REGRegClass.contains(DestReg, SrcReg)) {
     LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg
                       << " kill= " << KillSrc << " to dest=" << DestReg
                       << "\n");
-    BuildMI(MBB, I, DL, get(DPU::MOVDrr), DestReg)
+    MIB = BuildMI(MBB, I, DL, get(DPU::MOVDrr), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc));
   } else if (DPU::GP64_REGRegClass.contains(SrcReg) &&
              DPU::GP_REGRegClass.contains(DestReg)) {
@@ -158,7 +331,7 @@ void DPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg
                       << " kill= " << KillSrc << " to dest=" << DestReg
                       << "\n");
-    BuildMI(MBB, I, DL, get(DPU::EXTRACT_SUBREG), DestReg)
+    MIB = BuildMI(MBB, I, DL, get(DPU::EXTRACT_SUBREG), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc))
         .addImm(DPU::sub_32bit);
   } else if (DPU::GP_REGRegClass.contains(SrcReg) &&
@@ -167,11 +340,16 @@ void DPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg
                       << " kill= " << KillSrc << " to dest=" << DestReg
                       << "\n");
-    BuildMI(MBB, I, DL, get(DPU::MOVE_Srr), DestReg)
+    MIB = BuildMI(MBB, I, DL, get(DPU::MOVE_Srr), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc));
   } else {
     llvm_unreachable("Impossible reg-to-reg copy");
   }
+
+  if (is_dest_renamable)
+    MIB->getOperand(0).setIsRenamable();
+  if (is_src_renamable)
+    MIB->getOperand(1).setIsRenamable();
 }
 
 static bool reverseBranchOpc(unsigned Opc, unsigned &ReversedOpc) {
@@ -257,6 +435,7 @@ bool DPUInstrInfo::reverseBranchCondition(
   case DPU::Jcc:
   case DPU::Jcci:
   case DPU::Jcc64:
+  // case DPU::Jcci64:
     Cond[1].setImm(ISD::getSetCCInverse(ISD::CondCode(Cond[1].getImm()), MVT::i32));
     break;
   default: {
@@ -275,6 +454,11 @@ bool DPUInstrInfo::reverseBranchCondition(
 static void
 fetchUnconditionalBranchInfo(MachineInstr *Inst,
                              unsigned &targetBasicBlockOperandIndex) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "Inst "; Inst->dump();
+    });
+
   switch (Inst->getOpcode()) {
   case DPU::JUMPi:
     targetBasicBlockOperandIndex = 0;
@@ -286,21 +470,73 @@ fetchUnconditionalBranchInfo(MachineInstr *Inst,
 
 static void fetchConditionalBranchInfo(MachineInstr *Inst,
                                        unsigned &targetBasicBlockOperandIndex,
-                                       SmallVectorImpl<MachineOperand> &Cond) {
+                                       SmallVectorImpl<MachineOperand> &Cond,
+				       bool &do_have_metadata) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "Inst "; Inst->dump();
+      dbgs() << "Cond.size() " << Cond.size() << "\n";
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
+  
   unsigned Opc = Inst->getOpcode();
   Cond.push_back(MachineOperand::CreateImm(Opc));
 
+  // for (unsigned int eachOperandIndex = 0; eachOperandIndex < Inst->getNumOperands();
+  //      eachOperandIndex++) {
+  //   MachineOperand &operand = Inst->getOperand(eachOperandIndex);
+  //   LLVM_DEBUG({
+  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  // 	dbgs() << "operand " << eachOperandIndex << ": "; operand.dump();
+  //     });
+  //   if (operand.isMBB()) {
+  //     targetBasicBlockOperandIndex = eachOperandIndex;
+  //   } else {
+  //     Cond.push_back(operand);
+  //   }
+  // }
   unsigned int NumOp = Inst->getNumExplicitOperands();
+  // unsigned int NumOp = Inst->getNumOperands();
 
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "NumOp " << NumOp << "\n";
+  //   });
   for (unsigned int eachOperandIndex = 0; eachOperandIndex < NumOp;
        eachOperandIndex++) {
+    // LLVM_DEBUG({
+    // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    // 	dbgs() << "operand " << eachOperandIndex << ": ";
+    //   });
     MachineOperand &operand = Inst->getOperand(eachOperandIndex);
+    // LLVM_DEBUG({
+    // 	operand.dump();
+    //   });
     if (operand.isMBB()) {
       targetBasicBlockOperandIndex = eachOperandIndex;
     } else {
       Cond.push_back(operand);
     }
   }
+  // Also forward "MySpecialMetadata" operands and report their presence.
+  do_have_metadata = false;
+  for (const MachineOperand &Op : Inst->operands()) {
+    if (Op.isMetadata() && Op.getMetadata()->getNumOperands() > 0 && Op.getMetadata()->getOperand(0).get() == MDString::get(Inst->getMF()->getFunction().getContext(), "MySpecialMetadata")) {
+      Cond.push_back(Op);
+      do_have_metadata = true;
+    }
+  }
+  
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "Inst "; Inst->dump();
+      dbgs() << "Cond.size() " << Cond.size() << "\n";
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
 }
 
 static inline bool isAnalyzableBranch(MachineInstr *Inst) {
@@ -312,6 +548,15 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                  MachineBasicBlock *&FBB,
                                  SmallVectorImpl<MachineOperand> &Cond,
                                  bool AllowModify) const {
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
+  
   MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
 
   // Skip all the debug instructions.
@@ -331,6 +576,9 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
 
   // If not an analyzable branch (e.g., indirect jump), just leave.
   if (!isAnalyzableBranch(LastInst)) {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+      });
     return true;
   }
 
@@ -366,18 +614,40 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     // Conditional branch
     if (LastInst->isConditionalBranch()) {
       unsigned int TBBOpIdx;
-      fetchConditionalBranchInfo(LastInst, TBBOpIdx, Cond);
+      bool do_have_metadata = false;
+      fetchConditionalBranchInfo(LastInst, TBBOpIdx, Cond, do_have_metadata);
+      if (do_have_metadata) {
+	LLVM_DEBUG({
+	    dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable metadata\n";
+	  });
+	return true;
+      }
+      // LLVM_DEBUG({
+      // 	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      // 	  dbgs() << "MBB "; MBB.dump();
+      // 	  dbgs() << "LastInst "; LastInst->dump();
+      // 	  dbgs() << "TBBOpIdx " << TBBOpIdx << "\n";
+      // 	  for (unsigned i = 0; i < Cond.size(); ++i) {
+      // 	    dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      // 	  }
+      // 	});
       TBB = LastInst->getOperand(TBBOpIdx).getMBB();
       return false;
     }
 
     // Unknown branch type
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+      });
     return true;
   }
 
   // If we reached here, there are two branches.
   // If there are three terminators, we don't know what sort of block this is.
   if (++I != REnd && isUnpredicatedTerminator(*I)) {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+      });
     return true;
   }
 
@@ -386,11 +656,13 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
   if (SecondLastInst->isUnconditionalBranch()) {
     // Return if the last instruction cannot be removed.
     if (!AllowModify) {
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+	});
       return true;
     }
     unsigned int TBBOpIdx;
     fetchUnconditionalBranchInfo(SecondLastInst, TBBOpIdx);
-
     TBB = SecondLastInst->getOperand(TBBOpIdx).getMBB();
     LastInst->eraseFromParent();
     return false;
@@ -400,25 +672,52 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     // Conditional branch followed by an unconditional branch.
     // The last one must be unconditional.
     if (!LastInst->isUnconditionalBranch()) {
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+	});
       return true;
     }
     unsigned int TBBOpIdx;
     unsigned int FTBBOpIdx;
+    bool do_have_metadata = false;
 
     fetchUnconditionalBranchInfo(LastInst, FTBBOpIdx);
-    fetchConditionalBranchInfo(SecondLastInst, TBBOpIdx, Cond);
+    fetchConditionalBranchInfo(SecondLastInst, TBBOpIdx, Cond, do_have_metadata);
+    if (do_have_metadata) {
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable metadata\n";
+	});
+      return true;
+    }
     TBB = SecondLastInst->getOperand(TBBOpIdx).getMBB();
     FBB = LastInst->getOperand(FTBBOpIdx).getMBB();
-
+    // LLVM_DEBUG({
+    // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    // 	dbgs() << "MBB "; MBB.dump();
+    // 	dbgs() << "LastInst "; LastInst->dump();
+    // 	dbgs() << "SecondLastInst "; SecondLastInst->dump();
+    // 	dbgs() << "TBBOpIdx " << TBBOpIdx << "\n";
+
+    // 	for (unsigned i = 0; i < Cond.size(); ++i) {
+    // 	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+    // 	}
+    //   });
     return false;
   }
 
   // Unknown branch type
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n";
+    });
   return true;
 }
 
 unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
                                     int *BytesRemoved) const {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+    });
   MachineBasicBlock::iterator I = MBB.end();
   unsigned Count = 0;
 
@@ -444,22 +743,128 @@ unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
 void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock *TBB, DebugLoc DL,
                                           ArrayRef<MachineOperand> Cond) const {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	  dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+	  if (Cond[i].isReg()) {
+	    dbgs() << "Cond[" << i << "] isUse " << Cond[i].isUse() << "\n";
+	    dbgs() << "Cond[" << i << "] isDef " << Cond[i].isDef() << "\n";
+	  }
+      }
+    });
+
+  // LLVM_DEBUG({
+  //     dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  //     dbgs() << "DPU::sub_32bit " << DPU::sub_32bit << "\n";
+  //     dbgs() << "DPU::sub_32bit_hi " << DPU::sub_32bit_hi << "\n";
+  //     for (unsigned i = 0; i < Cond.size(); ++i) {
+  // 	dbgs() << "Cond[" << i << "] = "; Cond[i].dump();
+  // 	if (Cond[i].isReg()) {
+  // 	  dbgs() << "is Reg\n";
+  // 	  dbgs() << Cond[i].getReg() << "\n";
+  // 	  dbgs() << Cond[i].getSubReg() << "\n";
+
+  // 	  dbgs() << "contains " << DPU::GP64_REGRegClass.contains(Cond[i].getReg()) << "\n";
+  // 	}
+  //     }
+  //   });
+
   MachineInstrBuilder MIB;
 
   unsigned Opc = Cond[0].getImm();
 
-  MIB = BuildMI(&MBB, DL, get(Opc));
+  // Special cases (arithmetic+compare+jump opcodes) that were not handled well
+  // by the SSA-based passes; the detection code below is currently disabled.
+  // bool have_metadata = false;
+  // TODO: find a better way to discover if it's an arithmetic+comp+jump
+  //       or simply rely solely on metadata?
+  // switch (Opc) {
+  // default:
+  //   break;
+  // case DPU::CLZ_Urrci:
+  // case DPU::MUL_UL_ULrrrci:
+  // case DPU::LSLXrrrci:
+  // case DPU::LSRXrrrci:
+  //   {
+  //     for (unsigned i = 0; i < Cond.size(); ++i) {
+  // 	if (Cond[i].isMetadata()
+  // 	    && Cond[i].getMetadata()->getOperand(0).get() == MDString::get(MBB.getParent()->getFunction().getContext(), "MySpecialMetadata")) {
+  // 	  have_metadata = true;
+  // 	}
+  //     }
+  //     break;
+  //   }
+  // }
 
-  for (unsigned i = 1; i < Cond.size(); ++i) {
-    if (Cond[i].isReg())
-      MIB.addReg(Cond[i].getReg());
-    else if (Cond[i].isImm())
+  MIB = BuildMI(&MBB, DL, get(Opc));
+  // for (unsigned i = 1; i < Cond.size(); ++i) {
+  //   MIB->addOperand(Cond[i]);
+  // }
+  
+  
+  unsigned start = 1;
+  // if (have_metadata) {
+  //   MIB = BuildMI(&MBB, DL, get(Opc), Cond[start].getReg());
+  //   start++;
+  // } else {
+  //   MIB = BuildMI(&MBB, DL, get(Opc));
+  // }
+
+  for (unsigned i = start; i < Cond.size(); ++i) {
+    // LLVM_DEBUG({
+    // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    // 	dbgs() << " working on " << i << "\n";
+    //   });
+    if (Cond[i].isReg()) {
+      // LLVM_DEBUG({
+      // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      // });
+      // MIB.addReg(Cond[i].getReg());
+      // LLVM_DEBUG({
+      // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      // });
+      MIB->addOperand(Cond[i]);
+      // if (Cond[i].isDef()) {
+      // 	// The register in question could potentially be a
+      // 	// subreg hi/lo of a 64-bit vreg
+      // 	if (unsigned SubReg = Cond[i].getSubReg()) {
+      // 	  MIB.addDef(Cond[i].getReg(), 0, SubReg);
+      // 	} else {
+      // 	  MIB.addDef(Cond[i].getReg());
+      // 	}
+      // } else {
+      // 	// The register in question could potentially be a
+      // 	// subreg hi/lo of a 64-bit vreg
+      // 	if (unsigned SubReg = Cond[i].getSubReg()) {
+      // 	  MIB.addReg(Cond[i].getReg(), 0, SubReg);
+      // 	} else {
+      // 	  MIB.addReg(Cond[i].getReg());
+      // 	}
+      // }
+    } else if (Cond[i].isImm()) {
       MIB.addImm(Cond[i].getImm());
-    else
+    } else if (Cond[i].isMetadata()) {
+      // Nothing here: metadata operands are appended once the target MBB is added.
+    } else {
       assert(false && "Cannot copy operand");
+    }
   }
 
   MIB.addMBB(TBB);
+
+  // add back remaining metadata
+  for (unsigned i = 0; i < Cond.size(); ++i) {
+     if (Cond[i].isMetadata()) {
+      MIB.addMetadata(Cond[i].getMetadata());
+     }
+  }
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MIB "; MIB->dump();
+    });
 }
 
 unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
@@ -467,6 +872,13 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *FBB,
                                     ArrayRef<MachineOperand> Cond,
                                     const DebugLoc &DL, int *BytesAdded) const {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+      for (unsigned i = 0; i < Cond.size(); ++i) {
+	dbgs() << "Cond[" << i << "] "; Cond[i].dump();
+      }
+    });
   unsigned nrOfInsertedMachineInstr = 0;
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
@@ -492,5 +904,47 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
   // to instructions added.
   if (BytesAdded)
     *BytesAdded = nrOfInsertedMachineInstr;
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "MBB "; MBB.dump();
+    });
   return nrOfInsertedMachineInstr;
 }
+// Refuses to sink selected opcodes when they carry a "MySpecialMetadata" operand.
+bool DPUInstrInfo::shouldSink(const MachineInstr &MI) const {
+  switch (MI.getDesc().getOpcode()) {
+  default:
+    break;
+  case DPU::CLZ_Urr:
+  case DPU::LSLXrrr:
+  case DPU::LSRXrrr:
+  case DPU::ANDrri:
+  case DPU::JEQrii:
+  case DPU::JNEQrii:
+  // NOTE: ADDrrr and ADDCrrr have no case label here, so they always take the
+  // `default` path and are never vetoed by this function.
+  case DPU::SUBrrr:
+  case DPU::SUBCrrr:
+    {
+      // Look for the marker; a metadata node without operands cannot be it.
+      for (const MachineOperand &Op : MI.operands()) {
+	if (Op.isMetadata() && Op.getMetadata()->getNumOperands() > 0 && Op.getMetadata()->getOperand(0).get() == MDString::get(MI.getMF()->getFunction().getContext(), "MySpecialMetadata")) {
+	  LLVM_DEBUG({
+	      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Don't sink because I have MySpecialMetadata.\n";
+	      MI.dump();
+	    });
+	  return false; // Do not sink this instruction
+	}
+      }
+      LLVM_DEBUG({
+	  dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " I'm potentially something used in arith+cond+jump from EmitInstrWithCustomInserter but I allow sink because I don't have MySpecialMetadata.\n";
+	  MI.dump();
+	});
+      break;
+    }
+  }
+  // Everything else, including untagged instructions of the opcodes listed
+  // above, follows the generic TargetInstrInfo policy.
+  return TargetInstrInfo::shouldSink(MI);
+}
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.h b/llvm/lib/Target/DPU/DPUInstrInfo.h
index e9c2a3b920a05..98fc84304958f 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.h
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.h
@@ -43,14 +43,22 @@ class DPUInstrInfo : public DPUGenInstrInfo {
                             const TargetRegisterInfo *TRI) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
-
+  void expand64BitRegisterAluInstruction(MachineInstr &MI,
+					 MachineBasicBlock &MBB,
+					 unsigned int LsbOpcode,
+					 unsigned int MsbOpcode) const;
+  void expand64BitImmediateAluInstruction(MachineInstr &MI,
+					  MachineBasicBlock &MBB,
+					  unsigned int LsbOpcode,
+					  unsigned int MsbOpcode) const;
+  
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                    const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
                    bool KillSrc) const override;
 
   bool
   reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
-
+  
   bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
@@ -65,6 +73,8 @@ class DPUInstrInfo : public DPUGenInstrInfo {
 
   void buildConditionalBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                               DebugLoc DL, ArrayRef<MachineOperand> Cond) const;
+
+  bool shouldSink(const MachineInstr &MI) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.td b/llvm/lib/Target/DPU/DPUInstrInfo.td
index 6b89c0e906556..b923d56beddea 100644
--- a/llvm/lib/Target/DPU/DPUInstrInfo.td
+++ b/llvm/lib/Target/DPU/DPUInstrInfo.td
@@ -217,58 +217,66 @@ defm : WramStoreImmPat<store, SDrii, s16_i64_imm>;
 
 def : Pat<(i32 (trunc DoubleReg:$src)), (EXTRACT_SUBREG DoubleReg:$src, sub_32bit)>;
 
-let isMoveImm = 1, isAsCheapAsAMove = 0 in {
+let isMoveImm = 1, isAsCheapAsAMove = 0
+, usesCustomInserter = 1
+in {
   def MOVE64ri: PseudoDPUInstruction<
                     (outs GP64_REG:$dc), (ins i64imm:$imm),
                     "",
                     [(set i64:$dc, (i64 imm:$imm))]>;
 }
 
-let isAsCheapAsAMove = 0 in {
-def ADD64rr: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
-                  "",
-                  [(set i64:$dc, (add i64:$da, i64:$db))]>;
-
+let isAsCheapAsAMove = 0
+// , usesCustomInserter = 1
+in {
 def ADD64ri: PseudoDPUInstruction<
                   (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
                   "",
                   [(set i64:$dc, (add i64:$da, (i64 imm:$imm)))]>;
 
-def SUB64rr: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
-                  "",
-                  [(set i64:$dc, (sub i64:$da, i64:$db))]>;
-
-def OR64rr: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
+def AND64ri: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
                   "",
-                  [(set i64:$dc, (or i64:$da, i64:$db))]>;
+                  [(set i64:$dc, (and i64:$da, (i64 imm:$imm)))]>;
 
 def OR64ri: PseudoDPUInstruction<
                   (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
                   "",
                   [(set i64:$dc, (or i64:$da, (i64 imm:$imm)))]>;
 
+def XOR64ri: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
+                  "",
+                  [(set i64:$dc, (xor i64:$da, (i64 imm:$imm)))]>;
+}
+
+let isAsCheapAsAMove = 0
+// , usesCustomInserter = 1
+in {
+def ADD64rr: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
+                  "",
+                  [(set i64:$dc, (add i64:$da, i64:$db))]>;
+
 def AND64rr: PseudoDPUInstruction<
                   (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
                   "",
                   [(set i64:$dc, (and i64:$da, i64:$db))]>;
 
-def AND64ri: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
+def OR64rr: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
                   "",
-                  [(set i64:$dc, (and i64:$da, (i64 imm:$imm)))]>;
+                  [(set i64:$dc, (or i64:$da, i64:$db))]>;
 
-def XOR64rr: PseudoDPUInstruction<
+def SUB64rr: PseudoDPUInstruction<
                   (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
                   "",
-                  [(set i64:$dc, (xor i64:$da, i64:$db))]>;
+                  [(set i64:$dc, (sub i64:$da, i64:$db))]>;
 
-def XOR64ri: PseudoDPUInstruction<
-                  (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm),
+def XOR64rr: PseudoDPUInstruction<
+                  (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db),
                   "",
-                  [(set i64:$dc, (xor i64:$da, (i64 imm:$imm)))]>;
+                  [(set i64:$dc, (xor i64:$da, i64:$db))]>;
 }
 
 // Bit operations: 64 bits emulation.
@@ -454,26 +462,36 @@ def Jcci: PseudoDPUInstruction<
                 [(DPUBrCCi (i32 imm:$cc), i32:$ra, (s11_imm:$immediate), bb:$dst)]
                 >;
 
-def TmpJcci: PseudoDPUInstruction<
-                (outs), (ins ccopcode:$cc, OP_REG:$ra, s11_imm:$immediate, GP_REG:$dependency, pcoffset:$dst),
-                "",
-                []
-                >;
+// def TmpJcci: PseudoDPUInstruction<
+//                 (outs), (ins ccopcode:$cc, OP_REG:$ra, s11_imm:$immediate, GP_REG:$dependency, pcoffset:$dst),
+//                 "",
+//                 []
+//                 >;
 }
 
-let isBranch = 1, isTerminator = 1, isCompare = 1, isBarrier = 0, isIndirectBranch = 0, isAsCheapAsAMove = 0 in {
+let isBranch = 1, isTerminator = 1, isCompare = 1, isBarrier = 0, isIndirectBranch = 0, isAsCheapAsAMove = 0
+// , usesCustomInserter = 1
+in {
 def Jcc64: PseudoDPUInstruction<
                 (outs), (ins ccopcode:$cc, GP64_REG:$da, GP64_REG:$db, pcoffset:$dst),
                 "",
                 [(DPUBrCC (i32 imm:$cc), i64:$da, i64:$db, bb:$dst)]
                 >;
+
+// def Jcci64: PseudoDPUInstruction<
+//                 (outs), (ins ccopcode:$cc, GP64_REG:$da, s11_i64_imm:$immediate, pcoffset:$dst),
+//                 "",
+//                 [(DPUBrCCi (i32 imm:$cc), i64:$da, (s11_i64_imm:$immediate), bb:$dst)]
+//                 >;
 }
 
 // -----------------------------------------------------------------------------
 // SETCC
 // -----------------------------------------------------------------------------
 
-let isAsCheapAsAMove = 0 in {
+let isAsCheapAsAMove = 0
+, usesCustomInserter = 1
+in {
 def SET64cc: PseudoDPUInstruction<
                     (outs GP_REG:$rc), (ins ccopcode:$cc, GP64_REG:$lhs, GP64_REG:$rhs),
                     "",
@@ -634,3 +652,15 @@ let usesCustomInserter = 1 in {
     def MRAM_LOAD64_X32mr : MRAM_LOAD64_X_mr<mram_extloadi32>;
     def MRAM_LOAD_DOUBLEmr: MRAM_LOAD64_X_mr<mram_load>;
 }
+
+//===----------------------------------------------------------------------===//
+// Bit manipulation instructions
+//===----------------------------------------------------------------------===//
+
+// ((1 << n) - 1): selected as a single LSLXrrr on LNEG and $n.
+def : Pat<(sub (shl (i32 1), GP_REG:$n), (i32 1)),
+          (LSLXrrr LNEG, GP_REG:$n)>;
+// ((-1 << n) ^ -1): the same value written with xor; same selection.
+def : Pat<(xor (shl (i32 -1), GP_REG:$n), (i32 -1)),
+          (LSLXrrr LNEG, GP_REG:$n)>;
+// ====
diff --git a/llvm/lib/Target/DPU/DPUMCInstLower.cpp b/llvm/lib/Target/DPU/DPUMCInstLower.cpp
index 311c64f86b142..954f3834cc138 100644
--- a/llvm/lib/Target/DPU/DPUMCInstLower.cpp
+++ b/llvm/lib/Target/DPU/DPUMCInstLower.cpp
@@ -102,6 +102,7 @@ void DPUMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
       break;
 
     case MachineOperand::MO_RegisterMask:
+    case MachineOperand::MO_Metadata:
       continue;
 
     case MachineOperand::MO_GlobalAddress:
diff --git a/llvm/lib/Target/DPU/DPUMacroFusion.cpp b/llvm/lib/Target/DPU/DPUMacroFusion.cpp
index a606c017d7cfb..6a14246c852c0 100644
--- a/llvm/lib/Target/DPU/DPUMacroFusion.cpp
+++ b/llvm/lib/Target/DPU/DPUMacroFusion.cpp
@@ -28,14 +28,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   // We are mainly interested in merging a simple operation with a simple
   // conditional/unconditional branch
   LLVM_DEBUG({
-    dbgs() << "DPU/Merge: checking macro fusion:\n\t";
-    if (!FirstMI)
-      dbgs() << "<NONE>";
-    else
-      FirstMI->dump();
-    dbgs() << "\n\t";
-    SecondMI.dump();
-    dbgs() << "\n";
+    dbgs() << "DPU/Merge: checking macro fusion:\n";
+    if (!FirstMI) {
+      dbgs() << "\t<NONE>\n";
+    } else {
+      dbgs() << "\t"; FirstMI->dump();
+    }
+    dbgs() << "\t"; SecondMI.dump();
   });
 
   if (!FirstMI) {
@@ -51,14 +50,38 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   switch (secondOpc) {
   default:
     // todo probably more opportunities (Conditional branches...)
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "DPU/Merge: the two instructions cannot be fused\n";
+      });
     return false;
   case DPU::JUMPi:
-  case DPU::TmpJcci:
+  // case DPU::TmpJcci:
+    break;
+  case DPU::JNEQrii:
+  case DPU::JEQrii:
+    // Fuse only when operand 0 of both instructions is the same register.
+    if (!(FirstMI->getOperand(0).isReg() && SecondMI.getOperand(0).isReg() &&
+          (FirstMI->getOperand(0).getReg() ==
+           SecondMI.getOperand(0).getReg()))) {
+      LLVM_DEBUG({
+        dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+        dbgs() << "DPU/Merge: the two instructions cannot be fused\n";
+        // Print the operands themselves: getReg() asserts on non-registers.
+        dbgs() << "first reg " << FirstMI->getOperand(0) << "\n";
+        dbgs() << "second reg " << SecondMI.getOperand(0) << "\n";
+      });
+      return false;
+    }
     break;
   case DPU::Jcci:
     if (!(FirstMI->getOperand(0).isReg() && SecondMI.getOperand(1).isReg() &&
-          (FirstMI->getOperand(0).getReg() ==
-           SecondMI.getOperand(1).getReg()))) {
+	  (FirstMI->getOperand(0).getReg() ==
+	   SecondMI.getOperand(1).getReg()))) {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "DPU/Merge: the two instructions cannot be fused\n";
+      });
       return false;
     }
     break;
@@ -68,7 +91,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   default:
     // todo probably more opportunities (Operations with specific immediate
     // operands, call...)
-    LLVM_DEBUG(dbgs() << "DPU/Merge: the two instructions cannot be fused\n");
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "DPU/Merge: the two instructions cannot be fused\n";
+      });
     return false;
   case DPU::ADDrri:
   case DPU::ADDrrr:
@@ -92,6 +118,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   case DPU::RORrrr:
   case DPU::RORrri:
   case DPU::CLZrr:
+  case DPU::CLZ_Urr:
   case DPU::CAOrr:
   case DPU::MUL_UL_ULrrr:
   case DPU::MUL_SL_ULrrr:
diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
index 998d4f0d4bcc5..c96a23c933e17 100644
--- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp
@@ -6,6 +6,9 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+
+// TODO: extend to more arith+comp+branch situations
+
 #include "DPUTargetMachine.h"
 #include <llvm/CodeGen/MachineInstrBuilder.h>
 #include <set>
@@ -653,12 +656,25 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
     LLVM_DEBUG(dbgs() << "KO: Unknown LastOpc\n");
     return false;
   case DPU::JUMPi: {
+    // FIXME: this fusion is currently wrong, so it is disabled.
+    // It morphs the branch from unconditional to conditional, which
+    // modifies the CFG by artificially creating a fall-through that is not
+    // declared. The code after the return is unreachable until this is fixed.
+    return false;
+    // 
+    
     if (!ImmCanBeEncodedOn8Bits) {
       LLVM_DEBUG(
           dbgs() << "KO: LastOpc == DPU::JUMPi && !ImmCanBeEncodedOn8Bits\n");
       return false;
     }
 
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
     int64_t actualCondition = ISD::SETTRUE2;
     MachineInstrBuilder ComboInst =
         BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(OpJumpOpc))
@@ -687,14 +703,33 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
     auto actualConditionOperand = MachineOperand::CreateImm(actualCondition);
     ComboInst.add(actualConditionOperand).add(LastInst->getOperand(0));
 
-    LLVM_DEBUG(dbgs() << "OK\n"; LastInst->dump(); SecondLastInst->dump(););
+    LLVM_DEBUG({
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
+      });
+    
     LastInst->eraseFromParent();
     SecondLastInst->eraseFromParent();
 
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+    
     return true;
   }
-  case DPU::TmpJcci:
+  // case DPU::TmpJcci:
   case DPU::Jcci: {
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: (if any)\n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
     bool isSourceCondition = false;
 
     if (SecondLastInst->getOperand(0).getReg() !=
@@ -757,11 +792,17 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
                "&& !isSourceCondition) && (!ImmCanBeEncodedOn11Bits)\n");
         return false;
       }
+      if (SecondLastOpc == DPU::MOVEri || SecondLastOpc == DPU::MOVErr) {
+	LLVM_DEBUG(
+            dbgs()
+            << "KO: move to zero is invalid\n");
+	return false;
+      }
       // todo: this is not optimal. One register has been allocated but not used
       // now. This can become an issue (unnecessary spilling)
       ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
                           InstrInfo.get(OpNullJumpOpc))
-                      .addReg(DPU::ZERO);
+	.addReg(DPU::ZERO);
     } else {
       if (!ImmCanBeEncodedOn8Bits) {
         LLVM_DEBUG(
@@ -794,14 +835,30 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB,
       break;
     }
 
-    LastInst->getOperand(0).setImm(actualCondition);
-    ComboInst.add(LastInst->getOperand(0))
+    // why modify the original instruction ???
+    // LastInst->getOperand(0).setImm(actualCondition);
+    // ComboInst.add(LastInst->getOperand(0))
+    //     .add(LastInst->getOperand(LastInst->getNumOperands() - 1));
+    ComboInst.addImm(actualCondition)
         .add(LastInst->getOperand(LastInst->getNumOperands() - 1));
 
-    LLVM_DEBUG(dbgs() << "OK\n"; LastInst->dump(); SecondLastInst->dump(););
+    LLVM_DEBUG({
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
+      });
+    
     LastInst->eraseFromParent();
     SecondLastInst->eraseFromParent();
 
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+    
     return true;
   }
   case DPU::Jcc:
@@ -821,9 +878,19 @@ bool DPUMergeComboInstrPass::runOnMachineFunction(MachineFunction &MF) {
   for (auto &MFI : MF) {
     MachineBasicBlock *MBB = &MFI;
 
-    LLVM_DEBUG(MBB->dump());
-    changeMade |= mergeComboInstructionsInMBB(MBB, InstrInfo);
+    // LLVM_DEBUG(MBB->dump());
+
+    bool local_change = mergeComboInstructionsInMBB(MBB, InstrInfo);
+    if (local_change) {
+      // LLVM_DEBUG({
+      // 	  dbgs() << "\nchanged to:\n";
+      // 	  MBB->dump();
+      // 	});
+      changeMade = true;
+    }
   }
 
+  LLVM_DEBUG(dbgs() << "********** DPU/MergeComboInstrPass: " << MF.getName()
+                    << " done: changeMade = " << changeMade << " **********\n\n");
   return changeMade;
 }
diff --git a/llvm/lib/Target/DPU/DPUPostRAFusion.cpp b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp
new file mode 100644
index 0000000000000..a3cc5ab25e5d5
--- /dev/null
+++ b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp
@@ -0,0 +1,256 @@
+#include "DPUTargetMachine.h"
+#include "DPU.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include <llvm/CodeGen/MachineInstrBuilder.h>
+
+#define GET_INSTRINFO_ENUM
+
+#include "DPUCondCodes.h"
+#include "DPUGenInstrInfo.inc"
+#include "DPUISelLowering.h"
+#include "MCTargetDesc/DPUAsmCondition.h"
+
+#define GET_REGINFO_ENUM
+#include "DPUGenRegisterInfo.inc"
+
+#define DEBUG_TYPE "dpu-postra-fusion"
+
+using namespace llvm;
+
+namespace {
+// Machine function pass that fuses the tagged instruction sequence ending a
+// basic block into one arith+cond+jump instruction (see runOnMachineBB).
+class DPUPostRAFusionPass : public MachineFunctionPass {
+public:
+  static char ID;
+
+  explicit DPUPostRAFusionPass(DPUTargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  llvm::StringRef getPassName() const override { return "DPU PostRA Fusion"; }
+
+private:
+  const DPUTargetMachine &TM;
+};
+
+char DPUPostRAFusionPass::ID = 0;
+} // namespace
+
+// Factory for the pass; returns a heap-allocated DPUPostRAFusionPass.
+FunctionPass *llvm::createDPUPostRAFusionPass(DPUTargetMachine &tm) {
+  return new DPUPostRAFusionPass(tm);
+}
+
+static MachineInstr *
+getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I,
+                         MachineBasicBlock::reverse_iterator REnd) {
+  // Walk backwards past debug values. I is taken by reference, so the
+  // caller's iterator ends up on the returned instruction (or on REnd).
+  // isDebugValue() already covers the TargetOpcode::DBG_VALUE opcode.
+  for (; I != REnd; ++I) {
+    if (!I->isDebugValue())
+      return &*I;
+  }
+  // Only debug values (or nothing at all) were left in the range.
+  return nullptr;
+}
+
+static bool do_have_special_metadata(MachineInstr *MI) {
+  auto &Ctx = MI->getMF()->getFunction().getContext();
+  const Metadata *Tag = MDString::get(Ctx, "MySpecialMetadata"); // uniqued
+  for (const MachineOperand &Op : MI->operands())
+    if (Op.isMetadata() && Op.getMetadata()->getNumOperands() != 0 &&
+        Op.getMetadata()->getOperand(0).get() == Tag)
+      return true; // guard above: getOperand(0) asserts on an empty node
+  return false;
+}
+
+static bool runOnMachineBB(MachineBasicBlock *MBB,
+			   const DPUInstrInfo &InstrInfo) {
+  MachineBasicBlock::reverse_iterator I = MBB->rbegin(), REnd = MBB->rend();
+  MachineInstr *LastInst, *SecondLastInst;
+  unsigned int LastOpc, SecondLastOpc;
+
+  // Try to fuse the last instructions of MBB into a single arith+cond+jump
+  // instruction. Returns true when MBB was modified, false otherwise.
+
+  LastInst = getLastNonDebugInstrFrom(I, REnd);
+  if (LastInst == NULL) {
+    // Empty block (or only debug values): nothing to fuse.
+    return false;
+  }
+  I++;
+  SecondLastInst = getLastNonDebugInstrFrom(I, REnd);
+  if (SecondLastInst == NULL) {
+    // A single real instruction: nothing to fuse.
+    return false;
+  }
+  // Only sequences tagged with "MySpecialMetadata" are candidates.
+  if (!do_have_special_metadata(LastInst)
+      || !do_have_special_metadata(SecondLastInst))
+    return false;
+  
+  LastOpc = LastInst->getOpcode();
+  SecondLastOpc = SecondLastInst->getOpcode();
+
+  // Pattern 1: LSLXrrr/LSRXrrr + ANDrri + JEQrii/JNEQrii, replaced by one
+  // LSLXrrrci/LSRXrrrci with the Shift32 condition. The original code uses
+  // JEQrii, but analyzeBranch may have introduced JNEQrii instead.
+  // NOTE(review): Shift32 is emitted for both jump opcodes, whereas the
+  // CLZ_Urr pattern below derives its condition from LastOpc; confirm that
+  // the JNEQrii form does not need an inverted condition.
+  // TODO: to be extra careful, also check that the sequence really is the
+  // shift-by-32 test (or use a dedicated metadata) instead of relying on
+  // the tag alone. Conversely, matching untagged sequences could also catch
+  // code introduced by other optimizations, or even written by the user.
+  if ((LastOpc == DPU::JEQrii || LastOpc == DPU::JNEQrii) 
+      && SecondLastOpc == DPU::ANDrri) {
+    I++;
+    MachineInstr *ThirdLastInst = getLastNonDebugInstrFrom(I, REnd);
+    if (ThirdLastInst == NULL) {
+      // No third instruction to complete the pattern.
+      return false;
+    }
+    if (!do_have_special_metadata(ThirdLastInst))
+      return false;
+    
+    unsigned int ThirdLastOpc = ThirdLastInst->getOpcode();
+    if ((ThirdLastOpc == DPU::LSLXrrr || ThirdLastOpc == DPU::LSRXrrr)) {
+
+      LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
+      unsigned int new_opcode = (ThirdLastOpc == DPU::LSLXrrr ?
+				 DPU::LSLXrrrci : DPU::LSRXrrrci);
+      MachineInstrBuilder ComboInst = BuildMI(MBB, ThirdLastInst->getDebugLoc(),
+					      InstrInfo.get(new_opcode));
+      ComboInst.add(ThirdLastInst->getOperand(0));
+      ComboInst.add(ThirdLastInst->getOperand(1));
+      ComboInst.add(ThirdLastInst->getOperand(2));
+      ComboInst.addImm(DPUAsmCondition::Condition::Shift32);
+      ComboInst.addMBB(LastInst->getOperand(2).getMBB());
+      // The fused instruction is not tagged: once merged, sinking need not be prevented.
+      
+      LLVM_DEBUG({
+	  dbgs() << "OK\n";
+	  dbgs() << "del "; ThirdLastInst->dump();
+	  dbgs() << "del "; SecondLastInst->dump();
+	  dbgs() << "del "; LastInst->dump();
+	  dbgs() << "fused to\n";
+	  dbgs() << "add "; ComboInst->dump();
+	});
+
+      LastInst->eraseFromParent();
+      SecondLastInst->eraseFromParent();
+      ThirdLastInst->eraseFromParent();
+      LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+      return true;
+    }
+  }
+
+  // Pattern 2: MUL_UL_ULrrr + compare of the result with 256 + branch. The
+  // jump is JLTUrii originally; analyzeBranch may have introduced JGEUrii.
+  if ((LastOpc == DPU::JLTUrii || LastOpc == DPU::JGEUrii)
+      && SecondLastOpc == DPU::MUL_UL_ULrrr) {
+
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+      
+    MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
+					    InstrInfo.get(DPU::MUL_UL_ULrrrci));
+    ComboInst.add(SecondLastInst->getOperand(0));
+    ComboInst.add(SecondLastInst->getOperand(1));
+    ComboInst.add(SecondLastInst->getOperand(2));
+    ComboInst.addImm(DPUAsmCondition::Small);
+    ComboInst.add(LastInst->getOperand(2));
+    // NOTE(review): Small is emitted for JGEUrii too; confirm (cf. the CLZ_Urr case).
+    
+    LLVM_DEBUG({
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
+      });
+    LastInst->eraseFromParent();
+    SecondLastInst->eraseFromParent();
+
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
+    return true;
+  }
+
+  // Pattern 3: CLZ_Urr + JNEQrii (or JEQrii after analyzeBranch) -> CLZ_Urrci.
+  if ((LastOpc == DPU::JNEQrii || LastOpc == DPU::JEQrii)
+      && SecondLastOpc == DPU::CLZ_Urr) {
+
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "before change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
+    MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(),
+					    InstrInfo.get(DPU::CLZ_Urrci));
+    ComboInst.add(SecondLastInst->getOperand(0));
+    ComboInst.add(SecondLastInst->getOperand(1));
+    ComboInst.addImm((LastOpc == DPU::JNEQrii) ?
+		     DPUAsmCondition::Condition::NotMaximum : DPUAsmCondition::Condition::Maximum);
+    ComboInst.add(LastInst->getOperand(2));
+    // The fused instruction is not tagged: once merged, sinking need not be prevented.
+
+    LLVM_DEBUG({
+	dbgs() << "OK\n";
+	dbgs() << "del "; SecondLastInst->dump();
+	dbgs() << "del "; LastInst->dump();
+	dbgs() << "fused to\n";
+	dbgs() << "add "; ComboInst->dump();
+      });
+    
+    LastInst->eraseFromParent();
+    SecondLastInst->eraseFromParent();
+
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "after change: \n";
+	dbgs() << "** MBB "; MBB->dump();
+      });
+
+    return true;
+  }
+
+  return false;
+}
+
+bool DPUPostRAFusionPass::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "********** DPU/DPUPostRAFusionPass: " << MF.getName()
+                    << " **********\n\n");
+
+  // Every block is visited, even after a first successful fusion.
+  const auto &ST = static_cast<const DPUSubtarget &>(MF.getSubtarget());
+  auto &TII = *ST.getInstrInfo();
+  bool Modified = false;
+  for (MachineBasicBlock &Block : MF) {
+    if (runOnMachineBB(&Block, TII))
+      Modified = true;
+  }
+
+  LLVM_DEBUG(dbgs() << "********** DPU/DPUPostRAFusionPass: " << MF.getName()
+                    << " done: changeMade = " << Modified << " **********\n\n");
+  return Modified;
+}
diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.cpp b/llvm/lib/Target/DPU/DPURegisterInfo.cpp
index 778ac2343a5c4..705b05ca0e746 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.cpp
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.cpp
@@ -50,28 +50,41 @@ DPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
 BitVector DPURegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector reserved = BitVector(getNumRegs());
-  reserved.set(DPU::D22);
-  reserved.set(DPU::R22);
-  reserved.set(DPU::R23);
-  reserved.set(DPU::ZERO);
-  reserved.set(DPU::ONE);
-  reserved.set(DPU::LNEG);
-  reserved.set(DPU::MNEG);
-  reserved.set(DPU::ID);
-  reserved.set(DPU::ID2);
-  reserved.set(DPU::ID4);
-  reserved.set(DPU::ID8);
-  reserved.set(DPU::MAJ_D22);
-  reserved.set(DPU::MAJ_R22);
-  reserved.set(DPU::MAJ_R23);
-  reserved.set(DPU::MAJ_ZERO);
-  reserved.set(DPU::MAJ_ONE);
-  reserved.set(DPU::MAJ_LNEG);
-  reserved.set(DPU::MAJ_MNEG);
-  reserved.set(DPU::MAJ_ID);
-  reserved.set(DPU::MAJ_ID2);
-  reserved.set(DPU::MAJ_ID4);
-  reserved.set(DPU::MAJ_ID8);
+  
+  markSuperRegs(reserved, DPU::D22);
+  markSuperRegs(reserved, DPU::R22);
+  markSuperRegs(reserved, DPU::R23);
+  markSuperRegs(reserved, DPU::ZERO);
+  markSuperRegs(reserved, DPU::ONE);
+  markSuperRegs(reserved, DPU::LNEG);
+  markSuperRegs(reserved, DPU::MNEG);
+  markSuperRegs(reserved, DPU::ID);
+  markSuperRegs(reserved, DPU::ID2);
+  markSuperRegs(reserved, DPU::ID4);
+  markSuperRegs(reserved, DPU::ID8);
+  assert(checkAllSuperRegsMarked(reserved));
+  // reserved.set(DPU::D22);
+  // reserved.set(DPU::R22);
+  // reserved.set(DPU::R23);
+  // reserved.set(DPU::ZERO);
+  // reserved.set(DPU::ONE);
+  // reserved.set(DPU::LNEG);
+  // reserved.set(DPU::MNEG);
+  // reserved.set(DPU::ID);
+  // reserved.set(DPU::ID2);
+  // reserved.set(DPU::ID4);
+  // reserved.set(DPU::ID8);
+  // reserved.set(DPU::MAJ_D22);
+  // reserved.set(DPU::MAJ_R22);
+  // reserved.set(DPU::MAJ_R23);
+  // reserved.set(DPU::MAJ_ZERO);
+  // reserved.set(DPU::MAJ_ONE);
+  // reserved.set(DPU::MAJ_LNEG);
+  // reserved.set(DPU::MAJ_MNEG);
+  // reserved.set(DPU::MAJ_ID);
+  // reserved.set(DPU::MAJ_ID2);
+  // reserved.set(DPU::MAJ_ID4);
+  // reserved.set(DPU::MAJ_ID8);
   return reserved;
 }
 
@@ -167,3 +180,19 @@ DPURegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
                                       CallingConv::ID /*CC*/) const {
   return CSR_RegMask;
 }
+
+bool DPURegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
+  // Reports which physical registers are treated as constant.
+  const unsigned Reg = PhysReg;
+
+  // Registers declared under "Constant registers" in DPURegisterInfo.td.
+  const bool IsConstantReg = Reg == DPU::ZERO || Reg == DPU::ONE ||
+                             Reg == DPU::LNEG || Reg == DPU::MNEG;
+
+  // Thread id registers: the thread identification times 1, 2, 4 and 8.
+  const bool IsThreadIdReg = Reg == DPU::ID || Reg == DPU::ID2 ||
+                             Reg == DPU::ID4 || Reg == DPU::ID8;
+
+  // Anything else is reported as non-constant.
+  return IsConstantReg || IsThreadIdReg;
+}
diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.h b/llvm/lib/Target/DPU/DPURegisterInfo.h
index 5d769d6a0d9d7..25d9c575a3967 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.h
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.h
@@ -37,6 +37,8 @@ struct DPURegisterInfo : public DPUGenRegisterInfo {
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID) const override;
 
+  bool isConstantPhysReg(MCRegister PhysReg) const override;
+
   bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
   }
diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.td b/llvm/lib/Target/DPU/DPURegisterInfo.td
index caa0d84670555..06c44a9aaeac2 100644
--- a/llvm/lib/Target/DPU/DPURegisterInfo.td
+++ b/llvm/lib/Target/DPU/DPURegisterInfo.td
@@ -56,31 +56,31 @@ def R22 : DPUReg<22, "r22">, DwarfRegNum<[22]>;
 // R23: reserved as the return address for functions
 def R23 : DPUReg<23, "r23">, DwarfRegNum<[23]>;
 
-// Thread data registers
-def MAJ_R0 : DPUReg<0, "R0", [], [R0]>, DwarfRegNum<[0]>;
-def MAJ_R1 : DPUReg<1, "R1", [], [R1]>, DwarfRegNum<[1]>;
-def MAJ_R2 : DPUReg<2, "R2", [], [R2]>, DwarfRegNum<[2]>;
-def MAJ_R3 : DPUReg<3, "R3", [], [R3]>, DwarfRegNum<[3]>;
-def MAJ_R4 : DPUReg<4, "R4", [], [R4]>, DwarfRegNum<[4]>;
-def MAJ_R5 : DPUReg<5, "R5", [], [R5]>, DwarfRegNum<[5]>;
-def MAJ_R6 : DPUReg<6, "R6", [], [R6]>, DwarfRegNum<[6]>;
-def MAJ_R7 : DPUReg<7, "R7", [], [R7]>, DwarfRegNum<[7]>;
-def MAJ_R8 : DPUReg<8, "R8", [], [R8]>, DwarfRegNum<[8]>;
-def MAJ_R9 : DPUReg<9, "R9", [], [R9]>, DwarfRegNum<[9]>;
-def MAJ_R10 : DPUReg<10, "R10", [], [R10]>, DwarfRegNum<[10]>;
-def MAJ_R11 : DPUReg<11, "R11", [], [R11]>, DwarfRegNum<[11]>;
-def MAJ_R12 : DPUReg<12, "R12", [], [R12]>, DwarfRegNum<[12]>;
-def MAJ_R13 : DPUReg<13, "R13", [], [R13]>, DwarfRegNum<[13]>;
-def MAJ_R14 : DPUReg<14, "R14", [], [R14]>, DwarfRegNum<[14]>;
-def MAJ_R15 : DPUReg<15, "R15", [], [R15]>, DwarfRegNum<[15]>;
-def MAJ_R16 : DPUReg<16, "R16", [], [R16]>, DwarfRegNum<[16]>;
-def MAJ_R17 : DPUReg<17, "R17", [], [R17]>, DwarfRegNum<[17]>;
-def MAJ_R18 : DPUReg<18, "R18", [], [R18]>, DwarfRegNum<[18]>;
-def MAJ_R19 : DPUReg<19, "R19", [], [R19]>, DwarfRegNum<[19]>;
-def MAJ_R20 : DPUReg<20, "R20", [], [R20]>, DwarfRegNum<[20]>;
-def MAJ_R21 : DPUReg<21, "R21", [], [R21]>, DwarfRegNum<[21]>;
-def MAJ_R22 : DPUReg<22, "R22", [], [R22]>, DwarfRegNum<[22]>;
-def MAJ_R23 : DPUReg<23, "R23", [], [R23]>, DwarfRegNum<[23]>;
+// // Thread data registers
+// def MAJ_R0 : DPUReg<0, "R0", [], [R0]>, DwarfRegNum<[0]>;
+// def MAJ_R1 : DPUReg<1, "R1", [], [R1]>, DwarfRegNum<[1]>;
+// def MAJ_R2 : DPUReg<2, "R2", [], [R2]>, DwarfRegNum<[2]>;
+// def MAJ_R3 : DPUReg<3, "R3", [], [R3]>, DwarfRegNum<[3]>;
+// def MAJ_R4 : DPUReg<4, "R4", [], [R4]>, DwarfRegNum<[4]>;
+// def MAJ_R5 : DPUReg<5, "R5", [], [R5]>, DwarfRegNum<[5]>;
+// def MAJ_R6 : DPUReg<6, "R6", [], [R6]>, DwarfRegNum<[6]>;
+// def MAJ_R7 : DPUReg<7, "R7", [], [R7]>, DwarfRegNum<[7]>;
+// def MAJ_R8 : DPUReg<8, "R8", [], [R8]>, DwarfRegNum<[8]>;
+// def MAJ_R9 : DPUReg<9, "R9", [], [R9]>, DwarfRegNum<[9]>;
+// def MAJ_R10 : DPUReg<10, "R10", [], [R10]>, DwarfRegNum<[10]>;
+// def MAJ_R11 : DPUReg<11, "R11", [], [R11]>, DwarfRegNum<[11]>;
+// def MAJ_R12 : DPUReg<12, "R12", [], [R12]>, DwarfRegNum<[12]>;
+// def MAJ_R13 : DPUReg<13, "R13", [], [R13]>, DwarfRegNum<[13]>;
+// def MAJ_R14 : DPUReg<14, "R14", [], [R14]>, DwarfRegNum<[14]>;
+// def MAJ_R15 : DPUReg<15, "R15", [], [R15]>, DwarfRegNum<[15]>;
+// def MAJ_R16 : DPUReg<16, "R16", [], [R16]>, DwarfRegNum<[16]>;
+// def MAJ_R17 : DPUReg<17, "R17", [], [R17]>, DwarfRegNum<[17]>;
+// def MAJ_R18 : DPUReg<18, "R18", [], [R18]>, DwarfRegNum<[18]>;
+// def MAJ_R19 : DPUReg<19, "R19", [], [R19]>, DwarfRegNum<[19]>;
+// def MAJ_R20 : DPUReg<20, "R20", [], [R20]>, DwarfRegNum<[20]>;
+// def MAJ_R21 : DPUReg<21, "R21", [], [R21]>, DwarfRegNum<[21]>;
+// def MAJ_R22 : DPUReg<22, "R22", [], [R22]>, DwarfRegNum<[22]>;
+// def MAJ_R23 : DPUReg<23, "R23", [], [R23]>, DwarfRegNum<[23]>;
 
 // Thread data registers, extended to 64 bits.
 let SubRegIndices = [sub_32bit, sub_32bit_hi], CoveredBySubRegs = 1 in {
@@ -97,39 +97,39 @@ let SubRegIndices = [sub_32bit, sub_32bit_hi], CoveredBySubRegs = 1 in {
   def D20 : DPUReg<20, "d20", [R21, R20]>;
   def D22 : DPUReg<22, "d22", [R23, R22]>;
 
-  def MAJ_D0 : DPUReg<0, "D0", [MAJ_R1, MAJ_R0], [D0]>;
-  def MAJ_D2 : DPUReg<2, "D2", [MAJ_R3, MAJ_R2], [D2]>;
-  def MAJ_D4 : DPUReg<4, "D4", [MAJ_R5, MAJ_R4], [D4]>;
-  def MAJ_D6 : DPUReg<6, "D6", [MAJ_R7, MAJ_R6], [D6]>;
-  def MAJ_D8 : DPUReg<8, "D8", [MAJ_R9, MAJ_R8], [D8]>;
-  def MAJ_D10 : DPUReg<10, "D10", [MAJ_R11, MAJ_R10], [D10]>;
-  def MAJ_D12 : DPUReg<12, "D12", [MAJ_R13, MAJ_R12], [D12]>;
-  def MAJ_D14 : DPUReg<14, "D14", [MAJ_R15, MAJ_R14], [D14]>;
-  def MAJ_D16 : DPUReg<16, "D16", [MAJ_R17, MAJ_R16], [D16]>;
-  def MAJ_D18 : DPUReg<18, "D18", [MAJ_R19, MAJ_R18], [D18]>;
-  def MAJ_D20 : DPUReg<20, "D20", [MAJ_R21, MAJ_R20], [D20]>;
-  def MAJ_D22 : DPUReg<22, "D22", [MAJ_R23, MAJ_R22], [D22]>;
+  // def MAJ_D0 : DPUReg<0, "D0", [MAJ_R1, MAJ_R0], [D0]>;
+  // def MAJ_D2 : DPUReg<2, "D2", [MAJ_R3, MAJ_R2], [D2]>;
+  // def MAJ_D4 : DPUReg<4, "D4", [MAJ_R5, MAJ_R4], [D4]>;
+  // def MAJ_D6 : DPUReg<6, "D6", [MAJ_R7, MAJ_R6], [D6]>;
+  // def MAJ_D8 : DPUReg<8, "D8", [MAJ_R9, MAJ_R8], [D8]>;
+  // def MAJ_D10 : DPUReg<10, "D10", [MAJ_R11, MAJ_R10], [D10]>;
+  // def MAJ_D12 : DPUReg<12, "D12", [MAJ_R13, MAJ_R12], [D12]>;
+  // def MAJ_D14 : DPUReg<14, "D14", [MAJ_R15, MAJ_R14], [D14]>;
+  // def MAJ_D16 : DPUReg<16, "D16", [MAJ_R17, MAJ_R16], [D16]>;
+  // def MAJ_D18 : DPUReg<18, "D18", [MAJ_R19, MAJ_R18], [D18]>;
+  // def MAJ_D20 : DPUReg<20, "D20", [MAJ_R21, MAJ_R20], [D20]>;
+  // def MAJ_D22 : DPUReg<22, "D22", [MAJ_R23, MAJ_R22], [D22]>;
 }
 
 // Constant registers.
 def ZERO: DPUReg<24, "zero">;
-def MAJ_ZERO: DPUReg<24, "ZERO", [], [ZERO]>;
+// def MAJ_ZERO: DPUReg<24, "ZERO", [], [ZERO]>;
 def ONE: DPUReg<25, "one">;
-def MAJ_ONE: DPUReg<25, "ONE", [], [ONE]>;
+// def MAJ_ONE: DPUReg<25, "ONE", [], [ONE]>;
 def LNEG: DPUReg<26, "lneg">;
-def MAJ_LNEG: DPUReg<26, "LNEG", [], [LNEG]>;
+// def MAJ_LNEG: DPUReg<26, "LNEG", [], [LNEG]>;
 def MNEG:  DPUReg<27, "mneg">;
-def MAJ_MNEG: DPUReg<27, "MNEG", [], [MNEG]>;
+// def MAJ_MNEG: DPUReg<27, "MNEG", [], [MNEG]>;
 // Thread id registers. Return the thread identification for the
 // current thread, times 1, 2, 4, 8.
 def ID:    DPUReg<28, "id">;
 def ID2:   DPUReg<29, "id2">;
 def ID4:   DPUReg<30, "id4">;
 def ID8:   DPUReg<31, "id8">;
-def MAJ_ID:    DPUReg<28, "ID", [], [ID]>;
-def MAJ_ID2:   DPUReg<29, "ID2", [], [ID2]>;
-def MAJ_ID4:   DPUReg<30, "ID4", [], [ID4]>;
-def MAJ_ID8:   DPUReg<31, "ID8", [], [ID8]>;
+// def MAJ_ID:    DPUReg<28, "ID", [], [ID]>;
+// def MAJ_ID2:   DPUReg<29, "ID2", [], [ID2]>;
+// def MAJ_ID4:   DPUReg<30, "ID4", [], [ID4]>;
+// def MAJ_ID8:   DPUReg<31, "ID8", [], [ID8]>;
 
 // Define the register class representing this bank of general
 // purpose registers used by ONE thread.
@@ -139,16 +139,41 @@ def MAJ_ID8:   DPUReg<31, "ID8", [], [ID8]>;
 // that can be used as an instruction operand.
 // Hide the reserved registers, so that we are very sure that the compiler will
 // not do anything with them.
-def GP_REG    : RegisterClass<"DPU", [i32], 32, (add (sequence "R%u", 0, 23), (sequence "MAJ_R%u", 0, 23))>;
-def CONST_REG : RegisterClass<"DPU", [i32], 32, (add ZERO, ONE, LNEG, MNEG, MAJ_ZERO, MAJ_ONE, MAJ_LNEG, MAJ_MNEG)>;
-def ID_REG    : RegisterClass<"DPU", [i32], 32, (add ID, ID2, ID4, ID8, MAJ_ID, MAJ_ID2, MAJ_ID4, MAJ_ID8)>;
-def ZERO_REG  : RegisterClass<"DPU", [i32], 32, (add ZERO, MAJ_ZERO)>;
+def GP_REG    : RegisterClass<"DPU", [i32], 32, (add (sequence "R%u", 0, 23)
+// , (sequence "MAJ_R%u", 0, 23)
+// , ZERO
+// , ONE
+// , LNEG // <-- there is an issue with this one: `lsr_add r2, lneg, r2, 3` seems to be understood as `sats r2, r2` (encoding problem?)
+// , MNEG // <-- this one has the same issue
+// In fact, they cause more trouble for now.
+// They are probably not well specified elsewhere,
+// or encoding/decoding is not properly tested with register constraints.
+// This needs to be checked,
+// because register coalescing could be really interesting here:
+// `move $d/r, 0/1/-1` instructions could potentially be removed.
+// This will be checked later; correctness comes first.
+)>;
+
+def CONST_REG : RegisterClass<"DPU", [i32], 32, (add ZERO, ONE, LNEG, MNEG
+// , MAJ_ZERO, MAJ_ONE, MAJ_LNEG, MAJ_MNEG
+)>;
+
+def ID_REG    : RegisterClass<"DPU", [i32], 32, (add ID, ID2, ID4, ID8
+//, MAJ_ID, MAJ_ID2, MAJ_ID4, MAJ_ID8
+)>;
+
+def ZERO_REG  : RegisterClass<"DPU", [i32], 32, (add ZERO
+// , MAJ_ZERO
+)>;
+
 def OP_REG    : RegisterClass<"DPU", [i32], 32, (add GP_REG, CONST_REG, ID_REG)>;
 def GPZ_REG   : RegisterClass<"DPU", [i32], 32, (add GP_REG, ZERO_REG)>;
 
 // 64 bits registers are the combinations of 2 consecutive registers.
 def GP64_REG  : RegisterClass<"DPU", [i64], 64,
-                          (add D0, D2, D4, D6, D8, D10, D12, D14, D16, D18, D20, D22, MAJ_D0, MAJ_D2, MAJ_D4, MAJ_D6, MAJ_D8, MAJ_D10, MAJ_D12, MAJ_D14, MAJ_D16, MAJ_D18, MAJ_D20, MAJ_D22)>;
+                          (add D0, D2, D4, D6, D8, D10, D12, D14, D16, D18, D20, D22
+			  // , MAJ_D0, MAJ_D2, MAJ_D4, MAJ_D6, MAJ_D8, MAJ_D10, MAJ_D12, MAJ_D14, MAJ_D16, MAJ_D18, MAJ_D20, MAJ_D22
+			  )>;
 
 def S0:   DPUReg<0, "s0">;
 def S1:   DPUReg<1, "s1">;
diff --git a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
index bbfb4fec0d67e..cdbe91cbc44d3 100644
--- a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
+++ b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp
@@ -7,6 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 
+// NOTE: this expansion could possibly be moved earlier in the pipeline.
+//   All simple arithmetic could be expanded in EmitInstrWithCustomInserter, before register allocation and other optimizations.
+//   Here some options had to be added again, because the instructions are tweaked post-RA.
+//   If they were expressed directly during ISEL, we would benefit from more natural optimizations earlier.
+//   There is also the possibility of using FastISel or GlobalISel instead of InstructionSel.
+
+// TODO: expand the test cases for the block-splicing logic, covering
+//       need_splice = 0/1  x  canFallThrough = 0/1,
+//       and/or resolve Jcc and Setcc earlier as well.
+
 #include "DPU.h"
 #include "DPUInstrInfo.h"
 #include "DPUSubtarget.h"
@@ -119,6 +129,13 @@ static void resolve64BitImmediateAluInstruction(
     MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter,
     const DPUInstrInfo &InstrInfo, unsigned int LsbOpcode,
     unsigned int MsbOpcode) {
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+    });
+
   MachineFunction *MF = MBB->getParent();
   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
 
@@ -143,12 +160,23 @@ static void resolve64BitImmediateAluInstruction(
           MSBDestReg)
       .addReg(MSBDOp1Reg)
       .addImm(MSBOp2Imm);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
 }
 
 static void resolve64BitRegisterAluInstruction(
     MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter,
     const DPUInstrInfo &InstrInfo, unsigned int LsbOpcode,
     unsigned int MsbOpcode) {
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+    });
   MachineFunction *MF = MBB->getParent();
   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
 
@@ -165,14 +193,33 @@ static void resolve64BitRegisterAluInstruction(
   unsigned int LSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit);
   unsigned int MSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit_hi);
 
-  BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(LsbOpcode),
+  MachineInstrBuilder MIBDestLsb;
+  MIBDestLsb = BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(LsbOpcode),
           LSBDestReg)
       .addReg(LSBDOp1Reg)
       .addReg(LSBOp2Reg);
-  BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(MsbOpcode),
+
+  MachineInstrBuilder MIBDestMsb;
+  MIBDestMsb = BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(MsbOpcode),
           MSBDestReg)
       .addReg(MSBDOp1Reg)
       .addReg(MSBOp2Reg);
+
+  for (unsigned i = 0; i < 3; i++) {
+    if (MBBIter->getOperand(i).isRenamable()) {
+      MIBDestLsb->getOperand(i).setIsRenamable();
+      MIBDestMsb->getOperand(i).setIsRenamable();
+    }
+    if (MBBIter->getOperand(i).isKill()) {
+      MIBDestLsb->getOperand(i).setIsKill();
+      MIBDestMsb->getOperand(i).setIsKill();
+    }
+  }
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
 }
 
 static void resolveJeq64(MachineBasicBlock *MBB,
@@ -181,21 +228,48 @@ static void resolveJeq64(MachineBasicBlock *MBB,
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   MachineFunction::iterator I = ++MBB->getIterator();
   MachineFunction *F = MBB->getParent();
+
+  bool need_splice = std::next(MBBIter) != MBB->end();
+
+  MachineBasicBlock *FTMBB = MBB->getFallThrough();
+  MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB();
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** canFallThrough: " << MBB->canFallThrough() << "\n";
+      if (MBB->canFallThrough()) {
+	dbgs() << "** FTMBB: "; FTMBB->dump();
+      }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
+
   MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *endMBB;
+
   F->insert(I, trueMBB);
-  F->insert(I, endMBB);
-  // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  if (need_splice) {
+    endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(I, endMBB);
+    // Update machine-CFG edges by transferring all successors of the current
+    // block to the new block which will contain the Phi node for the select.
+    endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
+    endMBB->transferSuccessorsAndUpdatePHIs(MBB);
+    MBB->addSuccessor(endMBB);
+    endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
+  } else {
+    endMBB = FTMBB;
+    MBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
+  }
+
   // Next, add the true and fallthrough blocks as its successors.
-  auto JumpMBB = MBBIter->getOperand(3).getMBB();
   MBB->addSuccessor(trueMBB);
-  MBB->addSuccessor(endMBB);
   trueMBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(endMBB);
-
+  
   unsigned int Op1Reg = MBBIter->getOperand(1).getReg();
   unsigned int Op2Reg = MBBIter->getOperand(2).getReg();
 
@@ -215,6 +289,20 @@ static void resolveJeq64(MachineBasicBlock *MBB,
       .addReg(MsbOp1Reg)
       .addReg(MsbOp2Reg)
       .addMBB(JumpMBB);
+
+  trueMBB->addLiveIn(MsbOp1Reg);
+  trueMBB->addLiveIn(MsbOp2Reg);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** trueMBB: "; trueMBB->dump();
+      if (endMBB) { dbgs() << "** endMBB: "; endMBB->dump(); } // aliases FTMBB when no splice was needed
+      if (FTMBB) { dbgs() << "** FTMBB: "; FTMBB->dump(); } // getFallThrough() may have returned null
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
 }
 
 static void resolveJneq64(MachineBasicBlock *MBB,
@@ -223,18 +311,44 @@ static void resolveJneq64(MachineBasicBlock *MBB,
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   MachineFunction::iterator I = ++MBB->getIterator();
   MachineFunction *F = MBB->getParent();
+
+  bool need_splice = std::next(MBBIter) != MBB->end();
+  bool canFallThrough = MBB->canFallThrough();
+  MachineBasicBlock *FTMBB = MBB->getFallThrough();
+  MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB();
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** canFallThrough: " << canFallThrough << "\n";
+      if (canFallThrough) {
+	dbgs() << "** FTMBB: "; FTMBB->dump();
+      }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
+
   MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *endMBB;
   F->insert(I, trueMBB);
-  F->insert(I, endMBB);
-  // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(MBB);
-  // Next, add the true and fallthrough blocks as its successors.
-  auto JumpMBB = MBBIter->getOperand(3).getMBB();
+
+  if (need_splice) {
+    endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(I, endMBB);
+    // Update machine-CFG edges by transferring all successors of the current
+    // block to the new block which will contain the Phi node for the select.
+    endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end());
+    endMBB->transferSuccessorsAndUpdatePHIs(MBB);
+    MBB->addSuccessor(JumpMBB);
+    endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true);
+  } else {
+    endMBB = FTMBB;
+    MBB->removeSuccessor(endMBB, /* NormalizeSuccProbs = */ true);
+  }
+
   MBB->addSuccessor(trueMBB);
-  MBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(JumpMBB);
   trueMBB->addSuccessor(endMBB);
 
@@ -257,12 +371,35 @@ static void resolveJneq64(MachineBasicBlock *MBB,
       .addReg(MsbOp1Reg)
       .addReg(MsbOp2Reg)
       .addMBB(JumpMBB);
+  
+  trueMBB->addLiveIn(MsbOp1Reg);
+  trueMBB->addLiveIn(MsbOp2Reg);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** need_splice: " << need_splice << "\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "** trueMBB: "; trueMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      if (canFallThrough) {
+	dbgs() << "** FTMBB: "; FTMBB->dump();
+      }
+      dbgs() << "** JumpMBB: "; JumpMBB->dump();
+      dbgs() << "****** \n";
+    });
 }
 
 static void resolveJcc64AsSub64(MachineBasicBlock *MBB,
                                 MachineBasicBlock::iterator MBBIter,
                                 const DPUInstrInfo &InstrInfo,
                                 DPUAsmCondition::Condition Cond) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+  
   unsigned int Op1Reg = MBBIter->getOperand(1).getReg();
   unsigned int Op2Reg = MBBIter->getOperand(2).getReg();
   auto JumpMBB = MBBIter->getOperand(3).getMBB();
@@ -284,11 +421,20 @@ static void resolveJcc64AsSub64(MachineBasicBlock *MBB,
       .addReg(MsbOp2Reg)
       .addImm(Cond)
       .addMBB(JumpMBB);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
 }
 
 static void resolveJcc64(MachineBasicBlock *MBB,
                          MachineBasicBlock::iterator MBBIter,
                          const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+    });
+
   switch (MBBIter->getOperand(0).getImm()) {
   default:
     llvm_unreachable("invalid condition");
@@ -304,45 +450,214 @@ static void resolveJcc64(MachineBasicBlock *MBB,
     break;
   case ISD::SETOGT:
   case ISD::SETGT:
+    LLVM_DEBUG({ dbgs() << "GT " << ISD::SETOGT << " " << ISD::SETGT << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::ExtendedGreaterThanSigned);
     break;
   case ISD::SETOGE:
   case ISD::SETGE:
+    LLVM_DEBUG({ dbgs() << "GE " << ISD::SETOGE << " " << ISD::SETGE << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::GreaterOrEqualSigned);
     break;
   case ISD::SETOLT:
   case ISD::SETLT:
+    LLVM_DEBUG({ dbgs() << "LT " << ISD::SETOLT << " " << ISD::SETLT << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::LessThanSigned);
     break;
   case ISD::SETOLE:
   case ISD::SETLE:
+    LLVM_DEBUG({ dbgs() << "LE " << ISD::SETOLE << " " << ISD::SETLE << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::ExtendedLessOrEqualSigned);
     break;
   case ISD::SETUGT:
-    resolveJcc64AsSub64(
-        MBB, MBBIter, InstrInfo,
-        DPUAsmCondition::Condition::ExtendedGreaterThanUnsigned);
+    LLVM_DEBUG({ dbgs() << "UGT " << ISD::SETUGT << "\n"; });
+    resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
+			DPUAsmCondition::Condition::ExtendedGreaterThanUnsigned);
+    
     break;
   case ISD::SETUGE:
+    LLVM_DEBUG({ dbgs() << "UGE " << ISD::SETUGE << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::GreaterOrEqualUnsigned);
     break;
   case ISD::SETULT:
+    LLVM_DEBUG({ dbgs() << "ULT " << ISD::SETULT << "\n"; });
     resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
                         DPUAsmCondition::Condition::LessThanUnsigned);
     break;
   case ISD::SETULE:
-    resolveJcc64AsSub64(
-        MBB, MBBIter, InstrInfo,
-        DPUAsmCondition::Condition::ExtendedLessOrEqualUnsigned);
+    LLVM_DEBUG({ dbgs() << "ULE " << ISD::SETULE << "\n"; });
+    resolveJcc64AsSub64(MBB, MBBIter, InstrInfo,
+			DPUAsmCondition::Condition::ExtendedLessOrEqualUnsigned);
     break;
   }
 }
 
+// Builds the jump chosen by findJumpOpcodeForCondition before MBBIter; does not erase MBBIter.
+static void resolveJcci64(MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  const unsigned JumpOpc =
+      findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), true);
+  MachineInstrBuilder Jump =
+      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(JumpOpc));
+  Jump.add(MBBIter->getOperand(1)).add(MBBIter->getOperand(2));
+
+  // Walk operands from last down to index 3; forward the first MBB operand.
+  for (unsigned Idx = MBBIter->getNumOperands(); Idx > 3; --Idx) {
+    const MachineOperand &Candidate = MBBIter->getOperand(Idx - 1);
+    if (!Candidate.isMBB())
+      continue;
+    Jump.add(Candidate);
+    break;
+  }
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
+}
+
+// Emits two MOVEri before MBBIter, one per 32-bit half of the immediate; MBBIter is not erased.
+static void resolveMOVE64ri(MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  const TargetRegisterInfo *TRI =
+      MBB->getParent()->getSubtarget().getRegisterInfo();
+  const DebugLoc &DL = MBBIter->getDebugLoc();
+  const MCInstrDesc &MoveDesc = InstrInfo.get(DPU::MOVEri);
+
+  const unsigned Dest64 = MBBIter->getOperand(0).getReg();
+  const int64_t Imm64 = MBBIter->getOperand(1).getImm();
+  const int64_t Low32Mask = 0xFFFFFFFFLL;
+
+  // Low 32 bits of the immediate go to the sub_32bit half of the destination.
+  BuildMI(*MBB, MBBIter, DL, MoveDesc,
+          TRI->getSubReg(Dest64, DPU::sub_32bit))
+      .addImm(Imm64 & Low32Mask);
+  // Bits 32..63 of the immediate go to the sub_32bit_hi half.
+  BuildMI(*MBB, MBBIter, DL, MoveDesc,
+          TRI->getSubReg(Dest64, DPU::sub_32bit_hi))
+      .addImm((Imm64 >> 32) & Low32Mask);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
+}
+
+// Emits SUBzrr + SUBCrrrc before MBBIter for a SET64cc pseudo; MBBIter is not erased here.
+static void resolveSET64cc(MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  const TargetRegisterInfo *TRI =
+      MBB->getParent()->getSubtarget().getRegisterInfo();
+  const DebugLoc &DL = MBBIter->getDebugLoc();
+
+  // Pseudo operands: 0 = destination, 1 = condition immediate,
+  // 2 and 3 = source registers (split into sub_32bit / sub_32bit_hi below).
+  const unsigned Dest = MBBIter->getOperand(0).getReg();
+  const unsigned Lhs64 = MBBIter->getOperand(2).getReg();
+  const unsigned Rhs64 = MBBIter->getOperand(3).getReg();
+  const DPUAsmCondition::Condition SetCond = findSelect64SetConditionFor(
+      static_cast<DPUAsmCondition::Condition>(MBBIter->getOperand(1).getImm()));
+
+  const unsigned LhsLo = TRI->getSubReg(Lhs64, DPU::sub_32bit);
+  const unsigned LhsHi = TRI->getSubReg(Lhs64, DPU::sub_32bit_hi);
+  const unsigned RhsLo = TRI->getSubReg(Rhs64, DPU::sub_32bit);
+  const unsigned RhsHi = TRI->getSubReg(Rhs64, DPU::sub_32bit_hi);
+
+  // SUBzrr on the low halves, with DPU::ZERO as its first register operand.
+  MachineInstrBuilder LowSub =
+      BuildMI(*MBB, MBBIter, DL, InstrInfo.get(DPU::SUBzrr));
+  LowSub.addReg(DPU::ZERO).addReg(LhsLo).addReg(RhsLo);
+
+  // SUBCrrrc on the high halves; it writes Dest and takes SetCond as immediate.
+  MachineInstrBuilder HighSub =
+      BuildMI(*MBB, MBBIter, DL, InstrInfo.get(DPU::SUBCrrrc), Dest);
+  HighSub.addReg(LhsHi).addReg(RhsHi).addImm(SetCond);
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
+}
+
+// Builds the jump chosen by findJumpOpcodeForCondition before MBBIter; does not erase MBBIter.
+static void resolveJcc(MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  const unsigned JumpOpc =
+      findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), false);
+  MachineInstrBuilder Jump =
+      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(JumpOpc));
+  for (unsigned Idx = 1; Idx <= 3; ++Idx) // forward operands 1..3 unchanged
+    Jump.add(MBBIter->getOperand(Idx));
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
+}
+
+// Builds the jump chosen by findJumpOpcodeForCondition before MBBIter; does not erase MBBIter.
+static void resolveJcci(MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MBBIter->dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  const unsigned JumpOpc =
+      findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), true);
+  MachineInstrBuilder Jump =
+      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(JumpOpc));
+  Jump.add(MBBIter->getOperand(1)).add(MBBIter->getOperand(2));
+
+  // Walk operands from last down to index 3; forward the first MBB operand.
+  for (unsigned Idx = MBBIter->getNumOperands(); Idx > 3; --Idx) {
+    const MachineOperand &Candidate = MBBIter->getOperand(Idx - 1);
+    if (!Candidate.isMBB())
+      continue;
+    Jump.add(Candidate);
+    break;
+  }
+
+  LLVM_DEBUG({
+      dbgs() << "** instruction replaced, but still need removal\n";
+      dbgs() << "** MBB: "; MBB->dump();
+    });
+}
+
 static bool resolveMacroInstructionsInMBB(MachineBasicBlock *MBB,
                                           const DPUInstrInfo &InstrInfo) {
   bool Modified = false;
@@ -355,130 +670,77 @@ static bool resolveMacroInstructionsInMBB(MachineBasicBlock *MBB,
     default:
       InstrModified = false;
       break;
-    case DPU::Jcc: {
-      unsigned int OpCode =
-          findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), false);
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode))
-          .add(MBBIter->getOperand(1))
-          .add(MBBIter->getOperand(2))
-          .add(MBBIter->getOperand(3));
+
+    case DPU::Jcc:
+      resolveJcc(MBB, MBBIter, InstrInfo);
       break;
-    }
-    case DPU::TmpJcci:
-    case DPU::Jcci: {
-      unsigned int OpCode =
-          findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), true);
-      const MachineInstrBuilder &MIB =
-          BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode));
-      MIB.add(MBBIter->getOperand(1)).add(MBBIter->getOperand(2));
-
-      for (unsigned int i = MBBIter->getNumOperands() - 1; i >= 3; --i) {
-        MachineOperand &Operand = MBBIter->getOperand(i);
-
-        if (Operand.isMBB()) {
-          MIB.add(Operand);
-          break;
-        }
-      }
 
+    // case DPU::TmpJcci:
+    case DPU::Jcci:
+      resolveJcci(MBB, MBBIter, InstrInfo);
       break;
-    }
+
     case DPU::Jcc64:
       resolveJcc64(MBB, MBBIter, InstrInfo);
       break;
-    case DPU::SET64cc: {
-      MachineFunction *MF = MBB->getParent();
-      const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
-
-      unsigned int DestReg = MBBIter->getOperand(0).getReg();
-      auto ImmCond = static_cast<DPUAsmCondition::Condition>(
-          MBBIter->getOperand(1).getImm());
-      unsigned int Op1Reg = MBBIter->getOperand(2).getReg();
-      unsigned int Op2Reg = MBBIter->getOperand(3).getReg();
-
-      DPUAsmCondition::Condition SetCondition =
-          findSelect64SetConditionFor(ImmCond);
-
-      unsigned int LSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit);
-      unsigned int MSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit_hi);
-
-      unsigned int LSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit);
-      unsigned int MSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit_hi);
-
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::SUBzrr))
-          .addReg(DPU::ZERO)
-          .addReg(LSBDOp1Reg)
-          .addReg(LSBOp2Reg);
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(),
-              InstrInfo.get(DPU::SUBCrrrc), DestReg)
-          .addReg(MSBDOp1Reg)
-          .addReg(MSBOp2Reg)
-          .addImm(SetCondition);
 
-      break;
-    }
-    case DPU::MOVE64ri: {
-      MachineFunction *MF = MBB->getParent();
-      const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
-
-      unsigned int DestReg = MBBIter->getOperand(0).getReg();
-      int64_t Op1Imm = MBBIter->getOperand(1).getImm();
-
-      int64_t LSBOp1Imm = Op1Imm & 0xFFFFFFFFl;
-      int64_t MSBOp1Imm = (Op1Imm >> 32) & 0xFFFFFFFFl;
-      unsigned int LSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit);
-      unsigned int MSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit_hi);
-
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri),
-              LSBDestReg)
-          .addImm(LSBOp1Imm);
-      BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri),
-              MSBDestReg)
-          .addImm(MSBOp1Imm);
-      break;
-    }
-    case DPU::ADD64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrrr,
-                                         DPU::ADDCrrr);
-      break;
-    case DPU::ADD64ri:
-      resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrri,
-                                          DPU::ADDCrri);
-      break;
-    case DPU::SUB64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::SUBrrr,
-                                         DPU::SUBCrrr);
-      break;
-    case DPU::OR64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrrr,
-                                         DPU::ORrrr);
-      break;
-    case DPU::OR64ri:
-      resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrri,
-                                          DPU::ORrri);
-      break;
-    case DPU::AND64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrrr,
-                                         DPU::ANDrrr);
-      break;
-    case DPU::AND64ri:
-      resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrri,
-                                          DPU::ANDrri);
-      break;
-    case DPU::XOR64rr:
-      resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrrr,
-                                         DPU::XORrrr);
-      break;
-    case DPU::XOR64ri:
-      resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrri,
-                                          DPU::XORrri);
-      break;
+    // case DPU::Jcci64:
+    //   resolveJcci64(MBB, MBBIter, InstrInfo);
+    //   break;
+      
+    // case DPU::SET64cc:
+    //   resolveSET64cc(MBB, MBBIter, InstrInfo);
+    //   break;
+
+    // case DPU::MOVE64ri:
+    //   resolveMOVE64ri(MBB, MBBIter, InstrInfo);
+    //   break;
+
+    // case DPU::ADD64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrrr,
+    //                                      DPU::ADDCrrr);
+    //   break;
+    // case DPU::AND64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrrr,
+    //                                      DPU::ANDrrr);
+    //   break;
+    // case DPU::OR64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrrr,
+    //                                      DPU::ORrrr);
+    //   break;
+    // case DPU::SUB64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::SUBrrr,
+    //                                      DPU::SUBCrrr);
+    //   break;
+    // case DPU::XOR64rr:
+    //   resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrrr,
+    //                                      DPU::XORrrr);
+    //   break;
+
+    // case DPU::AND64ri:
+    //   resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrri,
+    //                                       DPU::ANDrri);
+    //   break;
+    // case DPU::ADD64ri:
+    //   resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrri,
+    //                                       DPU::ADDCrri);
+    //   break;
+    // case DPU::OR64ri:
+    //   resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrri,
+    //                                       DPU::ORrri);
+    //   break;
+    // case DPU::XOR64ri:
+    //   resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrri,
+    //                                       DPU::XORrri);
+    //   break;
+
     }
 
     if (InstrModified) {
       MBB->erase(MBBIter++);
-      Modified = true;
-    } else {
+      Modified |= true;
+    }
+    else {
       ++MBBIter;
     }
   }
@@ -499,5 +761,7 @@ bool DPUResolveMacroInstrPass::runOnMachineFunction(MachineFunction &MF) {
     changeMade |= resolveMacroInstructionsInMBB(MBB, InstrInfo);
   }
 
+  LLVM_DEBUG(dbgs() << "********** DPU/ResolveMacroInstrPass: " << MF.getName()
+	     << " done: changeMade = " << changeMade << " **********\n\n");
   return changeMade;
 }
diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
index 95ed30c7086ec..0b220529df968 100644
--- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp
@@ -89,13 +89,14 @@ DPUTargetLowering::DPUTargetLowering(const TargetMachine &TM, DPUSubtarget &STI)
   PredictableSelectIsExpensive = true;
   setJumpIsExpensive(false);
 
-  setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
-  setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
-  setLibcallName(RTLIB::SDIV_I32, "__div32");
-  setLibcallName(RTLIB::UDIV_I32, "__udiv32");
+  // setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+  // setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  // setLibcallName(RTLIB::SDIV_I32, "__div32");
+  // setLibcallName(RTLIB::UDIV_I32, "__udiv32");
 
   // Set up the register classes.
   addRegisterClass(MVT::i32, &DPU::GP_REGRegClass);
+  // addRegisterClass(MVT::i32, &DPU::CONST_REGRegClass);
   addRegisterClass(MVT::i64, &DPU::GP64_REGRegClass);
 
   // Compute derived properties from the register classes
@@ -226,7 +227,7 @@ DPUTargetLowering::DPUTargetLowering(const TargetMachine &TM, DPUSubtarget &STI)
   setOperationAction(ISD::BR_CC, MVT::i16, Expand);
   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
-
+  
   setOperationAction(ISD::ADDC, MVT::i8, Expand);
   setOperationAction(ISD::ADDC, MVT::i16, Expand);
   setOperationAction(ISD::ADDC, MVT::i32, Expand);
@@ -382,23 +383,23 @@ SDValue DPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerVAARG(Op, DAG);
 
   default: {
-    const char *NodeName = getTargetNodeName(Op.getOpcode());
     LLVM_DEBUG({
-      dbgs() << "FAIL: ";
-      Op.dump(&DAG);
-    });
-    if (NodeName != nullptr) {
-      LLVM_DEBUG(dbgs() << "\tnode name = " << NodeName << "\n");
-    }
-    for (unsigned eachOp = 0; eachOp < Op.getNumOperands(); eachOp++) {
-      LLVM_DEBUG({
-        dbgs() << "\toperand #" << std::to_string(eachOp) << " = ";
-        Op.getOperand(eachOp).dump(&DAG);
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "FAIL: ";
+	Op.dump(&DAG);
+	dbgs() << "\n";
+	const char *NodeName = getTargetNodeName(Op.getOpcode());
+	if (NodeName != nullptr) {
+	  dbgs() << "\tnode name = " << NodeName << "\n";
+	}
+	for (unsigned eachOp = 0; eachOp < Op.getNumOperands(); eachOp++) {
+	  dbgs() << "\toperand #" << std::to_string(eachOp) << " = ";
+	  Op.getOperand(eachOp).dump(&DAG);
+	}
       });
-    }
-  }
     report_fatal_error("NOT implemented: lowering of such a type of SDValue");
   }
+  }
 }
 
 const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -433,18 +434,18 @@ const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "DPUISD::BrCC";
   case DPUISD::BrCCi:
     return "DPUISD::BrCCi";
-  case DPUISD::BrCCZero:
-    return "DPUISD::BrCCZero";
-  case DPUISD::OrJCCZero:
-    return "DPUISD::OrJCCZero";
-  case DPUISD::AndJCCZero:
-    return "DPUISD::AndJCCZero";
-  case DPUISD::XorJCCZero:
-    return "DPUISD::XorJCCZero";
-  case DPUISD::AddJCCZero:
-    return "DPUISD::AddJCCZero";
-  case DPUISD::SubJCCZero:
-    return "DPUISD::SubJCCZero";
+  // case DPUISD::BrCCZero:
+  //   return "DPUISD::BrCCZero";
+  // case DPUISD::OrJCCZero:
+  //   return "DPUISD::OrJCCZero";
+  // case DPUISD::AndJCCZero:
+  //   return "DPUISD::AndJCCZero";
+  // case DPUISD::XorJCCZero:
+  //   return "DPUISD::XorJCCZero";
+  // case DPUISD::AddJCCZero:
+  //   return "DPUISD::AddJCCZero";
+  // case DPUISD::SubJCCZero:
+  //   return "DPUISD::SubJCCZero";
   case DPUISD::Wrapper:
     return "DPUISD::Wrapper";
   case DPUISD::TRUNC64:
@@ -491,12 +492,12 @@ const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "DPUISD::MUL16_SU";
   case DPUISD::MUL16_SS:
     return "DPUISD::MUL16_SS";
-  case DPUISD::Addc:
-    return "DPUISD::Addc";
-  case DPUISD::Subc:
-    return "DPUISD::Subc";
-  case DPUISD::Rsubc:
-    return "DPUISD::Rsubc";
+  // case DPUISD::Addc:
+  //   return "DPUISD::Addc";
+  // case DPUISD::Subc:
+  //   return "DPUISD::Subc";
+  // case DPUISD::Rsubc:
+  //   return "DPUISD::Rsubc";
   case DPUISD::Clo:
     return "DPUISD::Clo";
   case DPUISD::Cls:
@@ -515,154 +516,154 @@ const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "DPUISD::Lsr1x";
   case DPUISD::LslAdd:
     return "DPUISD::LslAdd";
-  case DPUISD::AddJcc:
-    return "DPUISD::AddJcc";
-  case DPUISD::AddNullJcc:
-    return "DPUISD::AddNullJcc";
-  case DPUISD::AddcJcc:
-    return "DPUISD::AddcJcc";
-  case DPUISD::AddcNullJcc:
-    return "DPUISD::AddcNullJcc";
-  case DPUISD::AndJcc:
-    return "DPUISD::AndJcc";
-  case DPUISD::AndNullJcc:
-    return "DPUISD::AndNullJcc";
-  case DPUISD::OrJcc:
-    return "DPUISD::OrJcc";
-  case DPUISD::OrNullJcc:
-    return "DPUISD::OrNullJcc";
-  case DPUISD::XorJcc:
-    return "DPUISD::XorJcc";
-  case DPUISD::XorNullJcc:
-    return "DPUISD::XorNullJcc";
-  case DPUISD::NandJcc:
-    return "DPUISD::NandJcc";
-  case DPUISD::NandNullJcc:
-    return "DPUISD::NandNullJcc";
-  case DPUISD::NorJcc:
-    return "DPUISD::NorJcc";
-  case DPUISD::NorNullJcc:
-    return "DPUISD::NorNullJcc";
-  case DPUISD::NxorJcc:
-    return "DPUISD::NxorJcc";
-  case DPUISD::NxorNullJcc:
-    return "DPUISD::NxorNullJcc";
-  case DPUISD::AndnJcc:
-    return "DPUISD::AndnJcc";
-  case DPUISD::AndnNullJcc:
-    return "DPUISD::AndnNullJcc";
-  case DPUISD::OrnJcc:
-    return "DPUISD::OrnJcc";
-  case DPUISD::OrnNullJcc:
-    return "DPUISD::OrnNullJcc";
-  case DPUISD::LslJcc:
-    return "DPUISD::LslJcc";
-  case DPUISD::LslNullJcc:
-    return "DPUISD::LslNullJcc";
-  case DPUISD::LslxJcc:
-    return "DPUISD::LslxJcc";
-  case DPUISD::LslxNullJcc:
-    return "DPUISD::LslxNullJcc";
-  case DPUISD::Lsl1Jcc:
-    return "DPUISD::Lsl1Jcc";
-  case DPUISD::Lsl1NullJcc:
-    return "DPUISD::Lsl1NullJcc";
-  case DPUISD::Lsl1xJcc:
-    return "DPUISD::Lsl1xJcc";
-  case DPUISD::Lsl1xNullJcc:
-    return "DPUISD::Lsl1xNullJcc";
-  case DPUISD::LsrJcc:
-    return "DPUISD::LsrJcc";
-  case DPUISD::LsrNullJcc:
-    return "DPUISD::LsrNullJcc";
-  case DPUISD::LsrxJcc:
-    return "DPUISD::LsrxJcc";
-  case DPUISD::LsrxNullJcc:
-    return "DPUISD::LsrxNullJcc";
-  case DPUISD::Lsr1Jcc:
-    return "DPUISD::Lsr1Jcc";
-  case DPUISD::Lsr1NullJcc:
-    return "DPUISD::Lsr1NullJcc";
-  case DPUISD::Lsr1xJcc:
-    return "DPUISD::Lsr1xJcc";
-  case DPUISD::Lsr1xNullJcc:
-    return "DPUISD::Lsr1xNullJcc";
-  case DPUISD::AsrJcc:
-    return "DPUISD::AsrJcc";
-  case DPUISD::AsrNullJcc:
-    return "DPUISD::AsrNullJcc";
-  case DPUISD::RolJcc:
-    return "DPUISD::RolJcc";
-  case DPUISD::RolNullJcc:
-    return "DPUISD::RolNullJcc";
-  case DPUISD::RorJcc:
-    return "DPUISD::RorJcc";
-  case DPUISD::RorNullJcc:
-    return "DPUISD::RorNullJcc";
-  case DPUISD::MUL8_UUJcc:
-    return "DPUISD::MUL8_UUJcc";
-  case DPUISD::MUL8_UUNullJcc:
-    return "DPUISD::MUL8_UUNullJcc";
-  case DPUISD::MUL8_SUJcc:
-    return "DPUISD::MUL8_SUJcc";
-  case DPUISD::MUL8_SUNullJcc:
-    return "DPUISD::MUL8_SUNullJcc";
-  case DPUISD::MUL8_SSJcc:
-    return "DPUISD::MUL8_SSJcc";
-  case DPUISD::MUL8_SSNullJcc:
-    return "DPUISD::MUL8_SSNullJcc";
-  case DPUISD::SubJcc:
-    return "DPUISD::SubJcc";
-  case DPUISD::SubNullJcc:
-    return "DPUISD::SubNullJcc";
-  case DPUISD::RsubJcc:
-    return "DPUISD::RsubJcc";
-  case DPUISD::RsubNullJcc:
-    return "DPUISD::RsubNullJcc";
-  case DPUISD::SubcJcc:
-    return "DPUISD::SubcJcc";
-  case DPUISD::SubcNullJcc:
-    return "DPUISD::SubcNullJcc";
-  case DPUISD::RsubcJcc:
-    return "DPUISD::RsubcJcc";
-  case DPUISD::RsubcNullJcc:
-    return "DPUISD::RsubcNullJcc";
-  case DPUISD::CaoJcc:
-    return "DPUISD::CaoJcc";
-  case DPUISD::CaoNullJcc:
-    return "DPUISD::CaoNullJcc";
-  case DPUISD::ClzJcc:
-    return "DPUISD::ClzJcc";
-  case DPUISD::ClzNullJcc:
-    return "DPUISD::ClzNullJcc";
-  case DPUISD::CloJcc:
-    return "DPUISD::CloJcc";
-  case DPUISD::CloNullJcc:
-    return "DPUISD::CloNullJcc";
-  case DPUISD::ClsJcc:
-    return "DPUISD::ClsJcc";
-  case DPUISD::ClsNullJcc:
-    return "DPUISD::ClsNullJcc";
-  case DPUISD::MoveJcc:
-    return "DPUISD::MoveJcc";
-  case DPUISD::MoveNullJcc:
-    return "DPUISD::MoveNullJcc";
-  case DPUISD::RolAddJcc:
-    return "DPUISD::RolAddJcc";
-  case DPUISD::RolAddNullJcc:
-    return "DPUISD::RolAddNullJcc";
-  case DPUISD::LsrAddJcc:
-    return "DPUISD::LsrAddJcc";
-  case DPUISD::LsrAddNullJcc:
-    return "DPUISD::LsrAddNullJcc";
-  case DPUISD::LslAddJcc:
-    return "DPUISD::LslAddJcc";
-  case DPUISD::LslAddNullJcc:
-    return "DPUISD::LslAddNullJcc";
-  case DPUISD::LslSubJcc:
-    return "DPUISD::LslSubJcc";
-  case DPUISD::LslSubNullJcc:
-    return "DPUISD::LslSubNullJcc";
+  // case DPUISD::AddJcc:
+  //   return "DPUISD::AddJcc";
+  // case DPUISD::AddNullJcc:
+  //   return "DPUISD::AddNullJcc";
+  // case DPUISD::AddcJcc:
+  //   return "DPUISD::AddcJcc";
+  // case DPUISD::AddcNullJcc:
+  //   return "DPUISD::AddcNullJcc";
+  // case DPUISD::AndJcc:
+  //   return "DPUISD::AndJcc";
+  // case DPUISD::AndNullJcc:
+  //   return "DPUISD::AndNullJcc";
+  // case DPUISD::OrJcc:
+  //   return "DPUISD::OrJcc";
+  // case DPUISD::OrNullJcc:
+  //   return "DPUISD::OrNullJcc";
+  // case DPUISD::XorJcc:
+  //   return "DPUISD::XorJcc";
+  // case DPUISD::XorNullJcc:
+  //   return "DPUISD::XorNullJcc";
+  // case DPUISD::NandJcc:
+  //   return "DPUISD::NandJcc";
+  // case DPUISD::NandNullJcc:
+  //   return "DPUISD::NandNullJcc";
+  // case DPUISD::NorJcc:
+  //   return "DPUISD::NorJcc";
+  // case DPUISD::NorNullJcc:
+  //   return "DPUISD::NorNullJcc";
+  // case DPUISD::NxorJcc:
+  //   return "DPUISD::NxorJcc";
+  // case DPUISD::NxorNullJcc:
+  //   return "DPUISD::NxorNullJcc";
+  // case DPUISD::AndnJcc:
+  //   return "DPUISD::AndnJcc";
+  // case DPUISD::AndnNullJcc:
+  //   return "DPUISD::AndnNullJcc";
+  // case DPUISD::OrnJcc:
+  //   return "DPUISD::OrnJcc";
+  // case DPUISD::OrnNullJcc:
+  //   return "DPUISD::OrnNullJcc";
+  // case DPUISD::LslJcc:
+  //   return "DPUISD::LslJcc";
+  // case DPUISD::LslNullJcc:
+  //   return "DPUISD::LslNullJcc";
+  // case DPUISD::LslxJcc:
+  //   return "DPUISD::LslxJcc";
+  // case DPUISD::LslxNullJcc:
+  //   return "DPUISD::LslxNullJcc";
+  // case DPUISD::Lsl1Jcc:
+  //   return "DPUISD::Lsl1Jcc";
+  // case DPUISD::Lsl1NullJcc:
+  //   return "DPUISD::Lsl1NullJcc";
+  // case DPUISD::Lsl1xJcc:
+  //   return "DPUISD::Lsl1xJcc";
+  // case DPUISD::Lsl1xNullJcc:
+  //   return "DPUISD::Lsl1xNullJcc";
+  // case DPUISD::LsrJcc:
+  //   return "DPUISD::LsrJcc";
+  // case DPUISD::LsrNullJcc:
+  //   return "DPUISD::LsrNullJcc";
+  // case DPUISD::LsrxJcc:
+  //   return "DPUISD::LsrxJcc";
+  // case DPUISD::LsrxNullJcc:
+  //   return "DPUISD::LsrxNullJcc";
+  // case DPUISD::Lsr1Jcc:
+  //   return "DPUISD::Lsr1Jcc";
+  // case DPUISD::Lsr1NullJcc:
+  //   return "DPUISD::Lsr1NullJcc";
+  // case DPUISD::Lsr1xJcc:
+  //   return "DPUISD::Lsr1xJcc";
+  // case DPUISD::Lsr1xNullJcc:
+  //   return "DPUISD::Lsr1xNullJcc";
+  // case DPUISD::AsrJcc:
+  //   return "DPUISD::AsrJcc";
+  // case DPUISD::AsrNullJcc:
+  //   return "DPUISD::AsrNullJcc";
+  // case DPUISD::RolJcc:
+  //   return "DPUISD::RolJcc";
+  // case DPUISD::RolNullJcc:
+  //   return "DPUISD::RolNullJcc";
+  // case DPUISD::RorJcc:
+  //   return "DPUISD::RorJcc";
+  // case DPUISD::RorNullJcc:
+  //   return "DPUISD::RorNullJcc";
+  // case DPUISD::MUL8_UUJcc:
+  //   return "DPUISD::MUL8_UUJcc";
+  // case DPUISD::MUL8_UUNullJcc:
+  //   return "DPUISD::MUL8_UUNullJcc";
+  // case DPUISD::MUL8_SUJcc:
+  //   return "DPUISD::MUL8_SUJcc";
+  // case DPUISD::MUL8_SUNullJcc:
+  //   return "DPUISD::MUL8_SUNullJcc";
+  // case DPUISD::MUL8_SSJcc:
+  //   return "DPUISD::MUL8_SSJcc";
+  // case DPUISD::MUL8_SSNullJcc:
+  //   return "DPUISD::MUL8_SSNullJcc";
+  // case DPUISD::SubJcc:
+  //   return "DPUISD::SubJcc";
+  // case DPUISD::SubNullJcc:
+  //   return "DPUISD::SubNullJcc";
+  // case DPUISD::RsubJcc:
+  //   return "DPUISD::RsubJcc";
+  // case DPUISD::RsubNullJcc:
+  //   return "DPUISD::RsubNullJcc";
+  // case DPUISD::SubcJcc:
+  //   return "DPUISD::SubcJcc";
+  // case DPUISD::SubcNullJcc:
+  //   return "DPUISD::SubcNullJcc";
+  // case DPUISD::RsubcJcc:
+  //   return "DPUISD::RsubcJcc";
+  // case DPUISD::RsubcNullJcc:
+  //   return "DPUISD::RsubcNullJcc";
+  // case DPUISD::CaoJcc:
+  //   return "DPUISD::CaoJcc";
+  // case DPUISD::CaoNullJcc:
+  //   return "DPUISD::CaoNullJcc";
+  // case DPUISD::ClzJcc:
+  //   return "DPUISD::ClzJcc";
+  // case DPUISD::ClzNullJcc:
+  //   return "DPUISD::ClzNullJcc";
+  // case DPUISD::CloJcc:
+  //   return "DPUISD::CloJcc";
+  // case DPUISD::CloNullJcc:
+  //   return "DPUISD::CloNullJcc";
+  // case DPUISD::ClsJcc:
+  //   return "DPUISD::ClsJcc";
+  // case DPUISD::ClsNullJcc:
+  //   return "DPUISD::ClsNullJcc";
+  // case DPUISD::MoveJcc:
+  //   return "DPUISD::MoveJcc";
+  // case DPUISD::MoveNullJcc:
+  //   return "DPUISD::MoveNullJcc";
+  // case DPUISD::RolAddJcc:
+  //   return "DPUISD::RolAddJcc";
+  // case DPUISD::RolAddNullJcc:
+  //   return "DPUISD::RolAddNullJcc";
+  // case DPUISD::LsrAddJcc:
+  //   return "DPUISD::LsrAddJcc";
+  // case DPUISD::LsrAddNullJcc:
+  //   return "DPUISD::LsrAddNullJcc";
+  // case DPUISD::LslAddJcc:
+  //   return "DPUISD::LslAddJcc";
+  // case DPUISD::LslAddNullJcc:
+  //   return "DPUISD::LslAddNullJcc";
+  // case DPUISD::LslSubJcc:
+  //   return "DPUISD::LslSubJcc";
+  // case DPUISD::LslSubNullJcc:
+  //   return "DPUISD::LslSubNullJcc";
   case DPUISD::TEST_NODE:
     return "DPUISD::TEST_NODE";
   }
@@ -1737,11 +1738,24 @@ SDValue DPUTargetLowering::LowerBrCc(SDValue Op, SelectionDAG &DAG) const {
 
   // First, let's determine if there is a constant operand we can keep as
   // immediate.
+  ConstantSDNode *LC = dyn_cast<ConstantSDNode>(leftOp);
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(rightOp);
-
+  LLVM_DEBUG({
+      dbgs() << "leftOp "; leftOp->dump();
+      dbgs() << "rightOp "; rightOp->dump();
+      if (LC) {
+	dbgs() << "a const: "; LC->dump();
+      }
+      
+      if (C) {
+	dbgs() << "a const: "; C->dump();
+      }
+    });
+  
   // todo: handle 64bit compare with immediate
-  if (!(C && isLegalICmpImmediate(C->getSExtValue())) ||
-      (rightOp.getValueType().getSimpleVT().SimpleTy == MVT::i64)) {
+  if (!(C && isLegalICmpImmediate(C->getSExtValue()))
+      || (rightOp.getValueType().getSimpleVT().SimpleTy == MVT::i64)
+      ) {
     // No suitable constant found. We cannot do anything special.
     SDValue Chain = Op.getOperand(0);
     SDLoc dl(Op);
@@ -2029,6 +2043,12 @@ static MachineBasicBlock *
 EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
                             unsigned MulLL, unsigned MulHL, unsigned MulHL2,
                             unsigned MulHH) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -2061,12 +2081,26 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   unsigned int LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned int LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  BuildMI(BB, dl, TII.get(MulLL), LLDest)
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  // BuildMI(BB, dl, TII.get(MulLL), LLDest)
+  //     .addReg(Op1)
+  //     .addReg(Op2)
+  //     .addImm(DPUAsmCondition::Small)
+  //     .addMBB(fastMBB)
+  //   // .addMetadata(N)
+  //   ;
+
+  BuildMI(BB, dl, TII.get(DPU::MUL_UL_ULrrr), LLDest)
       .addReg(Op1)
       .addReg(Op2)
-      .addImm(DPUAsmCondition::Small)
-      .addMBB(fastMBB);
-
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JLTUrii))
+    .addReg(LLDest)
+    .addImm(0x100)
+    .addMBB(fastMBB)
+    .addMetadata(N);
+  
   BuildMI(slowMBB, dl, TII.get(MulHL), HLDest).addReg(Op1).addReg(Op2);
   BuildMI(slowMBB, dl, TII.get(DPU::LSL_ADDrrri), LSL1Dest)
       .addReg(LLDest)
@@ -2092,19 +2126,35 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
       .addMBB(slowMBB);
 
   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** slowMBB: "; slowMBB->dump();
+      dbgs() << "** fastMBB: "; fastMBB->dump();
+      dbgs() << "****** \n";
+    });
+ 
   return fastMBB;
 }
 
 static MachineBasicBlock *EmitSelectWithCustomInserter(MachineInstr &MI,
                                                        MachineBasicBlock *BB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction::iterator I = ++BB->getIterator();
   MachineFunction *F = BB->getParent();
-  MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *falseMBB = F->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, trueMBB);
+  F->insert(I, falseMBB);
   F->insert(I, endMBB);
   // Update machine-CFG edges by transferring all successors of the current
   // block to the new block which will contain the Phi node for the select.
@@ -2112,82 +2162,97 @@ static MachineBasicBlock *EmitSelectWithCustomInserter(MachineInstr &MI,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
   endMBB->transferSuccessorsAndUpdatePHIs(BB);
   // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(trueMBB);
+  BB->addSuccessor(falseMBB);
   BB->addSuccessor(endMBB);
-  trueMBB->addSuccessor(endMBB);
+  falseMBB->addSuccessor(endMBB);
 
   unsigned int Dest = MI.getOperand(0).getReg();
   unsigned int CondReg = MI.getOperand(1).getReg();
   unsigned int TrueReg = MI.getOperand(2).getReg();
   unsigned int FalseReg = MI.getOperand(3).getReg();
 
-  MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned FalseResultReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-
-  BuildMI(BB, dl, TII.get(DPU::ORrrr), FalseResultReg)
+  BuildMI(BB, dl, TII.get(DPU::JEQrii))
       .addReg(CondReg)
-      .addReg(FalseReg);
-
-  BuildMI(BB, dl, TII.get(DPU::TmpJcci))
-      .addImm(ISD::CondCode::SETEQ)
-      .addReg(CondReg)
-      .addImm(0)
-      .addReg(FalseResultReg)
+      .addImm(1)
       .addMBB(endMBB);
 
-  BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
-      .addReg(TrueReg)
-      .addMBB(trueMBB)
-      .addReg(FalseResultReg)
-      .addMBB(BB);
-
-  MI.eraseFromParent(); // The pseudo instruction is gone now.
-  return endMBB;
-}
-
-static MachineBasicBlock *
-EmitSelect64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
-  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
-  DebugLoc dl = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = ++BB->getIterator();
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, trueMBB);
-  F->insert(I, endMBB);
-  // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  endMBB->splice(endMBB->begin(), BB,
-                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  endMBB->transferSuccessorsAndUpdatePHIs(BB);
-  // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(trueMBB);
-  BB->addSuccessor(endMBB);
-
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int CondReg = MI.getOperand(1).getReg();
-  unsigned int TrueReg = MI.getOperand(2).getReg();
-  unsigned int FalseReg = MI.getOperand(3).getReg();
-
-  BuildMI(BB, dl, TII.get(DPU::Jcci))
-      .addImm(ISD::CondCode::SETEQ)
-      .addReg(CondReg)
-      .addImm(0)
+  BuildMI(falseMBB, dl, TII.get(DPU::JUMPi))
       .addMBB(endMBB);
-
-  trueMBB->addSuccessor(endMBB);
-
+  
   BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
-      .addReg(TrueReg)
-      .addMBB(trueMBB)
-      .addReg(FalseReg)
-      .addMBB(BB);
+    .addReg(TrueReg).addMBB(BB)
+    .addReg(FalseReg).addMBB(falseMBB);
 
   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** falseMBB: "; falseMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      dbgs() << "****** \n";
+    });
   return endMBB;
 }
 
+// static MachineBasicBlock *
+// EmitSelect64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
+//   LLVM_DEBUG({
+//       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+//       dbgs() << "instruction to replace: "; MI.dump();
+//       dbgs() << "** BB: "; BB->dump();
+//       dbgs() << "****** \n";
+//     });
+//   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+//   DebugLoc dl = MI.getDebugLoc();
+//   const BasicBlock *LLVM_BB = BB->getBasicBlock();
+//   MachineFunction::iterator I = ++BB->getIterator();
+//   MachineFunction *F = BB->getParent();
+//   MachineBasicBlock *falseMBB = F->CreateMachineBasicBlock(LLVM_BB);
+//   MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB);
+//   F->insert(I, falseMBB);
+//   F->insert(I, endMBB);
+//   // Update machine-CFG edges by transferring all successors of the current
+//   // block to the new block which will contain the Phi node for the select.
+//   endMBB->splice(endMBB->begin(), BB,
+//                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+//   endMBB->transferSuccessorsAndUpdatePHIs(BB);
+//   // Next, add the true and fallthrough blocks as its successors.
+//   BB->addSuccessor(falseMBB);
+//   BB->addSuccessor(endMBB);
+//   falseMBB->addSuccessor(endMBB);
+
+//   unsigned int Dest = MI.getOperand(0).getReg();
+//   unsigned int CondReg = MI.getOperand(1).getReg();
+//   unsigned int TrueReg = MI.getOperand(2).getReg();
+//   unsigned int FalseReg = MI.getOperand(3).getReg();
+
+//   BuildMI(BB, dl, TII.get(DPU::Jcci))
+//       .addImm(ISD::CondCode::SETEQ)
+//       .addReg(CondReg)
+//       .addImm(1)
+//       .addMBB(endMBB);
+
+//   BuildMI(falseMBB, dl, TII.get(DPU::JUMPi))
+//       .addMBB(endMBB);
+  
+//   BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
+//     .addReg(TrueReg).addMBB(BB)
+//     .addReg(FalseReg).addMBB(falseMBB);
+
+//   MI.eraseFromParent(); // The pseudo instruction is gone now.
+//   LLVM_DEBUG({
+//       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+//       dbgs() << "instruction replaced\n";
+//       dbgs() << "** BB: "; BB->dump();
+//       dbgs() << "** falseMBB: "; falseMBB->dump();
+//       dbgs() << "** endMBB: "; endMBB->dump();
+//       dbgs() << "****** \n";
+//     });
+//   return endMBB;
+// }
+
 static MachineBasicBlock *
 EmitMramSubStoreWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
                                    unsigned int Mask, unsigned int Store) {
@@ -2373,6 +2438,13 @@ EmitMramLoadDoubleWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
 
 static MachineBasicBlock *
 EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
+
   /*
       What we want to generate (with dc.h != rb in that example):
       lslx       __R0, da.l, rb, ?sh32 @+4
@@ -2406,9 +2478,10 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   MachineRegisterInfo &RI = F->getRegInfo();
   unsigned LsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned MsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-
+  // unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  // unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned ShiftReg_check = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
   unsigned BigShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned BigShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
@@ -2426,20 +2499,53 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned Undef2Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
-  BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg)
-      .addReg(Op1Reg, 0, DPU::sub_32bit);
-
-  BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
-      .addReg(LsbOp1Reg)
-      .addReg(ShiftReg)
-      .addImm(DPUAsmCondition::Condition::Shift32)
-      .addMBB(bigShiftMBB);
-
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg)
-      .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  // BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg)
+  //     .addReg(Op1Reg, 0, DPU::sub_32bit);
+
+  // unsigned DummyReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
+  /// faulty
+  // BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg)
+  //     // .addReg(LsbOp1Reg)
+  //   .addReg(Op1Reg, 0, DPU::sub_32bit)
+  //     .addReg(ShiftReg)
+  //     .addImm(DPUAsmCondition::Condition::Shift32)
+  //     .addMBB(bigShiftMBB)
+  //   // .addMetadata(N)
+  //   ;
+
+  /// Good, but
+  // this form could increase the code size quite a bit,
+  //   because MachineSinking will sink the LSLXrrr to other places
+  //   and we will then not be able to merge these three instructions.
+  //   With shouldSink returning false for it,
+  //   I can keep them adjacent on a few examples,
+  //   but that may kill other optimizations in other code
+  //   that uses it genuinely.
+  BuildMI(BB, dl, TII.get(DPU::LSLXrrr), LsbToMsbPartReg)
+    // .addReg(LsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit)
+    .addReg(ShiftReg)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
+    .addReg(ShiftReg)
+    .addImm(0x20)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JEQrii))
+    .addReg(ShiftReg_check)
+    .addImm(0x20)
+    .addMBB(bigShiftMBB)
+    .addMetadata(N);
+  
+  // BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg)
+      // .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), MsbToMsbPartReg)
-      .addReg(MsbOp1Reg)
+      // .addReg(MsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
       .addReg(ShiftReg);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftMsbReg)
@@ -2447,13 +2553,13 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
       .addReg(LsbToMsbPartReg);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), SmallShiftLsbReg)
-      .addReg(LsbOp1Reg)
+      // .addReg(LsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit)
       .addReg(ShiftReg);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), Undef2Reg);
 
-  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG),
-          SmallShiftResultPart0Reg)
+  BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), SmallShiftResultPart0Reg)
       .addReg(Undef2Reg)
       .addReg(SmallShiftLsbReg)
       .addImm(DPU::sub_32bit);
@@ -2466,7 +2572,8 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
   BuildMI(smallShiftMBB, dl, TII.get(DPU::JUMPi)).addMBB(endMBB);
 
   BuildMI(bigShiftMBB, dl, TII.get(DPU::LSLrrr), BigShiftMsbReg)
-      .addReg(LsbOp1Reg)
+      // .addReg(LsbOp1Reg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit)
       .addReg(ShiftReg);
 
   BuildMI(bigShiftMBB, dl, TII.get(DPU::MOVEri), BigShiftLsbReg).addImm(0);
@@ -2495,6 +2602,16 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
       .addMBB(smallShiftMBB);
 
   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** smallShiftMBB: "; smallShiftMBB->dump();
+      dbgs() << "** bigShiftMBB: "; bigShiftMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      dbgs() << "****** \n";
+    });
   return endMBB;
 }
 
@@ -2618,6 +2735,13 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) {
 static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB, unsigned int shiftRight,
     unsigned int shiftRightExtended) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
+
   /*
       What we want to generate (with dc.l != rb in that example):
       lsrx    __R0, da.h, rb, ?sh32 @+4
@@ -2652,6 +2776,7 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
   unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned MsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned LsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned ShiftReg_check = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned SmallShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned SmallShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
   unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
@@ -2661,14 +2786,33 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
       RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
 
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
   BuildMI(BB, dl, TII.get(DPU::COPY), MsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit_hi);
 
-  BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
-      .addReg(MsbOp1Reg)
-      .addReg(ShiftReg)
-      .addImm(DPUAsmCondition::Condition::Shift32)
-      .addMBB(bigShiftMBB);
+  // BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg)
+  //     .addReg(MsbOp1Reg)
+  //     .addReg(ShiftReg)
+  //     .addImm(DPUAsmCondition::Condition::Shift32)
+  //     .addMBB(bigShiftMBB)
+  //   // .addMetadata(N)
+  //   ;
+
+  BuildMI(BB, dl, TII.get(DPU::LSRXrrr), MsbToLsbPartReg)
+    .addReg(MsbOp1Reg)
+    .addReg(ShiftReg)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check)
+    .addReg(ShiftReg)
+    .addImm(0x20)
+    .addMetadata(N);
+  BuildMI(BB, dl, TII.get(DPU::JEQrii))
+    .addReg(ShiftReg_check)
+    .addImm(0x20)
+    .addMBB(bigShiftMBB)
+    .addMetadata(N);
 
   BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), LsbOp1Reg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
@@ -2716,6 +2860,17 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter(
       .addMBB(smallShiftMBB);
 
   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** smallShiftMBB: "; smallShiftMBB->dump();
+      dbgs() << "** bigShiftMBB: "; bigShiftMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      dbgs() << "****** \n";
+    });
+ 
   return endMBB;
 }
 
@@ -2877,6 +3032,7 @@ EmitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
   BuildMI(*BB, MI, dl, TII.get(lsN), Op1MsbShift)
       .addReg(Op1Msb)
       .addReg(ShiftReg);
+  // should be checked
   BuildMI(*BB, MI, dl, TII.get(lsNJump), Op1LsbShift)
       .addReg(Op1Lsb)
       .addReg(ShiftReg)
@@ -3063,6 +3219,12 @@ EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB,
 
 static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI,
                                                       MachineBasicBlock *BB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "****** \n";
+    });
   /*
       What we want to generate (with dc != da in that example):
       clz.u dc, da.h ?nmax @+3
@@ -3094,120 +3256,607 @@ static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI,
   MachineRegisterInfo &RI = F->getRegInfo();
   unsigned FastResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
   unsigned SlowResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SlowResultPart1Reg =
-      RI.createVirtualRegister(&DPU::GP64_REGRegClass);
-  unsigned SlowResultPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
+  // unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  // unsigned SlowResultPart1Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  // unsigned SlowResultPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
 
-  BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
-      .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
-      .addImm(DPUAsmCondition::Condition::NotMaximum)
-      .addMBB(endMBB);
+  unsigned SlowResultReg_step = RI.createVirtualRegister(&DPU::GP64_REGRegClass);
+
+  unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned LsbAddReg = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+
+  LLVMContext &Context = F->getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  // BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg)
+  //     .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
+  //     .addImm(DPUAsmCondition::Condition::NotMaximum)
+  //     .addMBB(endMBB)
+  //   // .addMetadata(N)
+  //   ;
+
+  BuildMI(BB, dl, TII.get(DPU::CLZ_Urr), FastResultReg)
+    .addReg(Op1Reg, 0, DPU::sub_32bit_hi)
+    .addMetadata(N)
+    ;
+  BuildMI(BB, dl, TII.get(DPU::JNEQrii))
+    .addReg(FastResultReg, 0, DPU::sub_32bit)
+    .addImm(32)
+    .addMBB(endMBB)
+    .addMetadata(N)
+    ;
 
   BuildMI(msbAreZerosMBB, dl, TII.get(DPU::CLZrr), LsbClzReg)
       .addReg(Op1Reg, 0, DPU::sub_32bit);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), SlowResultPartReg)
-      .addReg(LsbClzReg)
-      .addImm(32);
+  // Previous sequence (disabled, kept for reference):
+  // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), SlowResultPartReg)
+  //     .addReg(LsbClzReg)
+  //     .addImm(32);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg);
 
-  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg)
-      .addReg(UndefReg)
-      .addReg(SlowResultPartReg)
-      .addImm(DPU::sub_32bit);
+  // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg)
+  //     .addReg(UndefReg)
+  //     .addReg(SlowResultPartReg)
+  //     .addImm(DPU::sub_32bit);
+
+  // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg)
+  //     .addReg(SlowResultPart1Reg)
+  //     .addReg(FastResultReg, 0, DPU::sub_32bit_hi)
+  //     .addImm(DPU::sub_32bit_hi);
 
+  // Sequence in use: add 32 to the low-word count, then insert it into an undef 64-bit register
+  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), LsbAddReg)
+      .addReg(LsbClzReg)
+      .addImm(32);
+
+  BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg_step)
+    .addReg(SlowResultReg_step, RegState::Undef)
+    .addReg(LsbAddReg)
+    .addImm(DPU::sub_32bit);
+  
   BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg)
-      .addReg(SlowResultPart1Reg)
+      .addReg(SlowResultReg_step)
       .addReg(FastResultReg, 0, DPU::sub_32bit_hi)
       .addImm(DPU::sub_32bit_hi);
-
+  
   BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest)
       .addReg(FastResultReg)
       .addMBB(BB)
       .addReg(SlowResultReg)
       .addMBB(msbAreZerosMBB);
 
+  
   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** BB: "; BB->dump();
+      dbgs() << "** msbAreZerosMBB: "; msbAreZerosMBB->dump();
+      dbgs() << "** endMBB: "; endMBB->dump();
+      dbgs() << "****** \n";
+    });
+
   return endMBB;
 }
 
-static MachineBasicBlock *EmitSeqreadGet(MachineInstr &MI,
-                                         MachineBasicBlock *BB, bool IsIncCst) {
-  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
-  DebugLoc dl = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = ++BB->getIterator();
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *slowMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *fastMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(I, slowMBB);
-  F->insert(I, fastMBB);
-  // Update machine-CFG edges by transferring all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  fastMBB->splice(fastMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  fastMBB->transferSuccessorsAndUpdatePHIs(BB);
-  // Next, add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(slowMBB);
-  BB->addSuccessor(fastMBB);
-  slowMBB->addSuccessor(fastMBB);
+// static MachineBasicBlock *EmitSeqreadGet(MachineInstr &MI,
+//                                          MachineBasicBlock *BB, bool IsIncCst) {
+//   LLVM_DEBUG({
+//       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+//       dbgs() << "instruction to replace: "; MI.dump();
+//       dbgs() << "IsIncCst: " << IsIncCst << "\n";
+//       dbgs() << "** BB: "; BB->dump();
+//       dbgs() << "****** \n";
+//     });
+    
+//   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+//   DebugLoc dl = MI.getDebugLoc();
+//   const BasicBlock *LLVM_BB = BB->getBasicBlock();
+//   MachineFunction::iterator I = ++BB->getIterator();
+//   MachineFunction *F = BB->getParent();
+//   MachineBasicBlock *slowMBB = F->CreateMachineBasicBlock(LLVM_BB);
+//   MachineBasicBlock *fastMBB = F->CreateMachineBasicBlock(LLVM_BB);
+//   F->insert(I, slowMBB);
+//   F->insert(I, fastMBB);
+//   // Update machine-CFG edges by transferring all successors of the current
+//   // block to the new block which will contain the Phi node for the select.
+//   fastMBB->splice(fastMBB->begin(), BB,
+//                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
+//   fastMBB->transferSuccessorsAndUpdatePHIs(BB);
+//   // Next, add the true and fallthrough blocks as its successors.
+//   BB->addSuccessor(slowMBB);
+//   BB->addSuccessor(fastMBB);
+//   slowMBB->addSuccessor(fastMBB);
+
+//   unsigned int Dest = MI.getOperand(0).getReg();
+//   unsigned int PtrInit = MI.getOperand(1).getReg();
+//   unsigned int Reader = MI.getOperand(3).getReg();
+//   unsigned int Cond = MI.getOperand(4).getImm();
+//   unsigned int PageSize = MI.getOperand(5).getImm();
+
+//   MachineRegisterInfo &RI = F->getRegInfo();
+//   unsigned int PtrIncremented = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+
+//   if (IsIncCst) {
+//     BuildMI(BB, dl, TII.get(DPU::ADDrrici), PtrIncremented)
+//         .addReg(PtrInit)
+//         .addImm(MI.getOperand(2).getImm())
+//         .addImm(Cond)
+//         .addMBB(fastMBB);
+//   } else {
+//     BuildMI(BB, dl, TII.get(DPU::ADDrrrci), PtrIncremented)
+//         .addReg(PtrInit)
+//         .addReg(MI.getOperand(2).getReg())
+//         .addImm(Cond)
+//         .addMBB(fastMBB);
+//   }
+
+//   unsigned int WramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+//   unsigned int MramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+//   unsigned int MramCacheUpdated =
+//       RI.createVirtualRegister(&DPU::GP_REGRegClass);
+//   unsigned int PtrUpdated = RI.createVirtualRegister(&DPU::GP_REGRegClass);
+//   BuildMI(slowMBB, dl, TII.get(DPU::LWrri), MramCache).addReg(Reader).addImm(4);
+//   BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), MramCacheUpdated)
+//       .addReg(MramCache)
+//       .addImm(PageSize);
+//   BuildMI(slowMBB, dl, TII.get(DPU::SWrir))
+//       .addReg(Reader)
+//       .addImm(4)
+//       .addReg(MramCacheUpdated);
+//   BuildMI(slowMBB, dl, TII.get(DPU::LWrri), WramCache).addReg(Reader).addImm(0);
+//   BuildMI(slowMBB, dl, TII.get(DPU::LDMArri))
+//       .addReg(WramCache)
+//       .addReg(MramCacheUpdated)
+//       .addImm(FormatDMASize(PageSize * 2));
+//   BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), PtrUpdated)
+//       .addReg(PtrIncremented)
+//       .addImm(-PageSize);
+
+//   BuildMI(*fastMBB, fastMBB->begin(), dl, TII.get(TargetOpcode::PHI), Dest)
+//       .addReg(PtrIncremented)
+//       .addMBB(BB)
+//       .addReg(PtrUpdated)
+//       .addMBB(slowMBB);
+
+//   MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+//   LLVM_DEBUG({
+//       dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+//       dbgs() << "instruction replaced\n";
+//       dbgs() << "** BB: "; BB->dump();
+//       dbgs() << "** slowMBB: "; slowMBB->dump();
+//       dbgs() << "** fastMBB: "; fastMBB->dump();
+//       dbgs() << "****** \n";
+//     });
+  
+//   return fastMBB;
+// }
+
+static MachineBasicBlock *EmitAlu64BitRRWithCustomInserter(MachineInstr &MI,
+							   MachineBasicBlock *MBB,
+							   unsigned LsbOpcode,
+							   unsigned MsbOpcode) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
 
-  unsigned int Dest = MI.getOperand(0).getReg();
-  unsigned int PtrInit = MI.getOperand(1).getReg();
-  unsigned int Reader = MI.getOperand(3).getReg();
-  unsigned int Cond = MI.getOperand(4).getImm();
-  unsigned int PageSize = MI.getOperand(5).getImm();
+  // Expands a 64-bit register/register ALU pseudo: LsbOpcode on the low halves, MsbOpcode on the high halves.
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineFunction &MF = *MBB->getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  // NOTE(review): the metadata node below is attached to both generated instructions; it looks like a debugging marker - confirm it is intended.
+  LLVMContext &Context = MF.getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  // Pseudo operands: 0 = 64-bit destination, 1 = 64-bit LHS, 2 = 64-bit RHS
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned LHSReg = MI.getOperand(1).getReg();
+  unsigned RHSReg = MI.getOperand(2).getReg();
+
+  // Create new 32-bit virtual registers for the lower and upper halves
+  unsigned LHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned LHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned RHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned RHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
+  // Split the 64-bit operands into 32-bit halves (sub_32bit = low, sub_32bit_hi = high)
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Lo).addReg(LHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Hi).addReg(LHSReg, 0, DPU::sub_32bit_hi);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Lo).addReg(RHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Hi).addReg(RHSReg, 0, DPU::sub_32bit_hi);
+
+  // Apply LsbOpcode to the low 32-bit halves
+  MachineInstrBuilder MIBLsb = BuildMI(*MBB, MI, DL, TII.get(LsbOpcode), Dst_Lo)
+    .addReg(LHS_Lo)
+    .addReg(RHS_Lo)
+    .addMetadata(N)
+    ;
+
+  // Apply MsbOpcode to the high 32-bit halves; presumably a carry-consuming opcode for add/sub - the choice is the caller's
+  MachineInstrBuilder MIBMsb = BuildMI(*MBB, MI, DL, TII.get(MsbOpcode), Dst_Hi)
+    .addReg(LHS_Hi)
+    .addReg(RHS_Hi)
+    .addMetadata(N)
+    ;
+
+  // Combine the two 32-bit results into a 64-bit register, starting from an IMPLICIT_DEF
+  unsigned Dstp0 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned Dstp1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp0)
+      .addReg(UndefReg)
+      .addReg(Dst_Lo)
+      .addImm(DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp1)
+      .addReg(Dstp0)
+      .addReg(Dst_Hi)
+      .addImm(DPU::sub_32bit_hi);
 
-  MachineRegisterInfo &RI = F->getRegInfo();
-  unsigned int PtrIncremented = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-
-  if (IsIncCst) {
-    BuildMI(BB, dl, TII.get(DPU::ADDrrici), PtrIncremented)
-        .addReg(PtrInit)
-        .addImm(MI.getOperand(2).getImm())
-        .addImm(Cond)
-        .addMBB(fastMBB);
-  } else {
-    BuildMI(BB, dl, TII.get(DPU::ADDrrrci), PtrIncremented)
-        .addReg(PtrInit)
-        .addReg(MI.getOperand(2).getReg())
-        .addImm(Cond)
-        .addMBB(fastMBB);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), DstReg).addReg(Dstp1);
+  // Mirror the kill flags of pseudo operands 1 and 2 onto the same-numbered operands of both generated instructions
+  for (unsigned i = 1; i < 3; i++) {
+    if (MI.getOperand(i).isKill()) {
+      MIBLsb->getOperand(i).setIsKill();
+      MIBMsb->getOperand(i).setIsKill();
+    }
   }
+  
+  // Remove the pseudo instruction
+  MI.eraseFromParent();
+  
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
 
-  unsigned int WramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int MramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int MramCacheUpdated =
-      RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  unsigned int PtrUpdated = RI.createVirtualRegister(&DPU::GP_REGRegClass);
-  BuildMI(slowMBB, dl, TII.get(DPU::LWrri), MramCache).addReg(Reader).addImm(4);
-  BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), MramCacheUpdated)
-      .addReg(MramCache)
-      .addImm(PageSize);
-  BuildMI(slowMBB, dl, TII.get(DPU::SWrir))
-      .addReg(Reader)
-      .addImm(4)
-      .addReg(MramCacheUpdated);
-  BuildMI(slowMBB, dl, TII.get(DPU::LWrri), WramCache).addReg(Reader).addImm(0);
-  BuildMI(slowMBB, dl, TII.get(DPU::LDMArri))
-      .addReg(WramCache)
-      .addReg(MramCacheUpdated)
-      .addImm(FormatDMASize(PageSize * 2));
-  BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), PtrUpdated)
-      .addReg(PtrIncremented)
-      .addImm(-PageSize);
+  return MBB;
+}
 
-  BuildMI(*fastMBB, fastMBB->begin(), dl, TII.get(TargetOpcode::PHI), Dest)
-      .addReg(PtrIncremented)
-      .addMBB(BB)
-      .addReg(PtrUpdated)
-      .addMBB(slowMBB);
+static MachineBasicBlock *EmitAlu64BitRIWithCustomInserter(MachineInstr &MI,
+							   MachineBasicBlock *MBB,
+							   unsigned LsbOpcode,
+							   unsigned MsbOpcode) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
 
-  MI.eraseFromParent(); // The pseudo instruction is gone now.
-  return fastMBB;
+  const DebugLoc &DL = MI.getDebugLoc();
+  const TargetInstrInfo &TII = *MBB->getParent()->getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MachineFunction &MF = *MBB->getParent();
+  // Expands a 64-bit register/immediate ALU pseudo: LsbOpcode on the low halves, MsbOpcode on the high halves.
+  LLVMContext &Context = MF.getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+  // NOTE(review): N is attached to both generated instructions; it looks like a debugging marker - confirm it is intended.
+  // Pseudo operands: 0 = 64-bit destination, 1 = 64-bit LHS, 2 = 64-bit immediate
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned LHSReg = MI.getOperand(1).getReg();
+  int64_t RHSImm = MI.getOperand(2).getImm();
+
+  // Create new 32-bit virtual registers for the lower and upper halves
+  unsigned LHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned LHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  
+  // Split the 64-bit register operand into 32-bit halves
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Lo).addReg(LHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Hi).addReg(LHSReg, 0, DPU::sub_32bit_hi);
+  // Split the immediate into its low and high 32-bit words; the masks keep each one a non-negative int64_t
+  int64_t RHSImmLo = RHSImm & 0xFFFFFFFFl;
+  int64_t RHSImmHi = (RHSImm >> 32) & 0xFFFFFFFFl;
+
+  // Disabled debug trace for halves equal to 0, 1, 0xffffffff or 0x80000000,
+  // which the original note flags as probably optimizable:
+  // switch (RHSImmLo) {
+  // case 0:
+  // case 1:
+  // case 0xffffffff:
+  // case 0x80000000:
+  //   LLVM_DEBUG({
+  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  // 	dbgs() << "RHSImmLo = " << RHSImmLo << " could be optimized\n";
+  //     });
+  // }
+
+  // switch (RHSImmHi) {
+  // case 0:
+  // case 1:
+  // case 0xffffffff:
+  // case 0x80000000:
+  //   LLVM_DEBUG({
+  // 	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+  // 	dbgs() << "RHSImmHi = " << RHSImmHi << " could be optimized\n";
+  //     });
+  // }
+  
+  // Apply LsbOpcode to the low half of the register and the low word of the immediate
+  MachineInstrBuilder MIBLsb = BuildMI(*MBB, MI, DL, TII.get(LsbOpcode), Dst_Lo)
+    .addReg(LHS_Lo)
+    .addImm(RHSImmLo)
+    .addMetadata(N);
+
+  // Apply MsbOpcode to the high half and the high word; presumably carry-consuming for add - the choice is the caller's
+  MachineInstrBuilder MIBMsb = BuildMI(*MBB, MI, DL, TII.get(MsbOpcode), Dst_Hi)
+    .addReg(LHS_Hi)
+    .addImm(RHSImmHi)
+    .addMetadata(N);
+
+  // Combine the two 32-bit results into a 64-bit register, starting from an IMPLICIT_DEF
+  unsigned Dstp0 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned Dstp1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp0)
+      .addReg(UndefReg)
+      .addReg(Dst_Lo)
+      .addImm(DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp1)
+      .addReg(Dstp0)
+      .addReg(Dst_Hi)
+      .addImm(DPU::sub_32bit_hi);
+
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), DstReg).addReg(Dstp1);
+  // Mirror the kill flag of the pseudo's register operand (1) onto operand 1 of both generated instructions
+  if (MI.getOperand(1).isKill()) {
+    MIBLsb->getOperand(1).setIsKill();
+    MIBMsb->getOperand(1).setIsKill();
+  }
+  
+  // Remove the pseudo instruction
+  MI.eraseFromParent();
+  
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  return MBB;
+}
+
+static MachineBasicBlock *EmitMove64RiWithCustomInserter(MachineInstr &MI,
+							 MachineBasicBlock *MBB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+  // Expands a 64-bit move-immediate pseudo: each 32-bit word is loaded with MOVEri, then both are assembled into a 64-bit register.
+  const DebugLoc &DL = MI.getDebugLoc();
+  const TargetInstrInfo &TII = *MBB->getParent()->getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  // Pseudo operands: 0 = 64-bit destination register, 1 = 64-bit immediate
+  unsigned DstReg = MI.getOperand(0).getReg();
+  int64_t RHSImm = MI.getOperand(1).getImm();
+
+  // Create new 32-bit virtual registers for the lower and upper halves
+  unsigned Dst_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  // Split the immediate into its low and high 32-bit words; the masks keep each one a non-negative int64_t
+  int64_t RHSImmLo = RHSImm & 0xFFFFFFFFl;
+  int64_t RHSImmHi = (RHSImm >> 32) & 0xFFFFFFFFl;
+
+  // Debug-only trace: report words equal to 0, 1, 0xffffffff or 0x80000000.
+  // NOTE(review): the disabled cases further down suggest these were meant to be copied from constant registers instead.
+  switch (RHSImmLo) {
+  case 0:
+  case 1:
+  case 0xffffffff:
+  case 0x80000000:
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "RHSImmLo = " << RHSImmLo << " could be optimized\n";
+      });
+  }
+
+  switch (RHSImmHi) {
+  case 0:
+  case 1:
+  case 0xffffffff:
+  case 0x80000000:
+    LLVM_DEBUG({
+	dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+	dbgs() << "RHSImmHi = " << RHSImmHi << " could be optimized\n";
+      });
+  }
+
+  // Load the low 32-bit word of the immediate with MOVEri
+  MachineInstrBuilder MIBLsb; // NOTE(review): assigned below but never read afterwards
+  // switch (RHSImmLo) {
+  // default: {
+    MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::MOVEri), Dst_Lo).addImm(RHSImmLo);
+    // break;
+  // }
+  // case 0: {
+  //   MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::ZERO);
+  //   break;
+  // }
+  // case 1: {
+  //   MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::ONE);
+  //   break;
+  // }
+  // case 0xffffffff: {
+  //   MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::LNEG);
+  //   break;
+  // }
+  // case 0x80000000: {
+  //   MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::MNEG);
+  //   break;
+  // }
+  // }
+
+  // Load the high 32-bit word of the immediate with MOVEri
+  MachineInstrBuilder MIBMsb; // NOTE(review): assigned below but never read afterwards
+  // switch (RHSImmHi) {
+  // default: {
+  MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::MOVEri), Dst_Hi).addImm(RHSImmHi);
+   //  break;
+  // }
+  // case 0: {
+  //   MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::ZERO);
+  //   break;
+  // }
+  // case 1: {
+  //   MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::ONE);
+  //   break;
+  // }
+  // case 0xffffffff: {
+  //   MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::LNEG);
+  //   break;
+  // }
+  // case 0x80000000: {
+  //   MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::MNEG);
+  //   break;
+  // }
+  // }
+
+  // Combine the two 32-bit words into a 64-bit register, starting from an IMPLICIT_DEF
+  unsigned Dstp0 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned Dstp1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  unsigned UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp0)
+      .addReg(UndefReg)
+      .addReg(Dst_Lo)
+      .addImm(DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp1)
+      .addReg(Dstp0)
+      .addReg(Dst_Hi)
+      .addImm(DPU::sub_32bit_hi);
+
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), DstReg).addReg(Dstp1);
+
+  // Remove the pseudo instruction
+  MI.eraseFromParent();
+
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  return MBB;
+}
+
+static DPUAsmCondition::Condition
+findSelect64SetConditionFor(DPUAsmCondition::Condition cond) {
+  switch (cond) { // Translates a comparison condition into the one EmitSetCC64WithCustomInserter places on SUBCrrrc.
+  default:
+    llvm_unreachable("invalid condition"); // any condition not listed below is rejected
+  case DPUAsmCondition::Condition::Zero:
+  case DPUAsmCondition::Condition::Equal:
+    return DPUAsmCondition::Condition::ExtendedZero;
+  case DPUAsmCondition::Condition::NotZero:
+  case DPUAsmCondition::Condition::NotEqual:
+    return DPUAsmCondition::Condition::ExtendedNotZero;
+  case DPUAsmCondition::Condition::GreaterThanSigned:
+    return DPUAsmCondition::Condition::ExtendedGreaterThanSigned;
+  case DPUAsmCondition::Condition::GreaterOrEqualSigned:
+    return DPUAsmCondition::Condition::GreaterOrEqualSigned; // returned unchanged, no Extended* form is used
+  case DPUAsmCondition::Condition::LessThanSigned:
+    return DPUAsmCondition::Condition::LessThanSigned; // returned unchanged
+  case DPUAsmCondition::Condition::LessOrEqualSigned:
+    return DPUAsmCondition::Condition::ExtendedLessOrEqualSigned;
+  case DPUAsmCondition::Condition::GreaterThanUnsigned:
+    return DPUAsmCondition::Condition::ExtendedGreaterThanUnsigned;
+  case DPUAsmCondition::Condition::GreaterOrEqualUnsigned:
+    return DPUAsmCondition::Condition::GreaterOrEqualUnsigned; // returned unchanged
+  case DPUAsmCondition::Condition::LessThanUnsigned:
+    return DPUAsmCondition::Condition::LessThanUnsigned; // returned unchanged
+  case DPUAsmCondition::Condition::LessOrEqualUnsigned:
+    return DPUAsmCondition::Condition::ExtendedLessOrEqualUnsigned;
+  }
+}
+
+static MachineBasicBlock *EmitSetCC64WithCustomInserter(MachineInstr &MI,
+							MachineBasicBlock *MBB) {
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction to replace: "; MI.dump();
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+  // Expands a 64-bit set-on-condition pseudo into a SUBzrr on the low halves followed by a conditional SUBCrrrc on the high halves.
+  const DebugLoc &DL = MI.getDebugLoc();
+  const TargetInstrInfo &TII = *MBB->getParent()->getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MachineFunction &MF = *MBB->getParent();
+  // NOTE(review): the metadata node below is attached to both generated instructions; it looks like a debugging marker - confirm it is intended.
+  LLVMContext &Context = MF.getFunction().getContext();
+  MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata"));
+
+  // Pseudo operands: 0 = destination, 1 = condition immediate, 2 = 64-bit LHS, 3 = 64-bit RHS
+  unsigned DstReg = MI.getOperand(0).getReg();
+  auto ImmCond = static_cast<DPUAsmCondition::Condition>(MI.getOperand(1).getImm());
+  unsigned LHSReg = MI.getOperand(2).getReg();
+  unsigned RHSReg = MI.getOperand(3).getReg();
+
+  unsigned LHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned LHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned RHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  unsigned RHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  // unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass);
+  // Split both 64-bit operands into their 32-bit halves (sub_32bit = low, sub_32bit_hi = high)
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Lo).addReg(LHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Hi).addReg(LHSReg, 0, DPU::sub_32bit_hi);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Lo).addReg(RHSReg, 0, DPU::sub_32bit);
+  BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Hi).addReg(RHSReg, 0, DPU::sub_32bit_hi);
+  
+  DPUAsmCondition::Condition SetCondition =
+    findSelect64SetConditionFor(ImmCond);
+  // NOTE(review): presumably SUBzrr only produces the borrow that SUBCrrrc consumes, and SUBCrrrc writes the boolean result - confirm against the instruction definitions.
+  MachineInstrBuilder MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::SUBzrr))
+    .addReg(DPU::ZERO)
+    .addReg(LHS_Lo)
+    .addReg(RHS_Lo)
+    .addMetadata(N);
+  MachineInstrBuilder MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::SUBCrrrc), DstReg)
+    .addReg(LHS_Hi)
+    .addReg(RHS_Hi)
+    .addImm(SetCondition)
+    .addMetadata(N);
+  // Pseudo operands 2 and 3 line up with operands 1 and 2 of both generated instructions, hence the i - 1 below
+  for (unsigned i = 2; i < 4; i++) {
+    if (MI.getOperand(i).isKill()) {
+      MIBLsb->getOperand(i - 1).setIsKill();
+      MIBMsb->getOperand(i - 1).setIsKill();
+    }
+  }
+
+  // Remove the pseudo instruction
+  MI.eraseFromParent();
+  
+  LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n";
+      dbgs() << "instruction replaced\n";
+      dbgs() << "** MBB: "; MBB->dump();
+      dbgs() << "****** \n";
+    });
+
+  return MBB;
 }
 
 MachineBasicBlock *
@@ -3215,27 +3864,39 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
   switch (MI.getOpcode()) {
   default:
+    MI.print(errs());
     llvm_unreachable("Unexpected instr type to insert");
-  case DPU::SEQREAD_GET:
-    return EmitSeqreadGet(MI, BB, false);
-  case DPU::SEQREAD_GET_CST:
-    return EmitSeqreadGet(MI, BB, true);
+  // case DPU::SEQREAD_GET:
+  //   return EmitSeqreadGet(MI, BB, false);
+  // case DPU::SEQREAD_GET_CST:
+  //   return EmitSeqreadGet(MI, BB, true);
   case DPU::Mul16UUrr:
+    LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Mul16UUrr\n";
+      });
     return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci,
                                        DPU::MUL_UH_ULrrr, DPU::MUL_UH_ULrrr,
                                        DPU::MUL_UH_UHrrr);
   case DPU::Mul16SUrr:
+    LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Mul16SUrr\n";
+      });
     return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci,
                                        DPU::MUL_SH_ULrrr, DPU::MUL_UH_ULrrr,
                                        DPU::MUL_SH_UHrrr);
   case DPU::Mul16SSrr:
+    LLVM_DEBUG({
+      dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Mul16SSrr\n";
+      });
     return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci,
                                        DPU::MUL_SH_ULrrr, DPU::MUL_SH_ULrrr,
                                        DPU::MUL_SH_SHrrr);
   case DPU::SELECTrr:
     return EmitSelectWithCustomInserter(MI, BB);
   case DPU::SELECT64rr:
-    return EmitSelect64WithCustomInserter(MI, BB);
+    // return EmitSelect64WithCustomInserter(MI, BB);
+    return EmitSelectWithCustomInserter(MI, BB);
+    
   case DPU::MRAM_STORE_BYTErm:
     return EmitMramSubStoreWithCustomInserter(MI, BB, 7, DPU::SBrir);
   case DPU::MRAM_STORE_HALFrm:
@@ -3275,6 +3936,7 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return EmitMramSubLoadWithCustomInserter(MI, BB, 4, DPU::LW_Srri);
   case DPU::MRAM_LOAD_DOUBLEmr:
     return EmitMramLoadDoubleWithCustomInserter(MI, BB);
+    
   case DPU::LSL64rr:
     return EmitLsl64RegisterWithCustomInserter(MI, BB);
   case DPU::LSL64ri:
@@ -3305,5 +3967,42 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                 DPU::LSR_ADDrrri);
   case DPU::CLZ64r:
     return EmitClz64WithCustomInserter(MI, BB);
+
+  // RR
+  // case DPU::ADD64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::ADDrrr, DPU::ADDCrrr);
+
+  // case DPU::AND64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::ANDrrr, DPU::ANDrrr);
+
+  // case DPU::OR64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::ORrrr, DPU::ORrrr);
+
+  // case DPU::SUB64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::SUBrrr, DPU::SUBCrrr);
+
+  // case DPU::XOR64rr:
+  //   return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::XORrrr, DPU::XORrrr);
+
+  // // RI
+  // case DPU::ADD64ri:
+  //   return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::ADDrri, DPU::ADDCrri);
+
+  // case DPU::AND64ri:
+  //   return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::ANDrri, DPU::ANDrri);
+
+  // case DPU::OR64ri:
+  //   return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::ORrri, DPU::ORrri);
+
+  // case DPU::XOR64ri:
+  //   return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::XORrri, DPU::XORrri);
+
+    
+  case DPU::MOVE64ri:
+    return EmitMove64RiWithCustomInserter(MI, BB);
+
+  case DPU::SET64cc:
+    return EmitSetCC64WithCustomInserter(MI, BB);
+
   }
 }
diff --git a/llvm/lib/Target/DPU/DPUTargetMachine.cpp b/llvm/lib/Target/DPU/DPUTargetMachine.cpp
index 5815b161c6ce9..734bc0d541ff3 100644
--- a/llvm/lib/Target/DPU/DPUTargetMachine.cpp
+++ b/llvm/lib/Target/DPU/DPUTargetMachine.cpp
@@ -54,6 +54,7 @@ DPUTargetMachine::DPUTargetMachine(const Target &T, const Triple &TT,
                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
       TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
+  // setRequiresStructuredCFG(true);
   initAsmInfo();
 }
 
@@ -84,6 +85,7 @@ class DPUPassConfig : public TargetPassConfig {
 
   bool addInstSelector() override;
 
+  void addPostRegAlloc() override;
   void addPreEmitPass() override;
   void addPreEmitPass2() override;
 };
@@ -103,6 +105,11 @@ bool DPUPassConfig::addInstSelector() {
   return false;
 }
 
+void DPUPassConfig::addPostRegAlloc() { // TargetPassConfig hook for passes that run after register allocation
+  DPUTargetMachine &TM = getDPUTargetMachine();
+  addPass(createDPUPostRAFusionPass(TM)); // registers the DPU post-RA fusion pass
+}
+
 void DPUPassConfig::addPreEmitPass() {
   DPUTargetMachine &TM = getDPUTargetMachine();
   addPass(createDPUMergeComboInstrPass(TM));
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 8bd3036f1fc34..336a990a046ca 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -367,13 +367,15 @@ struct ScopedSaveAliaseesAndUsed {
   }
 
   ~ScopedSaveAliaseesAndUsed() {
     appendToUsed(M, std::vector<GlobalValue *>(Used.begin(), Used.end()));
     appendToCompilerUsed(M, std::vector<GlobalValue *>(CompilerUsed.begin(),
                                                        CompilerUsed.end()));
 
-    for (auto P : FunctionAliases)
+    for (auto P : FunctionAliases) {
+      // Point each saved alias at P.second, bitcast to the alias's own type.
       P.first->setIndirectSymbol(
           ConstantExpr::getBitCast(P.second, P.first->getType()));
+    }
   }
 };
 
diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index ef9f18a2289e9..26ced977d52fc 100644
--- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -75,38 +75,40 @@ static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *>
   GlobalVariable *GV = M.getGlobalVariable(Name);
   SmallPtrSet<Constant *, 16> InitAsSet;
   SmallVector<Constant *, 16> Init;
   if (GV) {
     auto *CA = cast<ConstantArray>(GV->getInitializer());
     for (auto &Op : CA->operands()) {
       Constant *C = cast_or_null<Constant>(Op);
       if (InitAsSet.insert(C).second)
         Init.push_back(C);
     }
     GV->eraseFromParent();
   }
 
   Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
   for (auto *V : Values) {
-    Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
+    // Unlike getBitCast, this also handles a V whose address space differs
+    // from Int8PtrTy's by emitting an addrspacecast instead of a bitcast.
+    Constant *C = ConstantExpr::getPointerBitCastOrAddrSpaceCast(V, Int8PtrTy);
     if (InitAsSet.insert(C).second)
       Init.push_back(C);
   }
 
   if (Init.empty())
     return;
 
   ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
   GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                 ConstantArray::get(ATy, Init), Name);
   GV->setSection("llvm.metadata");
 }
 
 void llvm::appendToUsed(Module &M, ArrayRef<GlobalValue *> Values) {
   appendToUsedList(M, "llvm.used", Values);
 }
 
 void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
   appendToUsedList(M, "llvm.compiler.used", Values);
 }
 
 FunctionCallee