diff --git a/compiler-rt/dpu/CMakeLists.txt b/compiler-rt/dpu/CMakeLists.txt new file mode 100644 index 0000000000000..19e3c2790baf1 --- /dev/null +++ b/compiler-rt/dpu/CMakeLists.txt @@ -0,0 +1,306 @@ +cmake_minimum_required(VERSION 3.13) + +project(librt C ASM) + +set(CMAKE_AR llvm-ar) +set(CMAKE_LINKER llvm-ld) +set(CMAKE_NM llvm-nm) +set(CMAKE_OBJDUMP llvm-objdump) +set(CMAKE_RANLIB llvm-ranlib) +set(OBJCOPY llvm-objcopy) +set(CLANGFORMAT clang-format) + +set(COMPILER_RT_BUILTINS_DIR ../lib/builtins) + +set(GENERIC_SOURCES + ${COMPILER_RT_BUILTINS_DIR}/dpu/mul32.S + ${COMPILER_RT_BUILTINS_DIR}/dpu/mulsi3.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/muldi3.c + + ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.S + # ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.c optimized above + ${COMPILER_RT_BUILTINS_DIR}/dpu/div32.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/divsi3.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/modsi3.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/udivmodsi4.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/udivsi3.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/umodsi3.c + + ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv64.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/divdi3.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/moddi3.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/udivdi3.c + ${COMPILER_RT_BUILTINS_DIR}/dpu/umoddi3.c + + ${COMPILER_RT_BUILTINS_DIR}/absvdi2.c + ${COMPILER_RT_BUILTINS_DIR}/absvsi2.c + ${COMPILER_RT_BUILTINS_DIR}/adddf3.c + ${COMPILER_RT_BUILTINS_DIR}/addsf3.c + ${COMPILER_RT_BUILTINS_DIR}/addvdi3.c + ${COMPILER_RT_BUILTINS_DIR}/addvsi3.c + ${COMPILER_RT_BUILTINS_DIR}/ashldi3.c + ${COMPILER_RT_BUILTINS_DIR}/ashrdi3.c + ${COMPILER_RT_BUILTINS_DIR}/bswapdi2.c + ${COMPILER_RT_BUILTINS_DIR}/bswapsi2.c + ${COMPILER_RT_BUILTINS_DIR}/clzdi2.c + ${COMPILER_RT_BUILTINS_DIR}/clzsi2.c + ${COMPILER_RT_BUILTINS_DIR}/cmpdi2.c + ${COMPILER_RT_BUILTINS_DIR}/comparedf2.c + ${COMPILER_RT_BUILTINS_DIR}/comparesf2.c + ${COMPILER_RT_BUILTINS_DIR}/ctzdi2.c + ${COMPILER_RT_BUILTINS_DIR}/ctzsi2.c + ${COMPILER_RT_BUILTINS_DIR}/divdf3.c + 
${COMPILER_RT_BUILTINS_DIR}/divdi3.c + ${COMPILER_RT_BUILTINS_DIR}/divmoddi4.c + ${COMPILER_RT_BUILTINS_DIR}/divmodsi4.c + ${COMPILER_RT_BUILTINS_DIR}/divsf3.c + ${COMPILER_RT_BUILTINS_DIR}/divsi3.c + ${COMPILER_RT_BUILTINS_DIR}/extendsfdf2.c + ${COMPILER_RT_BUILTINS_DIR}/extendhfsf2.c + ${COMPILER_RT_BUILTINS_DIR}/ffsdi2.c + ${COMPILER_RT_BUILTINS_DIR}/ffssi2.c + ${COMPILER_RT_BUILTINS_DIR}/fixdfdi.c + ${COMPILER_RT_BUILTINS_DIR}/fixdfsi.c + ${COMPILER_RT_BUILTINS_DIR}/fixsfdi.c + ${COMPILER_RT_BUILTINS_DIR}/fixsfsi.c + ${COMPILER_RT_BUILTINS_DIR}/fixunsdfdi.c + ${COMPILER_RT_BUILTINS_DIR}/fixunsdfsi.c + ${COMPILER_RT_BUILTINS_DIR}/fixunssfdi.c + ${COMPILER_RT_BUILTINS_DIR}/fixunssfsi.c + ${COMPILER_RT_BUILTINS_DIR}/floatdidf.c + ${COMPILER_RT_BUILTINS_DIR}/floatdisf.c + ${COMPILER_RT_BUILTINS_DIR}/floatsidf.c + ${COMPILER_RT_BUILTINS_DIR}/floatsisf.c + ${COMPILER_RT_BUILTINS_DIR}/floatundidf.c + ${COMPILER_RT_BUILTINS_DIR}/floatundisf.c + ${COMPILER_RT_BUILTINS_DIR}/floatunsidf.c + ${COMPILER_RT_BUILTINS_DIR}/floatunsisf.c + ${COMPILER_RT_BUILTINS_DIR}/fp_mode.c + ${COMPILER_RT_BUILTINS_DIR}/int_util.c + ${COMPILER_RT_BUILTINS_DIR}/lshrdi3.c + ${COMPILER_RT_BUILTINS_DIR}/moddi3.c + ${COMPILER_RT_BUILTINS_DIR}/modsi3.c + ${COMPILER_RT_BUILTINS_DIR}/muldf3.c + ${COMPILER_RT_BUILTINS_DIR}/muldi3.c + ${COMPILER_RT_BUILTINS_DIR}/mulodi4.c + ${COMPILER_RT_BUILTINS_DIR}/mulosi4.c + ${COMPILER_RT_BUILTINS_DIR}/mulsf3.c + ${COMPILER_RT_BUILTINS_DIR}/mulvdi3.c + ${COMPILER_RT_BUILTINS_DIR}/mulvsi3.c + ${COMPILER_RT_BUILTINS_DIR}/negdf2.c + ${COMPILER_RT_BUILTINS_DIR}/negdi2.c + ${COMPILER_RT_BUILTINS_DIR}/negsf2.c + ${COMPILER_RT_BUILTINS_DIR}/negvdi2.c + ${COMPILER_RT_BUILTINS_DIR}/negvsi2.c + ${COMPILER_RT_BUILTINS_DIR}/paritydi2.c + ${COMPILER_RT_BUILTINS_DIR}/paritysi2.c + ${COMPILER_RT_BUILTINS_DIR}/popcountdi2.c + ${COMPILER_RT_BUILTINS_DIR}/popcountsi2.c + ${COMPILER_RT_BUILTINS_DIR}/powidf2.c + ${COMPILER_RT_BUILTINS_DIR}/powisf2.c + 
${COMPILER_RT_BUILTINS_DIR}/subdf3.c + ${COMPILER_RT_BUILTINS_DIR}/subsf3.c + ${COMPILER_RT_BUILTINS_DIR}/subvdi3.c + ${COMPILER_RT_BUILTINS_DIR}/subvsi3.c + ${COMPILER_RT_BUILTINS_DIR}/truncdfhf2.c + ${COMPILER_RT_BUILTINS_DIR}/truncdfsf2.c + ${COMPILER_RT_BUILTINS_DIR}/truncsfhf2.c + ${COMPILER_RT_BUILTINS_DIR}/ucmpdi2.c + ${COMPILER_RT_BUILTINS_DIR}/udivdi3.c + ${COMPILER_RT_BUILTINS_DIR}/udivmoddi4.c + ${COMPILER_RT_BUILTINS_DIR}/udivmodsi4.c + ${COMPILER_RT_BUILTINS_DIR}/udivsi3.c + ${COMPILER_RT_BUILTINS_DIR}/umoddi3.c + ${COMPILER_RT_BUILTINS_DIR}/umodsi3.c + ) + +set(GENERIC_TF_SOURCES + ${COMPILER_RT_BUILTINS_DIR}/addtf3.c + ${COMPILER_RT_BUILTINS_DIR}/addvti3.c + ${COMPILER_RT_BUILTINS_DIR}/absvti2.c + ${COMPILER_RT_BUILTINS_DIR}/ashrti3.c + ${COMPILER_RT_BUILTINS_DIR}/comparetf2.c + ${COMPILER_RT_BUILTINS_DIR}/clzti2.c + ${COMPILER_RT_BUILTINS_DIR}/cmpti2.c + ${COMPILER_RT_BUILTINS_DIR}/ctzti2.c + ${COMPILER_RT_BUILTINS_DIR}/divtf3.c + ${COMPILER_RT_BUILTINS_DIR}/divmodti4.c + ${COMPILER_RT_BUILTINS_DIR}/divti3.c + ${COMPILER_RT_BUILTINS_DIR}/extenddftf2.c + ${COMPILER_RT_BUILTINS_DIR}/extendhftf2.c + ${COMPILER_RT_BUILTINS_DIR}/extendsftf2.c + ${COMPILER_RT_BUILTINS_DIR}/ffsti2.c + ${COMPILER_RT_BUILTINS_DIR}/fixdfti.c + ${COMPILER_RT_BUILTINS_DIR}/fixsfti.c + ${COMPILER_RT_BUILTINS_DIR}/fixtfdi.c + ${COMPILER_RT_BUILTINS_DIR}/fixtfsi.c + ${COMPILER_RT_BUILTINS_DIR}/fixtfti.c + ${COMPILER_RT_BUILTINS_DIR}/fixunsdfti.c + ${COMPILER_RT_BUILTINS_DIR}/fixunssfti.c + ${COMPILER_RT_BUILTINS_DIR}/fixunstfdi.c + ${COMPILER_RT_BUILTINS_DIR}/fixunstfsi.c + ${COMPILER_RT_BUILTINS_DIR}/fixunstfti.c + ${COMPILER_RT_BUILTINS_DIR}/floatditf.c + ${COMPILER_RT_BUILTINS_DIR}/floatsitf.c + ${COMPILER_RT_BUILTINS_DIR}/floattidf.c + ${COMPILER_RT_BUILTINS_DIR}/floattisf.c + ${COMPILER_RT_BUILTINS_DIR}/floattitf.c + ${COMPILER_RT_BUILTINS_DIR}/floatunditf.c + ${COMPILER_RT_BUILTINS_DIR}/floatunsitf.c + ${COMPILER_RT_BUILTINS_DIR}/floatuntidf.c + 
# add_dpu_library(TARGET <name> [OPT_LEVEL <-Ox>] [LTO <-flto|-flto=thin>]
#                 [PROFILING] SOURCES <src>...)
#
# Declares one static archive built from SOURCES and installs it.
#
#   TARGET     Base name of the library target (required).
#   OPT_LEVEL  Optimisation flag, e.g. -O2.  It is passed to the compiler and,
#              with the dash removed, appended to the target name and used as
#              the first component of the install directory.
#   LTO        LTO flag, -flto or -flto=thin.  It is passed to the compiler
#              and, reduced to "lto"/"ltothin", appended to the target name and
#              used as the second component of the install directory.
#   PROFILING  When present, adds -pg and a "_pg" suffix.
#   SOURCES    Source files of the archive (required).
#
# Resulting names: <TARGET>_<opt>_<lto>[_pg].  Without LTO the name still ends
# with "_" (for example rt_O0_) and the archive is installed under
# <opt>/no_lto; compiler_rt_tests.sh links against exactly these names.
function(add_dpu_library)
  set(options PROFILING)
  set(oneValueArgs TARGET OPT_LEVEL LTO)
  set(multiValueArgs SOURCES)
  cmake_parse_arguments(arg "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  # Fail early with a readable message instead of a confusing add_library error.
  if (NOT arg_TARGET)
    message(FATAL_ERROR "add_dpu_library: TARGET is required")
  endif()
  if (NOT arg_SOURCES)
    message(FATAL_ERROR "add_dpu_library: SOURCES is required")
  endif()
  if (arg_UNPARSED_ARGUMENTS)
    message(WARNING "add_dpu_library: ignoring unknown arguments: ${arg_UNPARSED_ARGUMENTS}")
  endif()

  set(LOCAL_TARGET "${arg_TARGET}")
  set(OTHER_FLAGS -Wall -Wextra)
  # Relative install directory, built up as <opt>/<lto>.  It stays relative
  # even when OPT_LEVEL is omitted (the original produced "/no_lto").
  set(INSTALL_SUBDIR "")

  if (arg_OPT_LEVEL)
    list(APPEND OTHER_FLAGS "${arg_OPT_LEVEL}")
    string(REPLACE "-" "" opt_name "${arg_OPT_LEVEL}")
    string(APPEND LOCAL_TARGET "_${opt_name}")
    set(INSTALL_SUBDIR "${opt_name}/")
  endif()

  if (arg_LTO)
    list(APPEND OTHER_FLAGS "${arg_LTO}")
    # -flto -> lto, -flto=thin -> ltothin
    string(REPLACE "-f" "" lto_name "${arg_LTO}")
    string(REPLACE "=" "" lto_name "${lto_name}")
    string(APPEND LOCAL_TARGET "_${lto_name}")
    string(APPEND INSTALL_SUBDIR "${lto_name}")
  else()
    # Keep the trailing underscore: the test script links -lrt_<opt>_ .
    string(APPEND LOCAL_TARGET "_")
    string(APPEND INSTALL_SUBDIR "no_lto")
  endif()

  if (arg_PROFILING)
    list(APPEND OTHER_FLAGS -pg)
    string(APPEND LOCAL_TARGET "_pg")
  endif()

  list(APPEND OTHER_FLAGS -g0)
  list(APPEND OTHER_FLAGS -mllvm -verify-machineinstrs)
  # A second "-mllvm <opt>" pair cannot simply be appended here: the original
  # author noted that the repeated -mllvm gets de-duplicated.
  # list(APPEND OTHER_FLAGS -mllvm -debug)

  message(STATUS "add_dpu_library: target=${LOCAL_TARGET} flags=${OTHER_FLAGS} install=${INSTALL_SUBDIR}")

  add_library(${LOCAL_TARGET} STATIC "${arg_SOURCES}")

  target_include_directories(${LOCAL_TARGET} PRIVATE
    ${COMPILER_RT_BUILTINS_DIR}
    ${COMPILER_RT_BUILTINS_DIR}/dpu)

  target_compile_options(${LOCAL_TARGET}
    PRIVATE ${NOSTDLIB_FLAGS} ${STRICT_FLAGS} ${COMPILER_TIMESTAMP_DEF} ${OTHER_FLAGS})

  # set_target_properties(${LOCAL_TARGET} PROPERTIES OUTPUT_NAME "rt")

  install(
    TARGETS ${LOCAL_TARGET}
    ARCHIVE
    DESTINATION ${INSTALL_SUBDIR}
    )
endfunction()
# CMake toolchain file for building the DPU runtime library with dpu-clang.
#
# The deprecated CMakeForceCompiler module is intentionally not included:
# none of its CMAKE_FORCE_*_COMPILER macros are used here.  The compiler
# checks are bypassed through the *_COMPILER_WORKS variables below instead.

# set(CMAKE_ASM_SOURCE_FILE_EXTENSIONS s;S;asm)

# Bare-metal style target: no operating system is assumed.
set(CMAKE_SYSTEM_NAME Generic)
set(CMAKE_CROSSCOMPILING 1)

# The same driver handles assembly, C and C++ sources.
set(CMAKE_ASM_COMPILER dpu-clang)
set(CMAKE_C_COMPILER dpu-clang)
set(CMAKE_CXX_COMPILER dpu-clang)

# try_compile() must not attempt to link an executable for the target.
set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)

# Skip CMake's "compiler works" probes.
set(CMAKE_C_COMPILER_WORKS 1)
set(CMAKE_CXX_COMPILER_WORKS 1)
floatunditf_test.c + # floatunssidfvfp_test.c + # floatunssisfvfp_test.c + # muldc3_test.c + # ltdf2vfp_test.c + # ltsf2vfp_test.c + # gedf2vfp_test.c + # gesf2vfp_test.c + # gtdf2vfp_test.c + # gtsf2vfp_test.c + # ledf2vfp_test.c + # lesf2vfp_test.c + # muldf3vfp_test.c + # mulsf3vfp_test.c + # nedf2vfp_test.c + # negdf2vfp_test.c + # negsf2vfp_test.c + # nesf2vfp_test.c + # subdf3vfp_test.c + # subsf3vfp_test.c + # truncdfsf2vfp_test.c + # unorddf2vfp_test.c + # unordsf2vfp_test.c + # mulsc3_test.c + # mulxc3_test.c + # powixf2_test.c + # subvti3_test.c + # ucmpti2_test.c + # udivmodti4_test.c + # udivti3_test.c + # umodti3_test.c + # subtf3_test.c + # powitf2_test.c + # negvti2_test.c + # modti3_test.c + # muloti4_test.c + # multc3_test.c + # multi3_test.c + # mulvti3_test.c + # negti2_test.c + # netf2_test.c + # parityti2_test.c + # popcountti2_test.c + # fixtfdi_test.c + # fixtfsi_test.c + # fixunstfdi_test.c + # fixunstfsi_test.c + # fixunstfti_test.c + # fixunsxfdi_test.c + # fixunsxfsi_test.c + # fixunsxfti_test.c + # fixxfti_test.c + # floatdixf_test.c + # floatsitf_test.c + # floattidf_test.c + # floattisf_test.c + # floattitf_test.c + # floattixf_test.c + # floatundixf_test.c + # floatunsitf_test.c + # floatuntidf_test.c + # floatuntisf_test.c + # floatuntitf_test.c + # floatuntixf_test.c + # getf2_test.c + # gttf2_test.c + # letf2_test.c + # lshrti3_test.c + # lttf2_test.c + # multf3_test.c + # unordtf2_test.c + # trunctfdf2_test.c + # trunctfhf2_test.c + # trunctfsf2_test.c + # fixxfdi_test.c + # udivmoddi4_test.c # too big :) +# ) + +declare -a TESTS=( + test.c + absvdi2_test.c + absvsi2_test.c + addvdi3_test.c + addvsi3_test.c + ashldi3_test.c + ashrdi3_test.c + bswapdi2_test.c + bswapsi2_test.c + clzdi2_test.c + clzsi2_test.c + cmpdi2_test.c + comparedf2_test.c + comparesf2_test.c + ctzdi2_test.c + ctzsi2_test.c + divdf3_test.c + divdi3_test.c + divmodsi4_test.c + divsf3_test.c + divsi3_test.c + extendhfsf2_test.c + ffsdi2_test.c + ffssi2_test.c + 
fixdfdi_test.c + fixsfdi_test.c + fixunsdfdi_test.c + fixunsdfsi_test.c + fixunssfdi_test.c + fixunssfsi_test.c + floatdidf_test.c + floatdisf_test.c + floatundidf_test.c + floatundisf_test.c + lshrdi3_test.c + moddi3_test.c + modsi3_test.c + muldi3_test.c + mulodi4_test.c + mulosi4_test.c + mulsi3_test.c + mulvdi3_test.c + mulvsi3_test.c + negdi2_test.c + negvdi2_test.c + negvsi2_test.c + paritydi2_test.c + paritysi2_test.c + popcountdi2_test.c + popcountsi2_test.c + powidf2_test.c + powisf2_test.c + subvdi3_test.c + subvsi3_test.c + truncdfhf2_test.c + truncdfsf2_test.c + truncsfhf2_test.c + ucmpdi2_test.c + udivdi3_test.c + udivmodsi4_test.c + udivsi3_test.c + umoddi3_test.c + umodsi3_test.c +) + +declare -a OPT_LEVELS=( + O0 + # O1 + # O2 + # O3 + # Os +) + +declare -a COMPILER_OPTIONS=( + no_lto + # lto + # ltothin +) + +MYPWD=`pwd` + +mkdir -p test +cd test + +for COMPILER_OPTION in "${COMPILER_OPTIONS[@]}" +do + mkdir -p ${COMPILER_OPTION} + cd ${COMPILER_OPTION} + + case "$COMPILER_OPTION" in + "no_lto") COMPILER_OPTION_="";; + "lto") COMPILER_OPTION_="-flto";; + "ltothin") COMPILER_OPTION_="-flto=thin";; + esac + + case "$COMPILER_OPTION" in + "no_lto") COMPILER_OPTION_LIB="";; + "lto") COMPILER_OPTION_LIB="lto";; + "ltothin") COMPILER_OPTION_LIB="ltothin";; + esac + + for OPT_LEVEL in "${OPT_LEVELS[@]}" + do + mkdir -p ${OPT_LEVEL} + cd ${OPT_LEVEL} + + for TEST in "${TESTS[@]}" + do + clang --target=dpu-upmem-dpurte -mcpu=v1A \ + -I${COMPILER_RT} \ + -g0 \ + -v \ + -save-temps \ + -I ${MYPWD} \ + ${COMPILER_OPTION_} \ + -${OPT_LEVEL} \ + ${COMPILER_RT_TESTS}/${TEST} \ + -o $(basename "${TEST}" .c) \ + -L ${MYPWD}/install/${OPT_LEVEL}/${COMPILER_OPTION}/ -lrt_${OPT_LEVEL}_${COMPILER_OPTION_LIB} \ + &> `basename ${TEST}`_compiler_log.txt + + # dpu-lldb --batch --one-line run -- $(basename "${TEST}" .c) + python3 ~/work/simple_examples/lldb_python.py $(basename "${TEST}" .c) + done + cd .. + done + + cd .. +done +cd .. 
"""Run a binary under LLDB, echo its standard output, and exit with its status.

Usage: python3 lldb_python.py <binary>

The target's stdout (file descriptor 1) is redirected to a temporary file,
which is printed once the synchronous launch returns and is always removed
afterwards.  The script's exit code is the debuggee's exit status; 1 is
returned when the target cannot be created or launched, 2 on bad usage.
"""
import sys
import os
import subprocess
import dpu  # kept from the original script; presumably imported for its side effects -- TODO confirm
import lldb
import tempfile


def main(argv):
    """Launch argv[1] under LLDB and return the exit code for this script."""
    if len(argv) != 2:
        print("usage: {} <binary>".format(argv[0]), file=sys.stderr)
        return 2
    binary = argv[1]

    debugger = lldb.SBDebugger.Create()
    # Synchronous mode: Launch() returns only once the process has stopped/exited.
    debugger.SetAsync(False)

    target = debugger.CreateTarget(binary)
    if not target.IsValid():
        print("error: cannot create an LLDB target for {}".format(binary), file=sys.stderr)
        return 1

    launch_info = lldb.SBLaunchInfo(None)
    launch_info.SetWorkingDirectory(os.getcwd())

    # Only the name is needed; LLDB opens the file itself when launching.
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        stdout_path = tmp_file.name

    try:
        # fd 1 (stdout) -> stdout_path, opened write-only.
        launch_info.AddOpenFileAction(1, stdout_path, False, True)

        error = lldb.SBError()
        process = target.Launch(launch_info, error)
        if error.Fail() or not process.IsValid():
            print("error: launch failed: {}".format(error.GetCString()), file=sys.stderr)
            return 1

        with open(stdout_path, 'r') as file:
            stdout_data = file.read()
    finally:
        # Remove the capture file on every path, including launch failures.
        os.remove(stdout_path)

    print(stdout_data)

    # Cleanup LLDB
    # lldb.SBDebugger.Terminate()
    return process.GetExitStatus()


if __name__ == "__main__":
    sys.exit(main(sys.argv))
+ */ + +#include + +extern uint64_t __udiv32(uint32_t dividend, uint32_t divider); + +/* int64_t */ +void +__div32(int32_t dividend, int32_t divider + , int32_t *p_q, int32_t *p_rem + ) +{ + uint64_t res; + uint32_t q; + uint32_t rem; + + __asm__ goto("clo zero, %[dividend], z, %l[__div32_pos_dividend]\n\t" + "clo zero, %[divider], z, %l[__div32_neg_dividend_pos_divider]\n\t" + : + : [dividend] "r"(dividend), [divider] "r"(divider) + : + : __div32_pos_dividend, __div32_neg_dividend_pos_divider); + + /* The quotient's sign depends on the sign of the dividend and divider... After few tries it sounds */ + /* like the quickest way to select the operators is to branch according to the cases. */ + + /* __div32_neg_dividend_neg_divider: */ + /* As a result, the quotient is positive and the remainder negative */ + dividend = 0 - dividend; + divider = 0 - divider; + res = __udiv32(dividend, divider); + q = (uint32_t)(res >> 32); + rem = (uint32_t)res; + rem = 0 - rem; + goto recombine; + + /* *p_q = q; */ + /* *p_rem = rem; */ + /* return; */ + + +__div32_neg_dividend_pos_divider: + /* As a result, the quotient is negative and the remainder negative */ + dividend = 0 - dividend; + res = __udiv32(dividend, divider); + q = (uint32_t)(res >> 32); + q = 0 - q; + rem = (uint32_t)res; + rem = 0 - rem; + goto recombine; + /* *p_q = q; */ + /* *p_rem = rem; */ + /* return; */ + +__div32_pos_dividend: + __asm__ goto("clo zero, %[divider], z, %l[__div32_pos_dividend_pos_divider]" + : + : [divider] "r"(divider) + : + : __div32_pos_dividend_pos_divider); + /* As a result, the quotient is negative and the remainder positive */ + divider = 0 - divider; + res = __udiv32(dividend, divider); + q = (uint32_t)(res >> 32); + q = 0 - q; + rem = (uint32_t)res; + goto recombine; + /* *p_q = q; */ + /* *p_rem = rem; */ + /* return; */ + +__div32_pos_dividend_pos_divider: + /* The dividend and divider are both positive */ + res = __udiv32(dividend, divider); + /* goto last_exit; */ + q = (uint32_t) 
/* Copyright 2020 UPMEM. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/*
 * 64x64 signed division.
 *
 * This is the actual libcall implementation, as requested by the compiler.
 */
#include <stdint.h>

/* Unsigned 64-bit division helper; called here with ask_remainder == 0 to
 * obtain the quotient. */
extern uint64_t __udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);

/*
 * Returns dividend / divider, truncated toward zero.
 *
 * The magnitudes are computed in unsigned arithmetic: negating INT64_MIN as a
 * signed value overflows (undefined behaviour), whereas 0 - (uint64_t)x is
 * well defined and yields the correct magnitude 2^63.
 */
int64_t
__divdi3(int64_t dividend, int64_t divider)
{
    const uint64_t abs_dividend = (dividend < 0) ? (0 - (uint64_t)dividend) : (uint64_t)dividend;
    const uint64_t abs_divider = (divider < 0) ? (0 - (uint64_t)divider) : (uint64_t)divider;
    const uint64_t quotient = __udiv64(abs_dividend, abs_divider, 0);

    /* The quotient is negative exactly when the operand signs differ. */
    if ((dividend < 0) != (divider < 0)) {
        return (int64_t)(0 - quotient);
    }
    return (int64_t)quotient;
}
/* Copyright 2020 UPMEM. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/*
 * 64x64 signed remainder.
 *
 * This is the actual libcall implementation, as requested by the compiler.
 */
#include <stdint.h>

/* Unsigned 64-bit division helper; called here with ask_remainder == 1 to
 * obtain the remainder. */
extern uint64_t __udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);

/*
 * Returns dividend % divider; the result carries the sign of the dividend.
 *
 * The magnitudes are computed in unsigned arithmetic: negating INT64_MIN as a
 * signed value overflows (undefined behaviour), whereas 0 - (uint64_t)x is
 * well defined and yields the correct magnitude 2^63.
 */
int64_t
__moddi3(int64_t dividend, int64_t divider)
{
    const uint64_t abs_dividend = (dividend < 0) ? (0 - (uint64_t)dividend) : (uint64_t)dividend;
    const uint64_t abs_divider = (divider < 0) ? (0 - (uint64_t)divider) : (uint64_t)divider;
    const uint64_t remainder = __udiv64(abs_dividend, abs_divider, 1);

    /* Only the dividend's sign matters for the remainder. */
    if (dividend < 0) {
        return (int64_t)(0 - remainder);
    }
    return (int64_t)remainder;
}
+ * + * ===----------------------------------------------------------------------=== + * + * This file implements __modsi3 for the compiler_rt library. + * + * ===----------------------------------------------------------------------=== + */ + +#include + +/* extern int64_t __div32(int32_t dividend, int32_t divider); */ +extern void __div32(int32_t dividend, int32_t divider, int32_t *q, int32_t *rem); + +#include "int_lib.h" + +/* Returns: a % b */ + +COMPILER_RT_ABI si_int +__modsi3(si_int a, si_int b) +{ + /* int64_t res = __div32(a, b); */ + /* return (si_int) res; */ + + int32_t q; + int32_t rem; + __div32(a, b, &q, &rem); + return rem; +} diff --git a/compiler-rt/lib/builtins/dpu/mul32.S b/compiler-rt/lib/builtins/dpu/mul32.S new file mode 100644 index 0000000000000..fe735ab5b328f --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/mul32.S @@ -0,0 +1,48 @@ + .text + .globl __mul32 + .type __mul32,@function +__mul32: + jgtu r1, r0, .Ltmp0 + move r2, r0 + move r0, r1, true, .Ltmp1 +.Ltmp0: + move r2, r1 + // move r0, r0 +.Ltmp1: + move r1, zero + mul_step d0, r2, d0, 0, z, .Ltmp2 + mul_step d0, r2, d0, 1, z, .Ltmp2 + mul_step d0, r2, d0, 2, z, .Ltmp2 + mul_step d0, r2, d0, 3, z, .Ltmp2 + mul_step d0, r2, d0, 4, z, .Ltmp2 + mul_step d0, r2, d0, 5, z, .Ltmp2 + mul_step d0, r2, d0, 6, z, .Ltmp2 + mul_step d0, r2, d0, 7, z, .Ltmp2 + mul_step d0, r2, d0, 8, z, .Ltmp2 + mul_step d0, r2, d0, 9, z, .Ltmp2 + mul_step d0, r2, d0, 10, z, .Ltmp2 + mul_step d0, r2, d0, 11, z, .Ltmp2 + mul_step d0, r2, d0, 12, z, .Ltmp2 + mul_step d0, r2, d0, 13, z, .Ltmp2 + mul_step d0, r2, d0, 14, z, .Ltmp2 + mul_step d0, r2, d0, 15, z, .Ltmp2 + mul_step d0, r2, d0, 16, z, .Ltmp2 + mul_step d0, r2, d0, 17, z, .Ltmp2 + mul_step d0, r2, d0, 18, z, .Ltmp2 + mul_step d0, r2, d0, 19, z, .Ltmp2 + mul_step d0, r2, d0, 20, z, .Ltmp2 + mul_step d0, r2, d0, 21, z, .Ltmp2 + mul_step d0, r2, d0, 22, z, .Ltmp2 + mul_step d0, r2, d0, 23, z, .Ltmp2 + mul_step d0, r2, d0, 24, z, .Ltmp2 + mul_step d0, r2, d0, 
#include <stdint.h>

/*
 * 32x32 -> 32 multiplication, C version.
 *
 * The original draft of this file expressed the multiplication as inline
 * assembly (one mul_step per multiplier bit, as in mul32.S), but it could not
 * compile: it needed to address the two halves of a 64-bit operand
 * (temp1.hi / temp1.lo), which the author noted is not supported yet, and the
 * note itself was left as bare text inside the function body.
 *
 * This version keeps the same strategy in portable C: shift-and-add over the
 * bits of the smaller operand, stopping as soon as no multiplier bits remain.
 * It deliberately avoids the '*' operator so that it cannot be lowered to a
 * call to the multiplication libcall it implements.
 *
 * NOTE(review): mulsi3.c also defines __mulsi3 (on top of __mul32 from
 * mul32.S); only one of the two files may be part of a given build.
 *
 * Returns the low 32 bits of a * b (two's complement wrap-around).
 */
int32_t __mulsi3(int32_t a, int32_t b)
{
    uint32_t multiplicand = (uint32_t)a;
    uint32_t multiplier = (uint32_t)b;
    uint32_t product = 0;

    /* Iterate over the smaller operand so the loop exits sooner. */
    if (multiplier > multiplicand) {
        const uint32_t swap = multiplicand;
        multiplicand = multiplier;
        multiplier = swap;
    }

    while (multiplier != 0) {
        if (multiplier & 1u) {
            product += multiplicand;
        }
        multiplicand <<= 1;
        multiplier >>= 1;
    }

    return (int32_t)product;
}
/*
 * 64x64 multiplication emulation.
 *
 * The operands X and Y are treated as polynomials in their bytes X0..X7 and
 * Y0..Y7.  Every partial product Xi.Yj with i + j <= 7 is computed with an
 * 8x8 multiplication and accumulated at bit position 8 * (i + j); terms with
 * i + j > 7 fall outside the 64-bit result and are dropped.
 *
 * Each individual product is computed with the native 8x8 instructions when
 * DPU is defined, and with plain C otherwise.
 *
 * NOTE(review): an earlier version of this comment described a register-level
 * calling convention (operands in __D0 and the nano-stack, result in
 * __R0/__R1, return address in __R2).  Nothing in this C implementation
 * depends on it; confirm before relying on it.
 */
#include <stdint.h>

/* Byte 0 of a times byte 0 of b. */
static inline __attribute__((always_inline)) uint16_t
_mul00(uint32_t a, uint32_t b)
{
#ifndef DPU
    return (a & 0xff) * (b & 0xff);
#else
    uint32_t r;
    __asm__ volatile("mul_ul_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
    return r;
#endif
}

/* Byte 0 of a times byte 1 of b. */
static inline __attribute__((always_inline)) uint16_t
_mul01(uint32_t a, uint32_t b)
{
#ifndef DPU
    return (a & 0xff) * ((b >> 8) & 0xff);
#else
    uint32_t r;
    __asm__ volatile("mul_ul_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
    return r;
#endif
}

/* Bytes 2 and 3 are reached by shifting the operand down by 16 first.
 * Macro parameters are parenthesised so any expression is a safe argument. */
#define _mul02(a, b) _mul00((a), ((b) >> 16))
#define _mul03(a, b) _mul01((a), ((b) >> 16))

/* Byte 1 of a times byte 1 of b. */
static inline __attribute__((always_inline)) uint16_t
_mul11(uint32_t a, uint32_t b)
{
#ifndef DPU
    return ((a >> 8) & 0xff) * ((b >> 8) & 0xff);
#else
    uint32_t r;
    __asm__ volatile("mul_uh_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
    return r;
#endif
}

/* Byte 1 of a times byte 2 of b. */
static inline __attribute__((always_inline)) uint16_t
_mul12(uint32_t a, uint32_t b)
{
#ifndef DPU
    return ((a >> 8) & 0xff) * ((b >> 16) & 0xff);
#else
    /* b is shifted so that its byte 2 becomes the low byte read by the
     * instruction; r then doubles as the result register. */
    uint32_t r = (b >> 16);
    __asm__ volatile("mul_uh_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(r) :);
    return r;
#endif
}

#define _mul13(a, b) _mul11((a), ((b) >> 16))
#define _mul22(a, b) _mul00(((a) >> 16), ((b) >> 16))
#define _mul23(a, b) _mul01(((a) >> 16), ((b) >> 16))
#define _mul33(a, b) _mul11(((a) >> 16), ((b) >> 16))

/* mulx<i>y<j>(x_word, y_word): product of byte i of X and byte j of Y.
 * xl/yl are the low 32-bit words (bytes 0..3), xh/yh the high words
 * (bytes 4..7, seen as bytes 0..3 of the high word). */
#define mulx0y0(xl, yl) _mul00((xl), (yl))
#define mulx0y1(xl, yl) _mul01((xl), (yl))
#define mulx0y2(xl, yl) _mul02((xl), (yl))
#define mulx0y3(xl, yl) _mul03((xl), (yl))
#define mulx0y4(xl, yh) _mul00((xl), (yh))
#define mulx0y5(xl, yh) _mul01((xl), (yh))
#define mulx0y6(xl, yh) _mul02((xl), (yh))
#define mulx0y7(xl, yh) _mul03((xl), (yh))

#define mulx1y1(xl, yl) _mul11((xl), (yl))
#define mulx1y2(xl, yl) _mul12((xl), (yl))
#define mulx1y3(xl, yl) _mul13((xl), (yl))
#define mulx1y4(xl, yh) _mul01((yh), (xl))
#define mulx1y5(xl, yh) _mul11((xl), (yh))
#define mulx1y6(xl, yh) _mul12((xl), (yh))

#define mulx2y2(xl, yl) _mul22((xl), (yl))
#define mulx2y3(xl, yl) _mul23((xl), (yl))
#define mulx2y4(xl, yh) _mul02((yh), (xl))
#define mulx2y5(xl, yh) _mul12((yh), (xl))

#define mulx3y3(xl, yl) _mul33((xl), (yl))
#define mulx3y4(xl, yh) _mul03((yh), (xl))

/* Symmetry: Xi.Yj with i > j is obtained by swapping the operands. */
#define mulx1y0(xl, yl) mulx0y1((yl), (xl))
#define mulx2y0(xl, yl) mulx0y2((yl), (xl))
#define mulx2y1(xl, yl) mulx1y2((yl), (xl))
#define mulx3y0(xl, yl) mulx0y3((yl), (xl))
#define mulx3y1(xl, yl) mulx1y3((yl), (xl))
#define mulx3y2(xl, yl) mulx2y3((yl), (xl))
#define mulx4y0(xh, yl) mulx0y4((yl), (xh))
#define mulx4y1(xh, yl) mulx1y4((yl), (xh))
#define mulx4y2(xh, yl) mulx2y4((yl), (xh))
#define mulx4y3(xh, yl) mulx3y4((yl), (xh))
#define mulx5y0(xh, yl) mulx0y5((yl), (xh))
#define mulx5y1(xh, yl) mulx1y5((yl), (xh))
#define mulx5y2(xh, yl) mulx2y5((yl), (xh))
#define mulx6y0(xh, yl) mulx0y6((yl), (xh))
#define mulx6y1(xh, yl) mulx1y6((yl), (xh))
#define mulx7y0(xh, yl) mulx0y7((yl), (xh))

/*
 * Returns the low 64 bits of x * y.
 *
 * Coefficients 0..3 are accumulated in a 64-bit value so that their carries
 * propagate into the upper word; coefficients 4..7 only affect the upper
 * 32 bits and are accumulated modulo 2^32.
 */
uint64_t
__muldi3(uint64_t x, uint64_t y)
{
    uint32_t xl = x;
    uint32_t xh = ((uint64_t)x >> 32);
    uint32_t yl = y;
    uint32_t yh = ((uint64_t)y >> 32);

    // Each fragment of the product.
    uint32_t p0, p1, p2, p3, p4, p5, p6, p7, rh;
    uint64_t rl;

    p0 = mulx0y0(xl, yl);
    rl = (uint64_t)p0;

    p1 = mulx0y1(xl, yl) + mulx1y0(xl, yl);
    rl += ((uint64_t)p1 << 8);

    p2 = mulx0y2(xl, yl) + mulx2y0(xl, yl) + mulx1y1(xl, yl);
    rl += ((uint64_t)p2 << 16);

    p3 = mulx0y3(xl, yl) + mulx3y0(xl, yl) + mulx1y2(xl, yl) + mulx2y1(xl, yl);
    rl += ((uint64_t)p3 << 24);

    p4 = mulx0y4(xl, yh) + mulx4y0(xh, yl) + mulx1y3(xl, yl) + mulx3y1(xl, yl) + mulx2y2(xl, yl);
    rh = p4;

    p5 = (mulx0y5(xl, yh) + mulx5y0(xh, yl) + mulx1y4(xl, yh) + mulx4y1(xh, yl)
        + mulx2y3(xl, yl) + mulx3y2(xl, yl));
    rh += p5 << 8;

    p6 = (mulx0y6(xl, yh) + mulx6y0(xh, yl) + mulx1y5(xl, yh) + mulx5y1(xh, yl)
        + mulx2y4(xl, yh) + mulx4y2(xh, yl) + mulx3y3(xl, yl));
    rh += p6 << 16;

    p7 = (mulx0y7(xl, yh) + mulx7y0(xh, yl) + mulx1y6(xl, yh) + mulx6y1(xh, yl)
        + mulx2y5(xl, yh) + mulx5y2(xh, yl) + mulx3y4(xl, yh) + mulx4y3(xh, yl));
    rh += p7 << 24;

    return rl + (((uint64_t)rh) << 32);
}
div_step d0, r3, d0, 29 + div_step d0, r3, d0, 28 + div_step d0, r3, d0, 27 + div_step d0, r3, d0, 26 + div_step d0, r3, d0, 25 + div_step d0, r3, d0, 24 + div_step d0, r3, d0, 23 + div_step d0, r3, d0, 22 + div_step d0, r3, d0, 21 + div_step d0, r3, d0, 20 + div_step d0, r3, d0, 19 + div_step d0, r3, d0, 18 + div_step d0, r3, d0, 17 + div_step d0, r3, d0, 16 + div_step d0, r3, d0, 15 + div_step d0, r3, d0, 14 + div_step d0, r3, d0, 13 + div_step d0, r3, d0, 12 + div_step d0, r3, d0, 11 + div_step d0, r3, d0, 10 + div_step d0, r3, d0, 9 + div_step d0, r3, d0, 8 + div_step d0, r3, d0, 7 + div_step d0, r3, d0, 6 + div_step d0, r3, d0, 5 + div_step d0, r3, d0, 4 + div_step d0, r3, d0, 3 + div_step d0, r3, d0, 2 + div_step d0, r3, d0, 1 +3: + div_step d0, r3, d0, 0 +4: + jump r23 +2: + move.u d0, r0, true, 4b +1: + fault 2 diff --git a/compiler-rt/lib/builtins/dpu/udiv32.c b/compiler-rt/lib/builtins/dpu/udiv32.c new file mode 100644 index 0000000000000..22f617e14fd71 --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udiv32.c @@ -0,0 +1,63 @@ +#include + +uint64_t +__udiv32(uint32_t dividend, uint32_t divider) +{ + uint64_t dest; + + uint32_t temp0; + uint32_t temp1; + + /* clang-format off */ + __asm__ volatile(" clz %[temp0], %[divider], max, 1f\n" // %[temp0] = by how many the divider can be shifted on 32-bit + " clz %[temp1], %[dividend]\n" // %[temp1] = number of useless bits of the dividend + " sub %[temp0], %[temp1], %[temp0], gtu, 2f\n" // %[temp0] = the maximal shift to be done + " move %[temp1], %[divider]\n" + " move.u %[dest], %[dividend]\n" + " jump %[temp0], 3f\n" // As we will jump backward relatively to label 3 forward + " div_step %[dest], %[temp1], %[dest], 31\n" + " div_step %[dest], %[temp1], %[dest], 30\n" + " div_step %[dest], %[temp1], %[dest], 29\n" + " div_step %[dest], %[temp1], %[dest], 28\n" + " div_step %[dest], %[temp1], %[dest], 27\n" + " div_step %[dest], %[temp1], %[dest], 26\n" + " div_step %[dest], %[temp1], %[dest], 25\n" + " div_step 
%[dest], %[temp1], %[dest], 24\n" + " div_step %[dest], %[temp1], %[dest], 23\n" + " div_step %[dest], %[temp1], %[dest], 22\n" + " div_step %[dest], %[temp1], %[dest], 21\n" + " div_step %[dest], %[temp1], %[dest], 20\n" + " div_step %[dest], %[temp1], %[dest], 19\n" + " div_step %[dest], %[temp1], %[dest], 18\n" + " div_step %[dest], %[temp1], %[dest], 17\n" + " div_step %[dest], %[temp1], %[dest], 16\n" + " div_step %[dest], %[temp1], %[dest], 15\n" + " div_step %[dest], %[temp1], %[dest], 14\n" + " div_step %[dest], %[temp1], %[dest], 13\n" + " div_step %[dest], %[temp1], %[dest], 12\n" + " div_step %[dest], %[temp1], %[dest], 11\n" + " div_step %[dest], %[temp1], %[dest], 10\n" + " div_step %[dest], %[temp1], %[dest], 9\n" + " div_step %[dest], %[temp1], %[dest], 8\n" + " div_step %[dest], %[temp1], %[dest], 7\n" + " div_step %[dest], %[temp1], %[dest], 6\n" + " div_step %[dest], %[temp1], %[dest], 5\n" + " div_step %[dest], %[temp1], %[dest], 4\n" + " div_step %[dest], %[temp1], %[dest], 3\n" + " div_step %[dest], %[temp1], %[dest], 2\n" + " div_step %[dest], %[temp1], %[dest], 1\n" + "3:\n" + " div_step %[dest], %[temp1], %[dest], 0\n" + "4:\n" + " jump 5f\n" + "2:\n" + " move.u %[dest], %[dividend], true, 4b\n" + "1:\n" + " fault 2\n" + "5:\n" + : [dest] "=r"(dest), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1) + : [dividend] "r"(dividend), [divider] "r"(divider)); + /* clang-format on */ + + return dest; +} diff --git a/compiler-rt/lib/builtins/dpu/udiv64.c b/compiler-rt/lib/builtins/dpu/udiv64.c new file mode 100644 index 0000000000000..e55b3ffe9904c --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udiv64.c @@ -0,0 +1,59 @@ +/* Copyright 2020 UPMEM. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +/* + * 64x64 multiplication unsigned division. 
+ */ +#include + +static unsigned int +__clz__(uint64_t x) +{ + return __builtin_clzl(x); +} + +uint64_t +__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder) +{ + uint64_t dxo = dividend, dxe = 0; + + if (divider == 0) { + __asm__ volatile("fault 2"); + /* unreachable(); */ + __builtin_unreachable(); + } + if (divider > dividend) { + if (ask_remainder == 0) + return 0; + else + return dividend; + } + + // Mimic the div_step. + /// div_step functionality: + // if (Dxo >= (Ra<< #u5)) { + // Dxo = Dxo - (Ra<< #u5); + // Dxe = (Dxe << 1) | 1; + // } else { + // Dxe = Dxe << 1; + // } + int dividerl0 = __clz__(divider), dividendl0 = __clz__(dividend); + + int i = dividerl0 - dividendl0; + + for (; i >= 0; i--) { + uint64_t pivot = ((uint64_t)divider << i); + if (dxo >= pivot) { + dxo = dxo - pivot; + dxe = ((uint64_t)dxe << 1) | 1L; + } else { + dxe = (uint64_t)dxe << 1; + } + } + if (ask_remainder == 1) + return dxo; + else + return dxe; +} diff --git a/compiler-rt/lib/builtins/dpu/udivdi3.c b/compiler-rt/lib/builtins/dpu/udivdi3.c new file mode 100644 index 0000000000000..1b60b934b85f4 --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udivdi3.c @@ -0,0 +1,19 @@ +/* Copyright 2020 UPMEM. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +/* + * 64x64 unsigned division. + * + * This is the actual libcall implementation, as requested by the compiler. 
+ */ +#include +extern uint64_t +__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder); + +uint64_t +__udivdi3(uint64_t dividend, uint64_t divider) +{ + return __udiv64(dividend, divider, 0); +} diff --git a/compiler-rt/lib/builtins/dpu/udivmodsi4.c b/compiler-rt/lib/builtins/dpu/udivmodsi4.c new file mode 100644 index 0000000000000..3a3f3902b6f61 --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udivmodsi4.c @@ -0,0 +1,29 @@ +/*===-- udivmodsi4.c - Implement __udivmodsi4 ------------------------------=== + * + * The LLVM Compiler Infrastructure + * + * This file is dual licensed under the MIT and the University of Illinois Open + * Source Licenses. See LICENSE_LLVM.TXT for details. + * + * ===----------------------------------------------------------------------=== + * + * This file implements __udivmodsi4 for the compiler_rt library. + * + * ===----------------------------------------------------------------------=== + */ + +#include + +extern uint64_t __udiv32(uint32_t dividend, uint32_t divider); + +#include "int_lib.h" + +/* Returns: a / b, *rem = a % b */ + +COMPILER_RT_ABI su_int +__udivmodsi4(su_int a, su_int b, su_int *rem) +{ + uint64_t res = __udiv32(a, b); + *rem = (su_int)res; + return (su_int) (res >> 32); +} diff --git a/compiler-rt/lib/builtins/dpu/udivsi3.c b/compiler-rt/lib/builtins/dpu/udivsi3.c new file mode 100644 index 0000000000000..dcc1d9fcf672f --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udivsi3.c @@ -0,0 +1,15 @@ +#include + +extern uint64_t __udiv32(uint32_t dividend, uint32_t divider); + +#include "../int_lib.h" + +typedef su_int fixuint_t; +typedef si_int fixint_t; + +// Returns: a / b + +COMPILER_RT_ABI su_int __udivsi3(su_int a, su_int b) { + uint64_t res = __udiv32(a, b); + return (su_int) (res >> 32); +} diff --git a/compiler-rt/lib/builtins/dpu/umoddi3.c b/compiler-rt/lib/builtins/dpu/umoddi3.c new file mode 100644 index 0000000000000..4b3a82b01eb98 --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/umoddi3.c @@ -0,0 
+1,19 @@ +/* Copyright 2020 UPMEM. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +/* + * 64x64 unsigned remainder. + * + * This is the actual libcall implementation, as requested by the compiler. + */ +#include +extern uint64_t +__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder); + +uint64_t +__umoddi3(uint64_t dividend, uint64_t divider) +{ + return __udiv64(dividend, divider, 1); +} diff --git a/compiler-rt/lib/builtins/dpu/umodsi3.c b/compiler-rt/lib/builtins/dpu/umodsi3.c new file mode 100644 index 0000000000000..c85cd8a4d9aed --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/umodsi3.c @@ -0,0 +1,27 @@ +/* ===-- umodsi3.c - Implement __umodsi3 -----------------------------------=== + * + * The LLVM Compiler Infrastructure + * + * This file is dual licensed under the MIT and the University of Illinois Open + * Source Licenses. See LICENSE_LLVM.TXT for details. + * + * ===----------------------------------------------------------------------=== + * + * This file implements __umodsi3 for the compiler_rt library. 
+ * + * ===----------------------------------------------------------------------=== + */ + +#include "int_lib.h" + +/* Returns: a % b (the low 32 bits of the value packed by __udiv32; prototype kept identical to the one in udivsi3.c / udivmodsi4.c) */ + +extern uint64_t +__udiv32(uint32_t dividend, uint32_t divider); + +COMPILER_RT_ABI su_int +__umodsi3(su_int a, su_int b) +{ + uint64_t res = __udiv32(a, b); + return (su_int)res; +} diff --git a/compiler-rt/test/builtins/Unit/comparedf2_test.c b/compiler-rt/test/builtins/Unit/comparedf2_test.c index 27666e2ad689b..d606ae7eff6ca 100644 --- a/compiler-rt/test/builtins/Unit/comparedf2_test.c +++ b/compiler-rt/test/builtins/Unit/comparedf2_test.c @@ -458,7 +458,7 @@ static const struct TestVector vectors[] = { {__builtin_inf(),__builtin_inf(),0,0,0,0,0,0,0}, }; -int main(int argc, char *argv[]) { +int main() { const int numVectors = sizeof vectors / sizeof vectors[0]; int i; for (i = 0; i +#include + +int main() +{ + fprintf(stderr, "hello err\n"); + fprintf(stdout, "hello out\n"); + srand(42); + for (int i = 0; i < 10; i++) { + printf("%d %d\n", i, rand()); + } + return 0; +} diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 378df1b75e25d..15ed3e94bff5b 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -1103,23 +1103,51 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From, bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, AllSuccsCache &AllSuccessors) { // Don't sink instructions that the target prefers not to sink. - if (!TII->shouldSink(MI)) + if (!TII->shouldSink(MI)) { + // LLVM_DEBUG({ + // dbgs() << "shouldSink false "; MI.dump(); + // }); return false; - + } + // Check if it's safe to move the instruction. 
- if (!MI.isSafeToMove(AA, SawStore)) + if (!MI.isSafeToMove(AA, SawStore)) { + // LLVM_DEBUG({ + // dbgs() << "not safe "; MI.dump(); + // dbgs() << "mayStore(): " << MI.mayStore() << "\n"; + // dbgs() << "mayLoad(): " << MI.mayLoad() << "\n"; + // dbgs() << "isCall(): " << MI.isCall() << "\n"; + // dbgs() << "isPHI(): " << MI.isPHI() << "\n"; + // dbgs() << "hasOrderedMemoryRef(): " << MI.hasOrderedMemoryRef() << "\n"; + // dbgs() << "isPosition(): " << MI.isPosition() << "\n"; + // dbgs() << "isDebugInstr(): " << MI.isDebugInstr() << "\n"; + // dbgs() << "isTerminator(): " << MI.isTerminator() << "\n"; + // dbgs() << "mayRaiseFPException(): " << MI.mayRaiseFPException() << "\n"; + // dbgs() << "hasUnmodeledSideEffects(): " << MI.hasUnmodeledSideEffects() << "\n"; + // dbgs() << "isDereferenceableInvariantLoad(AA): " << MI.isDereferenceableInvariantLoad(AA) << "\n"; + // dbgs() << "SawStore: " << SawStore << "\n"; + // }); return false; - + } + // Convergent operations may not be made control-dependent on additional // values. - if (MI.isConvergent()) + if (MI.isConvergent()) { + // LLVM_DEBUG({ + // dbgs() << "isconvergent "; MI.dump(); + // }); return false; - + } + // Don't break implicit null checks. This is a performance heuristic, and not // required for correctness. - if (SinkingPreventsImplicitNullCheck(MI, TII, TRI)) + if (SinkingPreventsImplicitNullCheck(MI, TII, TRI)) { + LLVM_DEBUG({ + dbgs() << "nullcheck "; MI.dump(); + }); return false; - + } + // FIXME: This should include support for sinking instructions within the // block they are currently in to shorten the live ranges. We often get // instructions sunk into the top of a large block, but it would be better to @@ -1134,9 +1162,12 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge, AllSuccessors); // If there are no outputs, it must have side-effects. 
- if (!SuccToSinkTo) + if (!SuccToSinkTo) { + // LLVM_DEBUG({ + // dbgs() << "no succ "; MI.dump(); + // }); return false; - + } // If the instruction to move defines a dead physical register which is live // when leaving the basic block, don't move it because it could turn into a // "zombie" define of that preg. E.g., EFLAGS. () @@ -1146,8 +1177,12 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, Register Reg = MO.getReg(); if (Reg == 0 || !Register::isPhysicalRegister(Reg)) continue; - if (SuccToSinkTo->isLiveIn(Reg)) + if (SuccToSinkTo->isLiveIn(Reg)) { + // LLVM_DEBUG({ + // dbgs() << "zombie "; MI.dump(); + // }); return false; + } } LLVM_DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccToSinkTo); diff --git a/llvm/lib/Target/DPU/CMakeLists.txt b/llvm/lib/Target/DPU/CMakeLists.txt index 7a887b71ee3aa..9e216ef08cb39 100644 --- a/llvm/lib/Target/DPU/CMakeLists.txt +++ b/llvm/lib/Target/DPU/CMakeLists.txt @@ -28,7 +28,7 @@ add_llvm_target(DPUCodeGen DPUResolveMacroInstrPass.cpp DPUMacroFusion.cpp DPUSelectionDAGInfo.cpp - + DPUPostRAFusion.cpp DEPENDS intrinsics_gen diff --git a/llvm/lib/Target/DPU/DPU.h b/llvm/lib/Target/DPU/DPU.h index 2ef567d9bc868..7f84823cb9ae0 100644 --- a/llvm/lib/Target/DPU/DPU.h +++ b/llvm/lib/Target/DPU/DPU.h @@ -19,6 +19,7 @@ namespace llvm { class FunctionPass; class DPUTargetMachine; +FunctionPass *createDPUPostRAFusionPass(DPUTargetMachine &tm); FunctionPass *createDPUMergeComboInstrPass(DPUTargetMachine &tm); FunctionPass *createDPUResolveMacroInstrPass(DPUTargetMachine &tm); diff --git a/llvm/lib/Target/DPU/DPU.td b/llvm/lib/Target/DPU/DPU.td index 65f22ee7312f9..e262860b24780 100644 --- a/llvm/lib/Target/DPU/DPU.td +++ b/llvm/lib/Target/DPU/DPU.td @@ -71,4 +71,5 @@ def DPU : Target { let AssemblyParsers = [DPUAsmParser]; let AssemblyParserVariants = [DPUAsmParserVariant]; let AssemblyWriters = [DPUInstPrinter]; + let AllowRegisterRenaming = 1; } diff --git 
a/llvm/lib/Target/DPU/DPUFrameLowering.cpp b/llvm/lib/Target/DPU/DPUFrameLowering.cpp index 8bf3c6c06650b..026354d10e304 100644 --- a/llvm/lib/Target/DPU/DPUFrameLowering.cpp +++ b/llvm/lib/Target/DPU/DPUFrameLowering.cpp @@ -85,7 +85,8 @@ void DPUFrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIIndex) .setMIFlag(MachineInstr::FrameSetup); - BuildMI(MBB, MBBI, DL, DPUII.get(DPU::SDrir), DPU::R22) + BuildMI(MBB, MBBI, DL, DPUII.get(DPU::SDrir)) + .addReg(DPU::R22) .addImm(StackSize - STACK_SIZE_FOR_D22) .addReg(DPU::D22); BuildMI(MBB, MBBI, DL, DPUII.get(DPU::ADDrri), DPU::R22) diff --git a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp index c501d43ed7a89..539056aeb055b 100644 --- a/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/DPU/DPUISelDAGToDAG.cpp @@ -100,25 +100,52 @@ class DPUDAGToDAGISel : public SelectionDAGISel { StringRef DPUDAGToDAGISel::getPassName() const { return "DPUDAGToDAGISel"; } bool DPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + }); + bool Ret = SelectionDAGISel::runOnMachineFunction(MF); + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + }); + processFunctionAfterISel(MF); + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + }); + return Ret; } void DPUDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + }); + MachineRegisterInfo *MRI = &MF.getRegInfo(); auto &SubTarget = static_cast(MF.getSubtarget()); auto InstrInfo = SubTarget.getInstrInfo(); auto RegInfo = SubTarget.getRegisterInfo(); - for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE; - ++MFI) + for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE; ++MFI) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << 
__func__ << "\n"; + }); + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { - replaceUsesWithConstantReg(MRI, InstrInfo, RegInfo, *I); + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + }); + // dbgs() is still a live stream in release builds, so the trace below must sit behind LLVM_DEBUG. + bool res = replaceUsesWithConstantReg(MRI, InstrInfo, RegInfo, *I); + if (res) { + LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " YES did something.\n"); + } } + } } static inline bool canCommuteOperation(MachineInstr *MI, unsigned opNo, @@ -149,6 +176,10 @@ bool DPUDAGToDAGISel::replaceUsesWithConstantReg(MachineRegisterInfo *MRI, const DPUInstrInfo *DII, const TargetRegisterInfo *TRI, const MachineInstr &MI) { + // This function seems to do manual coalescing + // probably we should use the proper one that probably knows better + // maybe prob with MI operand constraint ... ? + // probably better to educate the coalescer, or better define register class unsigned DstReg = 0, CstReg = 0; if (MI.getOpcode() == DPU::COPY) { @@ -220,6 +251,8 @@ bool DPUDAGToDAGISel::replaceUsesWithConstantReg(MachineRegisterInfo *MRI, UMI->getRegClassConstraint(OpNo, DII, TRI)->contains(OtherReg))) { UMI->getOperand(newOpNo).setReg(CstReg); UMI->getOperand(OpNo).setReg(OtherReg); + // Debug-only trace: dbgs() on its own is not compiled out of release builds. + LLVM_DEBUG(dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " YES did something.\n"); } } @@ -387,10 +420,70 @@ void DPUDAGToDAGISel::Select(SDNode *Node) { return; } + EVT VT = Node->getValueType(0); + SDLoc DL(Node); + + MachineFunction &MF = CurDAG->getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + switch (Opcode) { + case ISD::Constant: { + LLVM_DEBUG({dbgs() << "a constant: "; Node->dump();}); + if (VT == MVT::i32) { + // Materialize some constants as copies from constant register. + // This allows the coalescer to propagate these into other instructions. 
+ ConstantSDNode *ConstNode = cast(Node); + if (ConstNode->isNullValue()) { + SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::ZERO, MVT::i32); + ReplaceNode(Node, New.getNode()); + return; + } else if (ConstNode->isOne()) { + SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::ONE, MVT::i32); + ReplaceNode(Node, New.getNode()); + return; + } else if (ConstNode->isAllOnesValue()) { + SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::LNEG, MVT::i32); + ReplaceNode(Node, New.getNode()); + return; + } else { + const ConstantInt *Cst = ConstNode->getConstantIntValue(); + if (Cst->isMinValue(/* signed = */ true)) { + SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, DPU::MNEG, MVT::i32); + ReplaceNode(Node, New.getNode()); + return; + } + } + } else if (VT == MVT::i64) { + ConstantSDNode *ConstNode = cast(Node); + if (ConstNode->isNullValue()) { + // // Create a new virtual register of type i64 + // SDValue ImpDef = SDValue(CurDAG->getMachineNode(DPU::IMPLICIT_DEF, DL, MVT::i64), 0); + // // Insert the low part into the virtual register + // SDValue InsertLo = CurDAG->getTargetInsertSubreg(DPU::sub_32bit, DL, MVT::i64, + // ImpDef, + // CurDAG->getRegister(DPU::ZERO, MVT::i32)); + // // Insert the high part into the virtual register + // SDValue InsertHi = CurDAG->getTargetInsertSubreg(DPU::sub_32bit_hi, DL, MVT::i64, + // InsertLo, + // CurDAG->getRegister(DPU::ZERO, MVT::i32)); + // // Replace the old node with the new virtual register value + // ReplaceNode(Node, InsertHi.getNode()); + + SDValue truc = SDValue(CurDAG->getMachineNode(DPU::MOVE_Srr, DL, MVT::i64, + CurDAG->getRegister(DPU::ZERO, MVT::i32)), 0); + ReplaceNode(Node, truc.getNode()); + return; + } else if (ConstNode->isOne()) { + SDValue truc = SDValue(CurDAG->getMachineNode(DPU::MOVE_Srr, DL, MVT::i64, + CurDAG->getRegister(DPU::ONE, MVT::i32)), 0); + ReplaceNode(Node, truc.getNode()); + return; + } + } + break; + } case 
ISD::FrameIndex: { int FI = cast(Node)->getIndex(); - EVT VT = Node->getValueType(0); SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT); unsigned Opc = DPU::ADDrri; if (Node->hasOneUse()) { diff --git a/llvm/lib/Target/DPU/DPUISelLowering.h b/llvm/lib/Target/DPU/DPUISelLowering.h index 87d963121a70b..91eadd89e9489 100644 --- a/llvm/lib/Target/DPU/DPUISelLowering.h +++ b/llvm/lib/Target/DPU/DPUISelLowering.h @@ -32,12 +32,12 @@ enum { SetCC, // SET to a condition BrCC, // Jump and branch with condition BrCCi, // Jump and branch with condition - BrCCZero, // Jump and branch with condition and one operand equal to zero - OrJCCZero, - AndJCCZero, - XorJCCZero, - AddJCCZero, - SubJCCZero, + // BrCCZero, // Jump and branch with condition and one operand equal to zero + // OrJCCZero, + // AndJCCZero, + // XorJCCZero, + // AddJCCZero, + // SubJCCZero, Wrapper, // Global addresses, externals... TRUNC64, // Keep the LSBits register, LSL64_32, // Shift 32 positions to the left @@ -62,9 +62,9 @@ enum { MUL16_SU, MUL16_SS, - Addc, - Subc, - Rsubc, + // Addc, + // Subc, + // Rsubc, Clo, Cls, @@ -77,80 +77,80 @@ enum { LslAdd, - AddJcc, - AddNullJcc, - AddcJcc, - AddcNullJcc, - AndJcc, - AndNullJcc, - OrJcc, - OrNullJcc, - XorJcc, - XorNullJcc, - NandJcc, - NandNullJcc, - NorJcc, - NorNullJcc, - NxorJcc, - NxorNullJcc, - AndnJcc, - AndnNullJcc, - OrnJcc, - OrnNullJcc, - LslJcc, - LslNullJcc, - LslxJcc, - LslxNullJcc, - Lsl1Jcc, - Lsl1NullJcc, - Lsl1xJcc, - Lsl1xNullJcc, - LsrJcc, - LsrNullJcc, - LsrxJcc, - LsrxNullJcc, - Lsr1Jcc, - Lsr1NullJcc, - Lsr1xJcc, - Lsr1xNullJcc, - AsrJcc, - AsrNullJcc, - RolJcc, - RolNullJcc, - RorJcc, - RorNullJcc, - MUL8_UUJcc, - MUL8_UUNullJcc, - MUL8_SUJcc, - MUL8_SUNullJcc, - MUL8_SSJcc, - MUL8_SSNullJcc, - SubJcc, - SubNullJcc, - RsubJcc, - RsubNullJcc, - SubcJcc, - SubcNullJcc, - RsubcJcc, - RsubcNullJcc, - CaoJcc, - CaoNullJcc, - ClzJcc, - ClzNullJcc, - CloJcc, - CloNullJcc, - ClsJcc, - ClsNullJcc, - MoveJcc, - MoveNullJcc, - RolAddJcc, - 
RolAddNullJcc, - LsrAddJcc, - LsrAddNullJcc, - LslAddJcc, - LslAddNullJcc, - LslSubJcc, - LslSubNullJcc, + // AddJcc, + // AddNullJcc, + // AddcJcc, + // AddcNullJcc, + // AndJcc, + // AndNullJcc, + // OrJcc, + // OrNullJcc, + // XorJcc, + // XorNullJcc, + // NandJcc, + // NandNullJcc, + // NorJcc, + // NorNullJcc, + // NxorJcc, + // NxorNullJcc, + // AndnJcc, + // AndnNullJcc, + // OrnJcc, + // OrnNullJcc, + // LslJcc, + // LslNullJcc, + // LslxJcc, + // LslxNullJcc, + // Lsl1Jcc, + // Lsl1NullJcc, + // Lsl1xJcc, + // Lsl1xNullJcc, + // LsrJcc, + // LsrNullJcc, + // LsrxJcc, + // LsrxNullJcc, + // Lsr1Jcc, + // Lsr1NullJcc, + // Lsr1xJcc, + // Lsr1xNullJcc, + // AsrJcc, + // AsrNullJcc, + // RolJcc, + // RolNullJcc, + // RorJcc, + // RorNullJcc, + // MUL8_UUJcc, + // MUL8_UUNullJcc, + // MUL8_SUJcc, + // MUL8_SUNullJcc, + // MUL8_SSJcc, + // MUL8_SSNullJcc, + // SubJcc, + // SubNullJcc, + // RsubJcc, + // RsubNullJcc, + // SubcJcc, + // SubcNullJcc, + // RsubcJcc, + // RsubcNullJcc, + // CaoJcc, + // CaoNullJcc, + // ClzJcc, + // ClzNullJcc, + // CloJcc, + // CloNullJcc, + // ClsJcc, + // ClsNullJcc, + // MoveJcc, + // MoveNullJcc, + // RolAddJcc, + // RolAddNullJcc, + // LsrAddJcc, + // LsrAddNullJcc, + // LslAddJcc, + // LslAddNullJcc, + // LslSubJcc, + // LslSubNullJcc, ADD_VASTART, diff --git a/llvm/lib/Target/DPU/DPUInstrFormats.td b/llvm/lib/Target/DPU/DPUInstrFormats.td index 66116ab29b153..a4e80392af3b6 100644 --- a/llvm/lib/Target/DPU/DPUInstrFormats.td +++ b/llvm/lib/Target/DPU/DPUInstrFormats.td @@ -97,6 +97,7 @@ def u5_imm : UImmOperand< 5, i32>; def u8_imm : UImmOperand< 8, i32>; def s8_i64_imm : SImmOperand<8, i64>; +def s11_i64_imm : SImmOperand<11, i64>; def s16_i64_imm : SImmOperand<16, i64>; def s32_i64_imm : SImmOperand<32, i64>; def u32_i64_imm : UImmOperand<32, i64>; diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp index db957f97bcaa9..eb10d5bdbcf0e 100644 --- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp +++ 
b/llvm/lib/Target/DPU/DPUInstrInfo.cpp @@ -53,7 +53,9 @@ void DPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL = (I != MBB.end()) ? I->getDebugLoc() : DebugLoc(); - unsigned Opcode = (RC == &DPU::GP_REGRegClass) ? DPU::SWrir : DPU::SDrir; + unsigned Opcode = (RC == &DPU::GP_REGRegClass + || RC == &DPU::GPZ_REGRegClass + ) ? DPU::SWrir : DPU::SDrir; LLVM_DEBUG({ dbgs() << "DPU/Instr - storeRegToStackSlot DestReg=" @@ -82,7 +84,9 @@ void DPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - unsigned Opcode = (RC == &DPU::GP_REGRegClass) ? DPU::LWrri : DPU::LDrri; + unsigned Opcode = (RC == &DPU::GP_REGRegClass + || RC == &DPU::GPZ_REGRegClass + ) ? DPU::LWrri : DPU::LDrri; LLVM_DEBUG({ dbgs() << "DPU/Instr - loadRegFromStackSlot DestReg=" << std::to_string(DestReg) << " Opcode= " << std::to_string(Opcode) @@ -94,26 +98,148 @@ void DPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(Opcode), DestReg).addFrameIndex(FI).addImm(0); } +void DPUInstrInfo::expand64BitRegisterAluInstruction(MachineInstr &MI, + MachineBasicBlock &MBB, + unsigned int LsbOpcode, + unsigned int MsbOpcode) const { + MachineFunction *MF = MBB.getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + unsigned int DestReg = MI.getOperand(0).getReg(); + unsigned int Op1Reg = MI.getOperand(1).getReg(); + unsigned int Op2Reg = MI.getOperand(2).getReg(); + + unsigned int LSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit); + unsigned int MSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit_hi); + + unsigned int LSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit); + unsigned int MSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit_hi); + + unsigned int LSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit); + unsigned int MSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit_hi); + + 
MachineInstrBuilder MIBDestLsb; + MIBDestLsb = BuildMI(MBB, MI, MI.getDebugLoc(), get(LsbOpcode), + LSBDestReg) + .addReg(LSBDOp1Reg) + .addReg(LSBOp2Reg); + + MachineInstrBuilder MIBDestMsb; + MIBDestMsb = BuildMI(MBB, MI, MI.getDebugLoc(), get(MsbOpcode), + MSBDestReg) + .addReg(MSBDOp1Reg) + .addReg(MSBOp2Reg); + + for (unsigned i = 0; i < 3; i++) { + if (MI.getOperand(i).isRenamable()) { + MIBDestLsb->getOperand(i).setIsRenamable(); + MIBDestMsb->getOperand(i).setIsRenamable(); + } + if (MI.getOperand(i).isKill()) { + MIBDestLsb->getOperand(i).setIsKill(); + MIBDestMsb->getOperand(i).setIsKill(); + } + } +} + +void DPUInstrInfo::expand64BitImmediateAluInstruction(MachineInstr &MI, + MachineBasicBlock &MBB, + unsigned int LsbOpcode, + unsigned int MsbOpcode) const { + MachineFunction *MF = MBB.getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + unsigned int DestReg = MI.getOperand(0).getReg(); + unsigned int Op1Reg = MI.getOperand(1).getReg(); + int64_t Op2Imm = MI.getOperand(2).getImm(); + + unsigned int LSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit); + unsigned int MSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit_hi); + + unsigned int LSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit); + unsigned int MSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit_hi); + + int64_t LSBOp2Imm = Op2Imm & 0xFFFFFFFFl; + int64_t MSBOp2Imm = (Op2Imm >> 32) & 0xFFFFFFFFl; + + // // what if value is zero??? 
+ // // probably optimizable :) + switch (LSBOp2Imm) { + case 0: + case 1: + case 0xffffffff: + case 0x80000000: + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "LSBOp2Imm = " << LSBOp2Imm << " could be optimized\n"; + }); + } + + switch (MSBOp2Imm) { + case 0: + case 1: + case 0xffffffff: + case 0x80000000: + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "MSBOp2Imm = " << MSBOp2Imm << " could be optimized\n"; + }); + } + + MachineInstrBuilder MIBDestLsb; + MIBDestLsb = BuildMI(MBB, MI, MI.getDebugLoc(), get(LsbOpcode), + LSBDestReg) + .addReg(LSBDOp1Reg) + .addImm(LSBOp2Imm); + + MachineInstrBuilder MIBDestMsb; + MIBDestMsb = BuildMI(MBB, MI, MI.getDebugLoc(), get(MsbOpcode), + MSBDestReg) + .addReg(MSBDOp1Reg) + .addImm(MSBOp2Imm); + + for (unsigned i = 0; i < 2; i++) { + if (MI.getOperand(i).isRenamable()) { + MIBDestLsb->getOperand(i).setIsRenamable(); + MIBDestMsb->getOperand(i).setIsRenamable(); + } + if (MI.getOperand(i).isKill()) { + MIBDestLsb->getOperand(i).setIsKill(); + MIBDestMsb->getOperand(i).setIsKill(); + } + } +} + bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to expand: "; MI.dump(); + dbgs() << "** MBB: "; MBB.dump(); + dbgs() << "****** \n"; + }); switch (MI.getDesc().getOpcode()) { default: + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "Don't know how to expand.\n"; + }); return false; case DPU::RETi: BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::JUMPr)).addReg(DPU::R23); break; case DPU::CALLi: - BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri)) - .addReg(DPU::R23) - .add(MI.getOperand(0)); + BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri), DPU::R23) 
+ .add(MI.getOperand(0)) + .copyImplicitOps(MI); break; case DPU::CALLr: - BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr)) - .addReg(DPU::R23) - .add(MI.getOperand(0)); + BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr), DPU::R23) + .add(MI.getOperand(0)) + .copyImplicitOps(MI); break; case DPU::ADD_VAStart: { // Get the first index in stack where the first // vaargs is stored @@ -122,16 +248,57 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { StackSize = MF->getFrameInfo().getStackSize(); } unsigned int ResultReg = MI.getOperand(0).getReg(); - BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif)) - .addReg(ResultReg) + BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif), ResultReg) .addReg(DPU::R22) .addImm(StackSize + STACK_SIZE_FOR_D22) .addImm(DPUAsmCondition::Condition::False); break; } + + case DPU::ADD64rr: + expand64BitRegisterAluInstruction(MI, MBB, DPU::ADDrrr, DPU::ADDCrrr); + break; + case DPU::AND64rr: + expand64BitRegisterAluInstruction(MI, MBB, DPU::ANDrrr, DPU::ANDrrr); + break; + case DPU::OR64rr: + expand64BitRegisterAluInstruction(MI, MBB, DPU::ORrrr, DPU::ORrrr); + break; + case DPU::SUB64rr: + expand64BitRegisterAluInstruction(MI, MBB, DPU::SUBrrr, DPU::SUBCrrr); + break; + case DPU::XOR64rr: + expand64BitRegisterAluInstruction(MI, MBB, DPU::XORrrr, DPU::XORrrr); + break; + + case DPU::ADD64ri: + expand64BitImmediateAluInstruction(MI, MBB, DPU::ADDrri, DPU::ADDCrri); + break; + case DPU::AND64ri: + expand64BitImmediateAluInstruction(MI, MBB, DPU::ANDrri, DPU::ANDrri); + break; + case DPU::OR64ri: + expand64BitImmediateAluInstruction(MI, MBB, DPU::ORrri, DPU::ORrri); + break; + case DPU::XOR64ri: + expand64BitImmediateAluInstruction(MI, MBB, DPU::XORrri, DPU::XORrri); + break; + + // case DPU::Jcci: + // case DPU::TmpJcci: + // case DPU::Jcc: { + // // don't expand yet as they are used for late optimization + // // these late optimization should be reworked and placed earlier in the pipeline + // // so we could treat 
more cases of optim + // break; + // } } MBB.erase(MI); + + LLVM_DEBUG({ + dbgs() << "** MBB: "; MBB.dump(); + }); return true; } @@ -139,18 +306,24 @@ void DPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const { + LLVM_DEBUG({ dbgs() << "DPU/Instr - copyPhysReg "; I->dump(); }); + + bool is_dest_renamable = I->getOperand(0).isRenamable(); + bool is_src_renamable = I->getOperand(1).isRenamable(); + MachineInstrBuilder MIB; + if (DPU::GP_REGRegClass.contains(DestReg) && DPU::OP_REGRegClass.contains(SrcReg)) { LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg << " kill= " << KillSrc << " to dest=" << DestReg << "\n"); - BuildMI(MBB, I, DL, get(DPU::MOVErr), DestReg) + MIB = BuildMI(MBB, I, DL, get(DPU::MOVErr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else if (DPU::GP64_REGRegClass.contains(DestReg, SrcReg)) { LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg << " kill= " << KillSrc << " to dest=" << DestReg << "\n"); - BuildMI(MBB, I, DL, get(DPU::MOVDrr), DestReg) + MIB = BuildMI(MBB, I, DL, get(DPU::MOVDrr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else if (DPU::GP64_REGRegClass.contains(SrcReg) && DPU::GP_REGRegClass.contains(DestReg)) { @@ -158,7 +331,7 @@ void DPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB, LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg << " kill= " << KillSrc << " to dest=" << DestReg << "\n"); - BuildMI(MBB, I, DL, get(DPU::EXTRACT_SUBREG), DestReg) + MIB = BuildMI(MBB, I, DL, get(DPU::EXTRACT_SUBREG), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)) .addImm(DPU::sub_32bit); } else if (DPU::GP_REGRegClass.contains(SrcReg) && @@ -167,11 +340,16 @@ void DPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB, LLVM_DEBUG(dbgs() << "DPU/Instr - copyPhysReg from src=" << SrcReg << " kill= " << KillSrc << " to dest=" << DestReg << "\n"); - BuildMI(MBB, I, DL, 
get(DPU::MOVE_Srr), DestReg) + MIB = BuildMI(MBB, I, DL, get(DPU::MOVE_Srr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else { llvm_unreachable("Impossible reg-to-reg copy"); } + + if (is_dest_renamable) + MIB->getOperand(0).setIsRenamable(); + if (is_src_renamable) + MIB->getOperand(1).setIsRenamable(); } static bool reverseBranchOpc(unsigned Opc, unsigned &ReversedOpc) { @@ -257,6 +435,7 @@ bool DPUInstrInfo::reverseBranchCondition( case DPU::Jcc: case DPU::Jcci: case DPU::Jcc64: + // case DPU::Jcci64: Cond[1].setImm(ISD::getSetCCInverse(ISD::CondCode(Cond[1].getImm()), MVT::i32)); break; default: { @@ -275,6 +454,11 @@ bool DPUInstrInfo::reverseBranchCondition( static void fetchUnconditionalBranchInfo(MachineInstr *Inst, unsigned &targetBasicBlockOperandIndex) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "Inst "; Inst->dump(); + }); + switch (Inst->getOpcode()) { case DPU::JUMPi: targetBasicBlockOperandIndex = 0; @@ -286,21 +470,73 @@ fetchUnconditionalBranchInfo(MachineInstr *Inst, static void fetchConditionalBranchInfo(MachineInstr *Inst, unsigned &targetBasicBlockOperandIndex, - SmallVectorImpl &Cond) { + SmallVectorImpl &Cond, + bool &do_have_metadata) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "Inst "; Inst->dump(); + dbgs() << "Cond.size() " << Cond.size() << "\n"; + for (unsigned i = 0; i < Cond.size(); ++i) { + dbgs() << "Cond[" << i << "] "; Cond[i].dump(); + } + }); + unsigned Opc = Inst->getOpcode(); Cond.push_back(MachineOperand::CreateImm(Opc)); + // for (unsigned int eachOperandIndex = 0; eachOperandIndex < Inst->getNumOperands(); + // eachOperandIndex++) { + // MachineOperand &operand = Inst->getOperand(eachOperandIndex); + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "operand " << eachOperandIndex << ": "; operand.dump(); + // }); + // if (operand.isMBB()) { + // 
targetBasicBlockOperandIndex = eachOperandIndex; + // } else { + // Cond.push_back(operand); + // } + // } unsigned int NumOp = Inst->getNumExplicitOperands(); + // unsigned int NumOp = Inst->getNumOperands(); + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "NumOp " << NumOp << "\n"; + // }); for (unsigned int eachOperandIndex = 0; eachOperandIndex < NumOp; eachOperandIndex++) { + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "operand " << eachOperandIndex << ": "; + // }); MachineOperand &operand = Inst->getOperand(eachOperandIndex); + // LLVM_DEBUG({ + // operand.dump(); + // }); if (operand.isMBB()) { targetBasicBlockOperandIndex = eachOperandIndex; } else { Cond.push_back(operand); } } + + do_have_metadata = false; + for (const MachineOperand &Op : Inst->operands()) { + if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(Inst->getMF()->getFunction().getContext(), "MySpecialMetadata")) { + Cond.push_back(Op); + do_have_metadata = true; + } + } + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "Inst "; Inst->dump(); + dbgs() << "Cond.size() " << Cond.size() << "\n"; + for (unsigned i = 0; i < Cond.size(); ++i) { + dbgs() << "Cond[" << i << "] "; Cond[i].dump(); + } + }); } static inline bool isAnalyzableBranch(MachineInstr *Inst) { @@ -312,6 +548,15 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const { + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "MBB "; MBB.dump(); + for (unsigned i = 0; i < Cond.size(); ++i) { + dbgs() << "Cond[" << i << "] "; Cond[i].dump(); + } + }); + MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend(); // Skip all the debug instructions. 
@@ -331,6 +576,9 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // If not an analyzable branch (e.g., indirect jump), just leave. if (!isAnalyzableBranch(LastInst)) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n"; + }); return true; } @@ -366,18 +614,40 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // Conditional branch if (LastInst->isConditionalBranch()) { unsigned int TBBOpIdx; - fetchConditionalBranchInfo(LastInst, TBBOpIdx, Cond); + bool do_have_metadata = false; + fetchConditionalBranchInfo(LastInst, TBBOpIdx, Cond, do_have_metadata); + if (do_have_metadata) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable metadata\n"; + }); + return true; + } + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "MBB "; MBB.dump(); + // dbgs() << "LastInst "; LastInst->dump(); + // dbgs() << "TBBOpIdx " << TBBOpIdx << "\n"; + // for (unsigned i = 0; i < Cond.size(); ++i) { + // dbgs() << "Cond[" << i << "] "; Cond[i].dump(); + // } + // }); TBB = LastInst->getOperand(TBBOpIdx).getMBB(); return false; } // Unknown branch type + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n"; + }); return true; } // If we reached here, there are two branches. // If there are three terminators, we don't know what sort of block this is. if (++I != REnd && isUnpredicatedTerminator(*I)) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n"; + }); return true; } @@ -386,11 +656,13 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (SecondLastInst->isUnconditionalBranch()) { // Return if the last instruction cannot be removed. 
if (!AllowModify) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n"; + }); return true; } unsigned int TBBOpIdx; fetchUnconditionalBranchInfo(SecondLastInst, TBBOpIdx); - TBB = SecondLastInst->getOperand(TBBOpIdx).getMBB(); LastInst->eraseFromParent(); return false; @@ -400,25 +672,52 @@ bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // Conditional branch followed by an unconditional branch. // The last one must be unconditional. if (!LastInst->isUnconditionalBranch()) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n"; + }); return true; } unsigned int TBBOpIdx; unsigned int FTBBOpIdx; + bool do_have_metadata = false; fetchUnconditionalBranchInfo(LastInst, FTBBOpIdx); - fetchConditionalBranchInfo(SecondLastInst, TBBOpIdx, Cond); + fetchConditionalBranchInfo(SecondLastInst, TBBOpIdx, Cond, do_have_metadata); + if (do_have_metadata) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable metadata\n"; + }); + return true; + } TBB = SecondLastInst->getOperand(TBBOpIdx).getMBB(); FBB = LastInst->getOperand(FTBBOpIdx).getMBB(); - + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "MBB "; MBB.dump(); + // dbgs() << "LastInst "; LastInst->dump(); + // dbgs() << "SecondLastInst "; SecondLastInst->dump(); + // dbgs() << "TBBOpIdx " << TBBOpIdx << "\n"; + + // for (unsigned i = 0; i < Cond.size(); ++i) { + // dbgs() << "Cond[" << i << "] "; Cond[i].dump(); + // } + // }); return false; } // Unknown branch type + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " not analyzable\n"; + }); return true; } unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "MBB "; MBB.dump(); + }); MachineBasicBlock::iterator I = MBB.end(); 
unsigned Count = 0; @@ -444,22 +743,128 @@ unsigned DPUInstrInfo::removeBranch(MachineBasicBlock &MBB, void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL, ArrayRef Cond) const { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "MBB "; MBB.dump(); + for (unsigned i = 0; i < Cond.size(); ++i) { + dbgs() << "Cond[" << i << "] "; Cond[i].dump(); + if (Cond[i].isReg()) { + dbgs() << "Cond[" << i << "] isUse " << Cond[i].isUse() << "\n"; + dbgs() << "Cond[" << i << "] isDef " << Cond[i].isDef() << "\n"; + } + } + }); + + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "DPU::sub_32bit " << DPU::sub_32bit << "\n"; + // dbgs() << "DPU::sub_32bit_hi " << DPU::sub_32bit_hi << "\n"; + // for (unsigned i = 0; i < Cond.size(); ++i) { + // dbgs() << "Cond[" << i << "] = "; Cond[i].dump(); + // if (Cond[i].isReg()) { + // dbgs() << "is Reg\n"; + // dbgs() << Cond[i].getReg() << "\n"; + // dbgs() << Cond[i].getSubReg() << "\n"; + + // dbgs() << "contains " << DPU::GP64_REGRegClass.contains(Cond[i].getReg()) << "\n"; + // } + // } + // }); + MachineInstrBuilder MIB; unsigned Opc = Cond[0].getImm(); - MIB = BuildMI(&MBB, DL, get(Opc)); + // treat special cases + // those where not well handled with LLVM SSA stuff + // bool have_metadata = false; + // TODO: find a better way to discover if it's an arithmetic+comp+jump + // or simply rely solely on metadata? 
+ // switch (Opc) { + // default: + // break; + // case DPU::CLZ_Urrci: + // case DPU::MUL_UL_ULrrrci: + // case DPU::LSLXrrrci: + // case DPU::LSRXrrrci: + // { + // for (unsigned i = 0; i < Cond.size(); ++i) { + // if (Cond[i].isMetadata() + // && Cond[i].getMetadata()->getOperand(0).get() == MDString::get(MBB.getParent()->getFunction().getContext(), "MySpecialMetadata")) { + // have_metadata = true; + // } + // } + // break; + // } + // } - for (unsigned i = 1; i < Cond.size(); ++i) { - if (Cond[i].isReg()) - MIB.addReg(Cond[i].getReg()); - else if (Cond[i].isImm()) + MIB = BuildMI(&MBB, DL, get(Opc)); + // for (unsigned i = 1; i < Cond.size(); ++i) { + // MIB->addOperand(Cond[i]); + // } + + + unsigned start = 1; + // if (have_metadata) { + // MIB = BuildMI(&MBB, DL, get(Opc), Cond[start].getReg()); + // start++; + // } else { + // MIB = BuildMI(&MBB, DL, get(Opc)); + // } + + for (unsigned i = start; i < Cond.size(); ++i) { + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << " working on " << i << "\n"; + // }); + if (Cond[i].isReg()) { + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // }); + // MIB.addReg(Cond[i].getReg()); + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // }); + MIB->addOperand(Cond[i]); + // if (Cond[i].isDef()) { + // // The register in question could potentially be a + // // subreg hi/lo of a 64-bit vreg + // if (unsigned SubReg = Cond[i].getSubReg()) { + // MIB.addDef(Cond[i].getReg(), 0, SubReg); + // } else { + // MIB.addDef(Cond[i].getReg()); + // } + // } else { + // // The register in question could potentially be a + // // subreg hi/lo of a 64-bit vreg + // if (unsigned SubReg = Cond[i].getSubReg()) { + // MIB.addReg(Cond[i].getReg(), 0, SubReg); + // } else { + // MIB.addReg(Cond[i].getReg()); + // } + // } + } else if (Cond[i].isImm()) { MIB.addImm(Cond[i].getImm()); - else + } else if 
(Cond[i].isMetadata()) { + // MIB.addMetadata(Cond[i].getMetadata()); + } else { assert(false && "Cannot copy operand"); + } } MIB.addMBB(TBB); + + // add back remaining metadata + for (unsigned i = 0; i < Cond.size(); ++i) { + if (Cond[i].isMetadata()) { + MIB.addMetadata(Cond[i].getMetadata()); + } + } + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "MIB "; MIB->dump(); + }); } unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB, @@ -467,6 +872,13 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *FBB, ArrayRef Cond, const DebugLoc &DL, int *BytesAdded) const { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "MBB "; MBB.dump(); + for (unsigned i = 0; i < Cond.size(); ++i) { + dbgs() << "Cond[" << i << "] "; Cond[i].dump(); + } + }); unsigned nrOfInsertedMachineInstr = 0; // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -492,5 +904,47 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB, // to instructions added. 
if (BytesAdded) *BytesAdded = nrOfInsertedMachineInstr; + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "MBB "; MBB.dump(); + }); return nrOfInsertedMachineInstr; } + +bool DPUInstrInfo::shouldSink(const MachineInstr &MI) const { + switch (MI.getDesc().getOpcode()) { + default: + break; + case DPU::CLZ_Urr: + case DPU::LSLXrrr: + case DPU::LSRXrrr: + case DPU::ANDrri: + case DPU::JEQrii: + case DPU::JNEQrii: + // case DPU::ADDrrr: + // case DPU::ADDCrrr: + case DPU::SUBrrr: + case DPU::SUBCrrr: + { + // return false; + for (const MachineOperand &Op : MI.operands()) { + if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(MI.getMF()->getFunction().getContext(), "MySpecialMetadata")) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Don't sink because I have MySpecialMetadata.\n"; + MI.dump(); + }); + return false; // Do not sink this instruction + } + } + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " I'm potentially something used in arith+cond+jump from EmitInstrWithCustomInserter but I allow sink because I don't have MySpecialMetadata.\n"; + MI.dump(); + }); + break; + } + } + + // return true; + return TargetInstrInfo::shouldSink(MI); +} diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.h b/llvm/lib/Target/DPU/DPUInstrInfo.h index e9c2a3b920a05..98fc84304958f 100644 --- a/llvm/lib/Target/DPU/DPUInstrInfo.h +++ b/llvm/lib/Target/DPU/DPUInstrInfo.h @@ -43,14 +43,22 @@ class DPUInstrInfo : public DPUGenInstrInfo { const TargetRegisterInfo *TRI) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; - + void expand64BitRegisterAluInstruction(MachineInstr &MI, + MachineBasicBlock &MBB, + unsigned int LsbOpcode, + unsigned int MsbOpcode) const; + void expand64BitImmediateAluInstruction(MachineInstr &MI, + MachineBasicBlock &MBB, + unsigned int LsbOpcode, + unsigned int MsbOpcode) const; + void 
copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; bool reverseBranchCondition(SmallVectorImpl &Cond) const override; - + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, @@ -65,6 +73,8 @@ class DPUInstrInfo : public DPUGenInstrInfo { void buildConditionalBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL, ArrayRef Cond) const; + + bool shouldSink(const MachineInstr &MI) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.td b/llvm/lib/Target/DPU/DPUInstrInfo.td index 6b89c0e906556..b923d56beddea 100644 --- a/llvm/lib/Target/DPU/DPUInstrInfo.td +++ b/llvm/lib/Target/DPU/DPUInstrInfo.td @@ -217,58 +217,66 @@ defm : WramStoreImmPat; def : Pat<(i32 (trunc DoubleReg:$src)), (EXTRACT_SUBREG DoubleReg:$src, sub_32bit)>; -let isMoveImm = 1, isAsCheapAsAMove = 0 in { +let isMoveImm = 1, isAsCheapAsAMove = 0 +, usesCustomInserter = 1 +in { def MOVE64ri: PseudoDPUInstruction< (outs GP64_REG:$dc), (ins i64imm:$imm), "", [(set i64:$dc, (i64 imm:$imm))]>; } -let isAsCheapAsAMove = 0 in { -def ADD64rr: PseudoDPUInstruction< - (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db), - "", - [(set i64:$dc, (add i64:$da, i64:$db))]>; - +let isAsCheapAsAMove = 0 +// , usesCustomInserter = 1 +in { def ADD64ri: PseudoDPUInstruction< (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm), "", [(set i64:$dc, (add i64:$da, (i64 imm:$imm)))]>; -def SUB64rr: PseudoDPUInstruction< - (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db), - "", - [(set i64:$dc, (sub i64:$da, i64:$db))]>; - -def OR64rr: PseudoDPUInstruction< - (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db), +def AND64ri: PseudoDPUInstruction< + (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm), "", - [(set i64:$dc, (or i64:$da, i64:$db))]>; + [(set i64:$dc, (and i64:$da, (i64 imm:$imm)))]>; def OR64ri: 
PseudoDPUInstruction< (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm), "", [(set i64:$dc, (or i64:$da, (i64 imm:$imm)))]>; +def XOR64ri: PseudoDPUInstruction< + (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm), + "", + [(set i64:$dc, (xor i64:$da, (i64 imm:$imm)))]>; +} + +let isAsCheapAsAMove = 0 +// , usesCustomInserter = 1 +in { +def ADD64rr: PseudoDPUInstruction< + (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db), + "", + [(set i64:$dc, (add i64:$da, i64:$db))]>; + def AND64rr: PseudoDPUInstruction< (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db), "", [(set i64:$dc, (and i64:$da, i64:$db))]>; -def AND64ri: PseudoDPUInstruction< - (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm), +def OR64rr: PseudoDPUInstruction< + (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db), "", - [(set i64:$dc, (and i64:$da, (i64 imm:$imm)))]>; + [(set i64:$dc, (or i64:$da, i64:$db))]>; -def XOR64rr: PseudoDPUInstruction< +def SUB64rr: PseudoDPUInstruction< (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db), "", - [(set i64:$dc, (xor i64:$da, i64:$db))]>; + [(set i64:$dc, (sub i64:$da, i64:$db))]>; -def XOR64ri: PseudoDPUInstruction< - (outs GP64_REG:$dc), (ins GP64_REG:$da, i64imm:$imm), +def XOR64rr: PseudoDPUInstruction< + (outs GP64_REG:$dc), (ins GP64_REG:$da, GP64_REG:$db), "", - [(set i64:$dc, (xor i64:$da, (i64 imm:$imm)))]>; + [(set i64:$dc, (xor i64:$da, i64:$db))]>; } // Bit operations: 64 bits emulation. 
@@ -454,26 +462,36 @@ def Jcci: PseudoDPUInstruction< [(DPUBrCCi (i32 imm:$cc), i32:$ra, (s11_imm:$immediate), bb:$dst)] >; -def TmpJcci: PseudoDPUInstruction< - (outs), (ins ccopcode:$cc, OP_REG:$ra, s11_imm:$immediate, GP_REG:$dependency, pcoffset:$dst), - "", - [] - >; +// def TmpJcci: PseudoDPUInstruction< +// (outs), (ins ccopcode:$cc, OP_REG:$ra, s11_imm:$immediate, GP_REG:$dependency, pcoffset:$dst), +// "", +// [] +// >; } -let isBranch = 1, isTerminator = 1, isCompare = 1, isBarrier = 0, isIndirectBranch = 0, isAsCheapAsAMove = 0 in { +let isBranch = 1, isTerminator = 1, isCompare = 1, isBarrier = 0, isIndirectBranch = 0, isAsCheapAsAMove = 0 +// , usesCustomInserter = 1 +in { def Jcc64: PseudoDPUInstruction< (outs), (ins ccopcode:$cc, GP64_REG:$da, GP64_REG:$db, pcoffset:$dst), "", [(DPUBrCC (i32 imm:$cc), i64:$da, i64:$db, bb:$dst)] >; + +// def Jcci64: PseudoDPUInstruction< +// (outs), (ins ccopcode:$cc, GP64_REG:$da, s11_i64_imm:$immediate, pcoffset:$dst), +// "", +// [(DPUBrCCi (i32 imm:$cc), i64:$da, (s11_i64_imm:$immediate), bb:$dst)] +// >; } // ----------------------------------------------------------------------------- // SETCC // ----------------------------------------------------------------------------- -let isAsCheapAsAMove = 0 in { +let isAsCheapAsAMove = 0 +, usesCustomInserter = 1 +in { def SET64cc: PseudoDPUInstruction< (outs GP_REG:$rc), (ins ccopcode:$cc, GP64_REG:$lhs, GP64_REG:$rhs), "", @@ -634,3 +652,15 @@ let usesCustomInserter = 1 in { def MRAM_LOAD64_X32mr : MRAM_LOAD64_X_mr; def MRAM_LOAD_DOUBLEmr: MRAM_LOAD64_X_mr; } + +//===----------------------------------------------------------------------===// +// Bit manipulation instructions +//===----------------------------------------------------------------------===// + +// ((1 << n) - 1) +def : Pat<(sub (shl (i32 1), GP_REG:$n), (i32 1)), + (LSLXrrr LNEG, GP_REG:$n)>; + +def : Pat<(xor (shl (i32 -1), GP_REG:$n), (i32 -1)), + (LSLXrrr LNEG, GP_REG:$n)>; +// ==== diff --git 
a/llvm/lib/Target/DPU/DPUMCInstLower.cpp b/llvm/lib/Target/DPU/DPUMCInstLower.cpp index 311c64f86b142..954f3834cc138 100644 --- a/llvm/lib/Target/DPU/DPUMCInstLower.cpp +++ b/llvm/lib/Target/DPU/DPUMCInstLower.cpp @@ -102,6 +102,7 @@ void DPUMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; case MachineOperand::MO_RegisterMask: + case MachineOperand::MO_Metadata: continue; case MachineOperand::MO_GlobalAddress: diff --git a/llvm/lib/Target/DPU/DPUMacroFusion.cpp b/llvm/lib/Target/DPU/DPUMacroFusion.cpp index a606c017d7cfb..6a14246c852c0 100644 --- a/llvm/lib/Target/DPU/DPUMacroFusion.cpp +++ b/llvm/lib/Target/DPU/DPUMacroFusion.cpp @@ -28,14 +28,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, // We are mainly interested in merging a simple operation with a simple // conditional/unconditional branch LLVM_DEBUG({ - dbgs() << "DPU/Merge: checking macro fusion:\n\t"; - if (!FirstMI) - dbgs() << ""; - else - FirstMI->dump(); - dbgs() << "\n\t"; - SecondMI.dump(); - dbgs() << "\n"; + dbgs() << "DPU/Merge: checking macro fusion:\n"; + if (!FirstMI) { + dbgs() << "\t\n"; + } else { + dbgs() << "\t"; FirstMI->dump(); + } + dbgs() << "\t"; SecondMI.dump(); }); if (!FirstMI) { @@ -51,14 +50,38 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, switch (secondOpc) { default: // todo probably more opportunities (Conditional branches...) 
+ LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "DPU/Merge: the two instructions cannot be fused\n"; + }); return false; case DPU::JUMPi: - case DPU::TmpJcci: + // case DPU::TmpJcci: + break; + case DPU::JNEQrii: + case DPU::JEQrii: + if (!(FirstMI->getOperand(0).isReg() && SecondMI.getOperand(0).isReg() && + (FirstMI->getOperand(0).getReg() == + SecondMI.getOperand(0).getReg()))) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "DPU/Merge: the two instructions cannot be fused\n"; + }); + LLVM_DEBUG({ + dbgs() << "first reg " << FirstMI->getOperand(0).getReg() << "\n"; + dbgs() << "second reg " << SecondMI.getOperand(0).getReg() << "\n"; + }); + return false; + } break; case DPU::Jcci: if (!(FirstMI->getOperand(0).isReg() && SecondMI.getOperand(1).isReg() && - (FirstMI->getOperand(0).getReg() == - SecondMI.getOperand(1).getReg()))) { + (FirstMI->getOperand(0).getReg() == + SecondMI.getOperand(1).getReg()))) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "DPU/Merge: the two instructions cannot be fused\n"; + }); return false; } break; @@ -68,7 +91,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, default: // todo probably more opportunities (Operations with specific immediate // operands, call...) 
- LLVM_DEBUG(dbgs() << "DPU/Merge: the two instructions cannot be fused\n"); + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "DPU/Merge: the two instructions cannot be fused\n"; + }); return false; case DPU::ADDrri: case DPU::ADDrrr: @@ -92,6 +118,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case DPU::RORrrr: case DPU::RORrri: case DPU::CLZrr: + case DPU::CLZ_Urr: case DPU::CAOrr: case DPU::MUL_UL_ULrrr: case DPU::MUL_SL_ULrrr: diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp index 998d4f0d4bcc5..c96a23c933e17 100644 --- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp +++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp @@ -6,6 +6,9 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// + +// TODO: expand to more situation of arith+comp+branch + #include "DPUTargetMachine.h" #include #include @@ -653,12 +656,25 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB, LLVM_DEBUG(dbgs() << "KO: Unknown LastOpc\n"); return false; case DPU::JUMPi: { + // this is currently wrong + // we morph the branch from unconditional to conditional + // by this, we modify the CFG by creating artificially a fall through which is not declared + // so, it's bugged + return false; + // + if (!ImmCanBeEncodedOn8Bits) { LLVM_DEBUG( dbgs() << "KO: LastOpc == DPU::JUMPi && !ImmCanBeEncodedOn8Bits\n"); return false; } + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "before change: \n"; + dbgs() << "** MBB "; MBB->dump(); + }); + int64_t actualCondition = ISD::SETTRUE2; MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(OpJumpOpc)) @@ -687,14 +703,33 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB, auto actualConditionOperand = MachineOperand::CreateImm(actualCondition); 
ComboInst.add(actualConditionOperand).add(LastInst->getOperand(0)); - LLVM_DEBUG(dbgs() << "OK\n"; LastInst->dump(); SecondLastInst->dump();); + LLVM_DEBUG({ + dbgs() << "OK\n"; + dbgs() << "del "; SecondLastInst->dump(); + dbgs() << "del "; LastInst->dump(); + dbgs() << "fused to\n"; + dbgs() << "add "; ComboInst->dump(); + }); + LastInst->eraseFromParent(); SecondLastInst->eraseFromParent(); + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "after change: \n"; + dbgs() << "** MBB "; MBB->dump(); + }); + return true; } - case DPU::TmpJcci: + // case DPU::TmpJcci: case DPU::Jcci: { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "before change: (if any)\n"; + dbgs() << "** MBB "; MBB->dump(); + }); + bool isSourceCondition = false; if (SecondLastInst->getOperand(0).getReg() != @@ -757,11 +792,17 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB, "&& !isSourceCondition) && (!ImmCanBeEncodedOn11Bits)\n"); return false; } + if (SecondLastOpc == DPU::MOVEri || SecondLastOpc == DPU::MOVErr) { + LLVM_DEBUG( + dbgs() + << "KO: move to zero is invalid\n"); + return false; + } // todo: this is not optimal. One register has been allocated but not used // now. This can become an issue (unnecessary spilling) ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), InstrInfo.get(OpNullJumpOpc)) - .addReg(DPU::ZERO); + .addReg(DPU::ZERO); } else { if (!ImmCanBeEncodedOn8Bits) { LLVM_DEBUG( @@ -794,14 +835,30 @@ static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB, break; } - LastInst->getOperand(0).setImm(actualCondition); - ComboInst.add(LastInst->getOperand(0)) + // why modify the original instruction ??? 
+ // LastInst->getOperand(0).setImm(actualCondition); + // ComboInst.add(LastInst->getOperand(0)) + // .add(LastInst->getOperand(LastInst->getNumOperands() - 1)); + ComboInst.addImm(actualCondition) .add(LastInst->getOperand(LastInst->getNumOperands() - 1)); - LLVM_DEBUG(dbgs() << "OK\n"; LastInst->dump(); SecondLastInst->dump();); + LLVM_DEBUG({ + dbgs() << "OK\n"; + dbgs() << "del "; SecondLastInst->dump(); + dbgs() << "del "; LastInst->dump(); + dbgs() << "fused to\n"; + dbgs() << "add "; ComboInst->dump(); + }); + LastInst->eraseFromParent(); SecondLastInst->eraseFromParent(); + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "after change: \n"; + dbgs() << "** MBB "; MBB->dump(); + }); + return true; } case DPU::Jcc: @@ -821,9 +878,19 @@ bool DPUMergeComboInstrPass::runOnMachineFunction(MachineFunction &MF) { for (auto &MFI : MF) { MachineBasicBlock *MBB = &MFI; - LLVM_DEBUG(MBB->dump()); - changeMade |= mergeComboInstructionsInMBB(MBB, InstrInfo); + // LLVM_DEBUG(MBB->dump()); + + bool local_change = mergeComboInstructionsInMBB(MBB, InstrInfo); + if (local_change) { + // LLVM_DEBUG({ + // dbgs() << "\nchanged to:\n"; + // MBB->dump(); + // }); + changeMade = true; + } } + LLVM_DEBUG(dbgs() << "********** DPU/MergeComboInstrPass: " << MF.getName() + << " done: changeMade = " << changeMade << " **********\n\n"); return changeMade; } diff --git a/llvm/lib/Target/DPU/DPUPostRAFusion.cpp b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp new file mode 100644 index 0000000000000..a3cc5ab25e5d5 --- /dev/null +++ b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp @@ -0,0 +1,256 @@ +#include "DPUTargetMachine.h" +#include "DPU.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include + +#define GET_INSTRINFO_ENUM + +#include "DPUCondCodes.h" +#include "DPUGenInstrInfo.inc" +#include "DPUISelLowering.h" +#include "MCTargetDesc/DPUAsmCondition.h" + +#define GET_REGINFO_ENUM +#include "DPUGenRegisterInfo.inc" + +#define DEBUG_TYPE 
"dpu-postra-fusion" + +using namespace llvm; + +namespace { +class DPUPostRAFusionPass : public MachineFunctionPass { +public: + static char ID; + + explicit DPUPostRAFusionPass(DPUTargetMachine &tm) + : MachineFunctionPass(ID), TM(tm) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + llvm::StringRef getPassName() const override { + return "DPU PostRA Fussion"; + } + +private: + const DPUTargetMachine &TM; +}; + +char DPUPostRAFusionPass::ID = 0; +} // namespace + +FunctionPass *llvm::createDPUPostRAFusionPass(DPUTargetMachine &tm) { + return new DPUPostRAFusionPass(tm); +} + +static MachineInstr * +getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I, + MachineBasicBlock::reverse_iterator REnd) { + // Skip all the debug instructions. + while (I != REnd && + (I->isDebugValue() || I->getOpcode() == TargetOpcode::DBG_VALUE)) { + ++I; + } + if (I == REnd) { + return NULL; + } + return &*I; +} + +static bool do_have_special_metadata(MachineInstr *MI) { + for (const MachineOperand &Op : MI->operands()) { + if (Op.isMetadata() && Op.getMetadata()->getOperand(0).get() == MDString::get(MI->getMF()->getFunction().getContext(), "MySpecialMetadata")) { + return true; + } + } + + return false; +} + +static bool runOnMachineBB(MachineBasicBlock *MBB, + const DPUInstrInfo &InstrInfo) { + MachineBasicBlock::reverse_iterator I = MBB->rbegin(), REnd = MBB->rend(); + MachineInstr *LastInst, *SecondLastInst; + unsigned int LastOpc, SecondLastOpc; + + // LLVMContext &Context = MBB->getParent()->getFunction().getContext(); + // MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata")); + + LastInst = getLastNonDebugInstrFrom(I, REnd); + if (LastInst == NULL) { + // LLVM_DEBUG(dbgs() << "KO: I == REnd\n"); + return false; + } + I++; + SecondLastInst = getLastNonDebugInstrFrom(I, REnd); + if (SecondLastInst == NULL) { + // LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n"); + return false; + } + + if (!do_have_special_metadata(LastInst) + || 
!do_have_special_metadata(SecondLastInst)) + return false; + + LastOpc = LastInst->getOpcode(); + SecondLastOpc = SecondLastInst->getOpcode(); + + // attempt to merge lslx/lsrx and XX 32 jeq XX 32 instructions + // TODO: check if it's shift32 as well? + // or maybe use other metadata? + // but this is to be extra careful, or for the next player in the game ... :) + // though, here I apply only when with my metadata + // but if I actually not test my metadata, maybe + // and add JNEQrii, I could pop both + // and why not tackle other possible optim that may have introduce this code + // event from user maybe + // original code is JEQrii, but JNEQrii could be introduce by analyzeBranch + if ((LastOpc == DPU::JEQrii || LastOpc == DPU::JNEQrii) + && SecondLastOpc == DPU::ANDrri) { + I++; + MachineInstr *ThirdLastInst = getLastNonDebugInstrFrom(I, REnd); + if (ThirdLastInst == NULL) { + // LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n"); + return false; + } + if (!do_have_special_metadata(ThirdLastInst)) + return false; + + unsigned int ThirdLastOpc = ThirdLastInst->getOpcode(); + if ((ThirdLastOpc == DPU::LSLXrrr || ThirdLastOpc == DPU::LSRXrrr)) { + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "before change: \n"; + dbgs() << "** MBB "; MBB->dump(); + }); + + unsigned int new_opcode = (ThirdLastOpc == DPU::LSLXrrr ? 
+ DPU::LSLXrrrci : DPU::LSRXrrrci); + MachineInstrBuilder ComboInst = BuildMI(MBB, ThirdLastInst->getDebugLoc(), + InstrInfo.get(new_opcode)); + ComboInst.add(ThirdLastInst->getOperand(0)); + ComboInst.add(ThirdLastInst->getOperand(1)); + ComboInst.add(ThirdLastInst->getOperand(2)); + ComboInst.addImm(DPUAsmCondition::Condition::Shift32); + ComboInst.addMBB(LastInst->getOperand(2).getMBB()); + // ComboInst.addMetadata(N); now that we merge, we don't need to prohibit sinking. NOTE(review): Shift32 is emitted above for both JEQrii and JNEQrii; confirm the inverted branch does not need the opposite condition + + LLVM_DEBUG({ + dbgs() << "OK\n"; + dbgs() << "del "; ThirdLastInst->dump(); + dbgs() << "del "; SecondLastInst->dump(); + dbgs() << "del "; LastInst->dump(); + dbgs() << "fused to\n"; + dbgs() << "add "; ComboInst->dump(); + }); + + LastInst->eraseFromParent(); + SecondLastInst->eraseFromParent(); + ThirdLastInst->eraseFromParent(); + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "after change: \n"; + dbgs() << "** MBB "; MBB->dump(); + }); + return true; + } + } + + // attempt to optimize MUL_UL_ULrrr + comp res 256 + branch + // original code is JLTUrii, but JGEUrii could be introduced by analyzeBranch. NOTE(review): Small is emitted below for both opcodes; confirm JGEUrii does not need the opposite condition + if ((LastOpc == DPU::JLTUrii || LastOpc == DPU::JGEUrii) + && SecondLastOpc == DPU::MUL_UL_ULrrr) { + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "before change: \n"; + dbgs() << "** MBB "; MBB->dump(); + }); + + MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), + InstrInfo.get(DPU::MUL_UL_ULrrrci)); + ComboInst.add(SecondLastInst->getOperand(0)); + ComboInst.add(SecondLastInst->getOperand(1)); + ComboInst.add(SecondLastInst->getOperand(2)); + ComboInst.addImm(DPUAsmCondition::Small); + ComboInst.add(LastInst->getOperand(2)); + // ComboInst.addMetadata(N); now that we merge, we don't need to prohibit sinking + + LLVM_DEBUG({ + dbgs() << "OK\n"; + dbgs() << "del "; SecondLastInst->dump(); + dbgs() << "del "; LastInst->dump(); + dbgs() << "fused to\n"; + dbgs() << "add "; 
ComboInst->dump(); + }); + LastInst->eraseFromParent(); + SecondLastInst->eraseFromParent(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "after change: \n"; + dbgs() << "** MBB "; MBB->dump(); + }); + + return true; + } + + // original code is JNEQrii, but JEQrii could be introduce by analyzeBranch + if ((LastOpc == DPU::JNEQrii || LastOpc == DPU::JEQrii) + && SecondLastOpc == DPU::CLZ_Urr) { + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "before change: \n"; + dbgs() << "** MBB "; MBB->dump(); + }); + + MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), + InstrInfo.get(DPU::CLZ_Urrci)); + ComboInst.add(SecondLastInst->getOperand(0)); + ComboInst.add(SecondLastInst->getOperand(1)); + ComboInst.addImm((LastOpc == DPU::JNEQrii) ? + DPUAsmCondition::Condition::NotMaximum : DPUAsmCondition::Condition::Maximum); + ComboInst.add(LastInst->getOperand(2)); + // ComboInst.addMetadata(N); now that we merge, we don't need to prohibe sink + + LLVM_DEBUG({ + dbgs() << "OK\n"; + dbgs() << "del "; SecondLastInst->dump(); + dbgs() << "del "; LastInst->dump(); + dbgs() << "fused to\n"; + dbgs() << "add "; ComboInst->dump(); + }); + + LastInst->eraseFromParent(); + SecondLastInst->eraseFromParent(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "after change: \n"; + dbgs() << "** MBB "; MBB->dump(); + }); + + return true; + } + + return false; +} + +bool DPUPostRAFusionPass::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "********** DPU/DPUPostRAFusionPass: " << MF.getName() + << " **********\n\n"); + + auto &SubTarget = static_cast(MF.getSubtarget()); + auto &InstrInfo = *SubTarget.getInstrInfo(); + bool changeMade = false; + + for (auto &MFI : MF) { + MachineBasicBlock *MBB = &MFI; + changeMade |= runOnMachineBB(MBB, InstrInfo); + } + + LLVM_DEBUG(dbgs() << "********** 
DPU/DPUPostRAFusionPass: " << MF.getName() + << " done: changeMade = " << changeMade << " **********\n\n"); + return changeMade; +} diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.cpp b/llvm/lib/Target/DPU/DPURegisterInfo.cpp index 778ac2343a5c4..705b05ca0e746 100644 --- a/llvm/lib/Target/DPU/DPURegisterInfo.cpp +++ b/llvm/lib/Target/DPU/DPURegisterInfo.cpp @@ -50,28 +50,41 @@ DPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { BitVector DPURegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector reserved = BitVector(getNumRegs()); - reserved.set(DPU::D22); - reserved.set(DPU::R22); - reserved.set(DPU::R23); - reserved.set(DPU::ZERO); - reserved.set(DPU::ONE); - reserved.set(DPU::LNEG); - reserved.set(DPU::MNEG); - reserved.set(DPU::ID); - reserved.set(DPU::ID2); - reserved.set(DPU::ID4); - reserved.set(DPU::ID8); - reserved.set(DPU::MAJ_D22); - reserved.set(DPU::MAJ_R22); - reserved.set(DPU::MAJ_R23); - reserved.set(DPU::MAJ_ZERO); - reserved.set(DPU::MAJ_ONE); - reserved.set(DPU::MAJ_LNEG); - reserved.set(DPU::MAJ_MNEG); - reserved.set(DPU::MAJ_ID); - reserved.set(DPU::MAJ_ID2); - reserved.set(DPU::MAJ_ID4); - reserved.set(DPU::MAJ_ID8); + + markSuperRegs(reserved, DPU::D22); + markSuperRegs(reserved, DPU::R22); + markSuperRegs(reserved, DPU::R23); + markSuperRegs(reserved, DPU::ZERO); + markSuperRegs(reserved, DPU::ONE); + markSuperRegs(reserved, DPU::LNEG); + markSuperRegs(reserved, DPU::MNEG); + markSuperRegs(reserved, DPU::ID); + markSuperRegs(reserved, DPU::ID2); + markSuperRegs(reserved, DPU::ID4); + markSuperRegs(reserved, DPU::ID8); + assert(checkAllSuperRegsMarked(reserved)); + // reserved.set(DPU::D22); + // reserved.set(DPU::R22); + // reserved.set(DPU::R23); + // reserved.set(DPU::ZERO); + // reserved.set(DPU::ONE); + // reserved.set(DPU::LNEG); + // reserved.set(DPU::MNEG); + // reserved.set(DPU::ID); + // reserved.set(DPU::ID2); + // reserved.set(DPU::ID4); + // reserved.set(DPU::ID8); + // 
reserved.set(DPU::MAJ_D22); + // reserved.set(DPU::MAJ_R22); + // reserved.set(DPU::MAJ_R23); + // reserved.set(DPU::MAJ_ZERO); + // reserved.set(DPU::MAJ_ONE); + // reserved.set(DPU::MAJ_LNEG); + // reserved.set(DPU::MAJ_MNEG); + // reserved.set(DPU::MAJ_ID); + // reserved.set(DPU::MAJ_ID2); + // reserved.set(DPU::MAJ_ID4); + // reserved.set(DPU::MAJ_ID8); return reserved; } @@ -167,3 +180,19 @@ DPURegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/, CallingConv::ID /*CC*/) const { return CSR_RegMask; } + +bool DPURegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { + switch(PhysReg) { + default: + return false; + case DPU::ZERO: + case DPU::ONE: + case DPU::LNEG: + case DPU::MNEG: + case DPU::ID: + case DPU::ID2: + case DPU::ID4: + case DPU::ID8: + return true; + } +} diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.h b/llvm/lib/Target/DPU/DPURegisterInfo.h index 5d769d6a0d9d7..25d9c575a3967 100644 --- a/llvm/lib/Target/DPU/DPURegisterInfo.h +++ b/llvm/lib/Target/DPU/DPURegisterInfo.h @@ -37,6 +37,8 @@ struct DPURegisterInfo : public DPUGenRegisterInfo { const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; + bool isConstantPhysReg(MCRegister PhysReg) const override; + bool requiresRegisterScavenging(const MachineFunction &MF) const override { return true; } diff --git a/llvm/lib/Target/DPU/DPURegisterInfo.td b/llvm/lib/Target/DPU/DPURegisterInfo.td index caa0d84670555..06c44a9aaeac2 100644 --- a/llvm/lib/Target/DPU/DPURegisterInfo.td +++ b/llvm/lib/Target/DPU/DPURegisterInfo.td @@ -56,31 +56,31 @@ def R22 : DPUReg<22, "r22">, DwarfRegNum<[22]>; // R23: reserved as the return address for functions def R23 : DPUReg<23, "r23">, DwarfRegNum<[23]>; -// Thread data registers -def MAJ_R0 : DPUReg<0, "R0", [], [R0]>, DwarfRegNum<[0]>; -def MAJ_R1 : DPUReg<1, "R1", [], [R1]>, DwarfRegNum<[1]>; -def MAJ_R2 : DPUReg<2, "R2", [], [R2]>, DwarfRegNum<[2]>; -def MAJ_R3 : DPUReg<3, "R3", [], [R3]>, 
DwarfRegNum<[3]>; -def MAJ_R4 : DPUReg<4, "R4", [], [R4]>, DwarfRegNum<[4]>; -def MAJ_R5 : DPUReg<5, "R5", [], [R5]>, DwarfRegNum<[5]>; -def MAJ_R6 : DPUReg<6, "R6", [], [R6]>, DwarfRegNum<[6]>; -def MAJ_R7 : DPUReg<7, "R7", [], [R7]>, DwarfRegNum<[7]>; -def MAJ_R8 : DPUReg<8, "R8", [], [R8]>, DwarfRegNum<[8]>; -def MAJ_R9 : DPUReg<9, "R9", [], [R9]>, DwarfRegNum<[9]>; -def MAJ_R10 : DPUReg<10, "R10", [], [R10]>, DwarfRegNum<[10]>; -def MAJ_R11 : DPUReg<11, "R11", [], [R11]>, DwarfRegNum<[11]>; -def MAJ_R12 : DPUReg<12, "R12", [], [R12]>, DwarfRegNum<[12]>; -def MAJ_R13 : DPUReg<13, "R13", [], [R13]>, DwarfRegNum<[13]>; -def MAJ_R14 : DPUReg<14, "R14", [], [R14]>, DwarfRegNum<[14]>; -def MAJ_R15 : DPUReg<15, "R15", [], [R15]>, DwarfRegNum<[15]>; -def MAJ_R16 : DPUReg<16, "R16", [], [R16]>, DwarfRegNum<[16]>; -def MAJ_R17 : DPUReg<17, "R17", [], [R17]>, DwarfRegNum<[17]>; -def MAJ_R18 : DPUReg<18, "R18", [], [R18]>, DwarfRegNum<[18]>; -def MAJ_R19 : DPUReg<19, "R19", [], [R19]>, DwarfRegNum<[19]>; -def MAJ_R20 : DPUReg<20, "R20", [], [R20]>, DwarfRegNum<[20]>; -def MAJ_R21 : DPUReg<21, "R21", [], [R21]>, DwarfRegNum<[21]>; -def MAJ_R22 : DPUReg<22, "R22", [], [R22]>, DwarfRegNum<[22]>; -def MAJ_R23 : DPUReg<23, "R23", [], [R23]>, DwarfRegNum<[23]>; +// // Thread data registers +// def MAJ_R0 : DPUReg<0, "R0", [], [R0]>, DwarfRegNum<[0]>; +// def MAJ_R1 : DPUReg<1, "R1", [], [R1]>, DwarfRegNum<[1]>; +// def MAJ_R2 : DPUReg<2, "R2", [], [R2]>, DwarfRegNum<[2]>; +// def MAJ_R3 : DPUReg<3, "R3", [], [R3]>, DwarfRegNum<[3]>; +// def MAJ_R4 : DPUReg<4, "R4", [], [R4]>, DwarfRegNum<[4]>; +// def MAJ_R5 : DPUReg<5, "R5", [], [R5]>, DwarfRegNum<[5]>; +// def MAJ_R6 : DPUReg<6, "R6", [], [R6]>, DwarfRegNum<[6]>; +// def MAJ_R7 : DPUReg<7, "R7", [], [R7]>, DwarfRegNum<[7]>; +// def MAJ_R8 : DPUReg<8, "R8", [], [R8]>, DwarfRegNum<[8]>; +// def MAJ_R9 : DPUReg<9, "R9", [], [R9]>, DwarfRegNum<[9]>; +// def MAJ_R10 : DPUReg<10, "R10", [], [R10]>, DwarfRegNum<[10]>; +// def MAJ_R11 
: DPUReg<11, "R11", [], [R11]>, DwarfRegNum<[11]>; +// def MAJ_R12 : DPUReg<12, "R12", [], [R12]>, DwarfRegNum<[12]>; +// def MAJ_R13 : DPUReg<13, "R13", [], [R13]>, DwarfRegNum<[13]>; +// def MAJ_R14 : DPUReg<14, "R14", [], [R14]>, DwarfRegNum<[14]>; +// def MAJ_R15 : DPUReg<15, "R15", [], [R15]>, DwarfRegNum<[15]>; +// def MAJ_R16 : DPUReg<16, "R16", [], [R16]>, DwarfRegNum<[16]>; +// def MAJ_R17 : DPUReg<17, "R17", [], [R17]>, DwarfRegNum<[17]>; +// def MAJ_R18 : DPUReg<18, "R18", [], [R18]>, DwarfRegNum<[18]>; +// def MAJ_R19 : DPUReg<19, "R19", [], [R19]>, DwarfRegNum<[19]>; +// def MAJ_R20 : DPUReg<20, "R20", [], [R20]>, DwarfRegNum<[20]>; +// def MAJ_R21 : DPUReg<21, "R21", [], [R21]>, DwarfRegNum<[21]>; +// def MAJ_R22 : DPUReg<22, "R22", [], [R22]>, DwarfRegNum<[22]>; +// def MAJ_R23 : DPUReg<23, "R23", [], [R23]>, DwarfRegNum<[23]>; // Thread data registers, extended to 64 bits. let SubRegIndices = [sub_32bit, sub_32bit_hi], CoveredBySubRegs = 1 in { @@ -97,39 +97,39 @@ let SubRegIndices = [sub_32bit, sub_32bit_hi], CoveredBySubRegs = 1 in { def D20 : DPUReg<20, "d20", [R21, R20]>; def D22 : DPUReg<22, "d22", [R23, R22]>; - def MAJ_D0 : DPUReg<0, "D0", [MAJ_R1, MAJ_R0], [D0]>; - def MAJ_D2 : DPUReg<2, "D2", [MAJ_R3, MAJ_R2], [D2]>; - def MAJ_D4 : DPUReg<4, "D4", [MAJ_R5, MAJ_R4], [D4]>; - def MAJ_D6 : DPUReg<6, "D6", [MAJ_R7, MAJ_R6], [D6]>; - def MAJ_D8 : DPUReg<8, "D8", [MAJ_R9, MAJ_R8], [D8]>; - def MAJ_D10 : DPUReg<10, "D10", [MAJ_R11, MAJ_R10], [D10]>; - def MAJ_D12 : DPUReg<12, "D12", [MAJ_R13, MAJ_R12], [D12]>; - def MAJ_D14 : DPUReg<14, "D14", [MAJ_R15, MAJ_R14], [D14]>; - def MAJ_D16 : DPUReg<16, "D16", [MAJ_R17, MAJ_R16], [D16]>; - def MAJ_D18 : DPUReg<18, "D18", [MAJ_R19, MAJ_R18], [D18]>; - def MAJ_D20 : DPUReg<20, "D20", [MAJ_R21, MAJ_R20], [D20]>; - def MAJ_D22 : DPUReg<22, "D22", [MAJ_R23, MAJ_R22], [D22]>; + // def MAJ_D0 : DPUReg<0, "D0", [MAJ_R1, MAJ_R0], [D0]>; + // def MAJ_D2 : DPUReg<2, "D2", [MAJ_R3, MAJ_R2], [D2]>; + // def MAJ_D4 : 
DPUReg<4, "D4", [MAJ_R5, MAJ_R4], [D4]>; + // def MAJ_D6 : DPUReg<6, "D6", [MAJ_R7, MAJ_R6], [D6]>; + // def MAJ_D8 : DPUReg<8, "D8", [MAJ_R9, MAJ_R8], [D8]>; + // def MAJ_D10 : DPUReg<10, "D10", [MAJ_R11, MAJ_R10], [D10]>; + // def MAJ_D12 : DPUReg<12, "D12", [MAJ_R13, MAJ_R12], [D12]>; + // def MAJ_D14 : DPUReg<14, "D14", [MAJ_R15, MAJ_R14], [D14]>; + // def MAJ_D16 : DPUReg<16, "D16", [MAJ_R17, MAJ_R16], [D16]>; + // def MAJ_D18 : DPUReg<18, "D18", [MAJ_R19, MAJ_R18], [D18]>; + // def MAJ_D20 : DPUReg<20, "D20", [MAJ_R21, MAJ_R20], [D20]>; + // def MAJ_D22 : DPUReg<22, "D22", [MAJ_R23, MAJ_R22], [D22]>; } // Constant registers. def ZERO: DPUReg<24, "zero">; -def MAJ_ZERO: DPUReg<24, "ZERO", [], [ZERO]>; +// def MAJ_ZERO: DPUReg<24, "ZERO", [], [ZERO]>; def ONE: DPUReg<25, "one">; -def MAJ_ONE: DPUReg<25, "ONE", [], [ONE]>; +// def MAJ_ONE: DPUReg<25, "ONE", [], [ONE]>; def LNEG: DPUReg<26, "lneg">; -def MAJ_LNEG: DPUReg<26, "LNEG", [], [LNEG]>; +// def MAJ_LNEG: DPUReg<26, "LNEG", [], [LNEG]>; def MNEG: DPUReg<27, "mneg">; -def MAJ_MNEG: DPUReg<27, "MNEG", [], [MNEG]>; +// def MAJ_MNEG: DPUReg<27, "MNEG", [], [MNEG]>; // Thread id registers. Return the thread identification for the // current thread, times 1, 2, 4, 8. def ID: DPUReg<28, "id">; def ID2: DPUReg<29, "id2">; def ID4: DPUReg<30, "id4">; def ID8: DPUReg<31, "id8">; -def MAJ_ID: DPUReg<28, "ID", [], [ID]>; -def MAJ_ID2: DPUReg<29, "ID2", [], [ID2]>; -def MAJ_ID4: DPUReg<30, "ID4", [], [ID4]>; -def MAJ_ID8: DPUReg<31, "ID8", [], [ID8]>; +// def MAJ_ID: DPUReg<28, "ID", [], [ID]>; +// def MAJ_ID2: DPUReg<29, "ID2", [], [ID2]>; +// def MAJ_ID4: DPUReg<30, "ID4", [], [ID4]>; +// def MAJ_ID8: DPUReg<31, "ID8", [], [ID8]>; // Define the register class representing this bank of general // purpose registers used by ONE thread. @@ -139,16 +139,41 @@ def MAJ_ID8: DPUReg<31, "ID8", [], [ID8]>; // that can be used as an instruction operand. 
// Hide the reserved registers, so that we are very sure that the compiler will // not do anything with them. -def GP_REG : RegisterClass<"DPU", [i32], 32, (add (sequence "R%u", 0, 23), (sequence "MAJ_R%u", 0, 23))>; -def CONST_REG : RegisterClass<"DPU", [i32], 32, (add ZERO, ONE, LNEG, MNEG, MAJ_ZERO, MAJ_ONE, MAJ_LNEG, MAJ_MNEG)>; -def ID_REG : RegisterClass<"DPU", [i32], 32, (add ID, ID2, ID4, ID8, MAJ_ID, MAJ_ID2, MAJ_ID4, MAJ_ID8)>; -def ZERO_REG : RegisterClass<"DPU", [i32], 32, (add ZERO, MAJ_ZERO)>; +def GP_REG : RegisterClass<"DPU", [i32], 32, (add (sequence "R%u", 0, 23) +// , (sequence "MAJ_R%u", 0, 23) +// , ZERO +// , ONE +// ,LNEG //<-- there is an issue with this one: lsr_add r2, lneg, r2, 3 seems to be understood as sats r2, r2 ... encoding problem??? +// ,MNEG //<-- this one as well +// in fact they cause more trouble now. +// probably they are not well specified elsewhere +// or encoding/decoding are not well tested properly with register constraints ... +// need to check that +// because register coalescing could be really interesting ... +// move $d/r 0/1/-1 could be potentially removed +// will check that later, first: correctness +)>; + +def CONST_REG : RegisterClass<"DPU", [i32], 32, (add ZERO, ONE, LNEG, MNEG +// , MAJ_ZERO, MAJ_ONE, MAJ_LNEG, MAJ_MNEG +)>; + +def ID_REG : RegisterClass<"DPU", [i32], 32, (add ID, ID2, ID4, ID8 +//, MAJ_ID, MAJ_ID2, MAJ_ID4, MAJ_ID8 +)>; + +def ZERO_REG : RegisterClass<"DPU", [i32], 32, (add ZERO +// , MAJ_ZERO +)>; + def OP_REG : RegisterClass<"DPU", [i32], 32, (add GP_REG, CONST_REG, ID_REG)>; def GPZ_REG : RegisterClass<"DPU", [i32], 32, (add GP_REG, ZERO_REG)>; // 64 bits registers are the combinations of 2 consecutive registers. 
def GP64_REG : RegisterClass<"DPU", [i64], 64, - (add D0, D2, D4, D6, D8, D10, D12, D14, D16, D18, D20, D22, MAJ_D0, MAJ_D2, MAJ_D4, MAJ_D6, MAJ_D8, MAJ_D10, MAJ_D12, MAJ_D14, MAJ_D16, MAJ_D18, MAJ_D20, MAJ_D22)>; + (add D0, D2, D4, D6, D8, D10, D12, D14, D16, D18, D20, D22 + // , MAJ_D0, MAJ_D2, MAJ_D4, MAJ_D6, MAJ_D8, MAJ_D10, MAJ_D12, MAJ_D14, MAJ_D16, MAJ_D18, MAJ_D20, MAJ_D22 + )>; def S0: DPUReg<0, "s0">; def S1: DPUReg<1, "s1">; diff --git a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp index bbfb4fec0d67e..cdbe91cbc44d3 100644 --- a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp +++ b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp @@ -7,6 +7,16 @@ // //===----------------------------------------------------------------------===// +// possibly move that earlier in the pipeline +// all simple arithmetic could be moved to in EmitInstrWithCustomInserter pre regalloc and other optim +// here I needed to add some option again, because we tweak it postRA +// if we do that express them directly during ISEL, we would benefit more natural optimization earlier +// also, possibility of FastIsel and GlobalSel instead of InstructionSel ... 
+ +// TODO: expand test cases for splicing stuff +// need_splice = 0/1 x canFallThrough = 0/1 +// and/or doing Jcc and Setcc earlier as well + #include "DPU.h" #include "DPUInstrInfo.h" #include "DPUSubtarget.h" @@ -119,6 +129,13 @@ static void resolve64BitImmediateAluInstruction( MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo, unsigned int LsbOpcode, unsigned int MsbOpcode) { + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + }); + MachineFunction *MF = MBB->getParent(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); @@ -143,12 +160,23 @@ static void resolve64BitImmediateAluInstruction( MSBDestReg) .addReg(MSBDOp1Reg) .addImm(MSBOp2Imm); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** MBB: "; MBB->dump(); + }); } static void resolve64BitRegisterAluInstruction( MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo, unsigned int LsbOpcode, unsigned int MsbOpcode) { + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + }); MachineFunction *MF = MBB->getParent(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); @@ -165,14 +193,33 @@ static void resolve64BitRegisterAluInstruction( unsigned int LSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit); unsigned int MSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit_hi); - BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(LsbOpcode), + MachineInstrBuilder MIBDestLsb; + MIBDestLsb = BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(LsbOpcode), LSBDestReg) .addReg(LSBDOp1Reg) .addReg(LSBOp2Reg); - BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(MsbOpcode), + + 
MachineInstrBuilder MIBDestMsb; + MIBDestMsb = BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(MsbOpcode), MSBDestReg) .addReg(MSBDOp1Reg) .addReg(MSBOp2Reg); + + for (unsigned i = 0; i < 3; i++) { + if (MBBIter->getOperand(i).isRenamable()) { + MIBDestLsb->getOperand(i).setIsRenamable(); + MIBDestMsb->getOperand(i).setIsRenamable(); + } + if (MBBIter->getOperand(i).isKill()) { + MIBDestLsb->getOperand(i).setIsKill(); + MIBDestMsb->getOperand(i).setIsKill(); + } + } + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** MBB: "; MBB->dump(); + }); } static void resolveJeq64(MachineBasicBlock *MBB, @@ -181,21 +228,48 @@ static void resolveJeq64(MachineBasicBlock *MBB, const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); MachineFunction *F = MBB->getParent(); + + bool need_splice = std::next(MBBIter) != MBB->end(); + + MachineBasicBlock *FTMBB = MBB->getFallThrough(); + MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "** need_splice: " << need_splice << "\n"; + dbgs() << "** canFallThrough: " << MBB->canFallThrough() << "\n"; + if (MBB->canFallThrough()) { + dbgs() << "** FTMBB: "; FTMBB->dump(); + } + dbgs() << "** JumpMBB: "; JumpMBB->dump(); + dbgs() << "****** \n"; + }); + MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *endMBB; + F->insert(I, trueMBB); - F->insert(I, endMBB); - // Update machine-CFG edges by transferring all successors of the current - // block to the new block which will contain the Phi node for the select. 
- endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end()); - endMBB->transferSuccessorsAndUpdatePHIs(MBB); + if (need_splice) { + endMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(I, endMBB); + // Update machine-CFG edges by transferring all successors of the current + // block to the new block which will contain the Phi node for the select. + endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end()); + endMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(endMBB); + endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true); + } else { + endMBB = FTMBB; + MBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true); + } + // Next, add the true and fallthrough blocks as its successors. - auto JumpMBB = MBBIter->getOperand(3).getMBB(); MBB->addSuccessor(trueMBB); - MBB->addSuccessor(endMBB); trueMBB->addSuccessor(JumpMBB); trueMBB->addSuccessor(endMBB); - + unsigned int Op1Reg = MBBIter->getOperand(1).getReg(); unsigned int Op2Reg = MBBIter->getOperand(2).getReg(); @@ -215,6 +289,20 @@ static void resolveJeq64(MachineBasicBlock *MBB, .addReg(MsbOp1Reg) .addReg(MsbOp2Reg) .addMBB(JumpMBB); + + trueMBB->addLiveIn(MsbOp1Reg); + trueMBB->addLiveIn(MsbOp2Reg); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** need_splice: " << need_splice << "\n"; + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "** trueMBB: "; trueMBB->dump(); + dbgs() << "** endMBB: "; endMBB->dump(); + if (FTMBB) { dbgs() << "** FTMBB: "; FTMBB->dump(); } + dbgs() << "** JumpMBB: "; JumpMBB->dump(); + dbgs() << "****** \n"; + }); } static void resolveJneq64(MachineBasicBlock *MBB, @@ -223,18 +311,44 @@ static void resolveJneq64(MachineBasicBlock *MBB, const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); MachineFunction *F = MBB->getParent(); + + bool need_splice = std::next(MBBIter) != MBB->end(); + bool canFallThrough = MBB->canFallThrough(); + MachineBasicBlock 
*FTMBB = MBB->getFallThrough(); + MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "** need_splice: " << need_splice << "\n"; + dbgs() << "** canFallThrough: " << canFallThrough << "\n"; + if (canFallThrough) { + dbgs() << "** FTMBB: "; FTMBB->dump(); + } + dbgs() << "** JumpMBB: "; JumpMBB->dump(); + dbgs() << "****** \n"; + }); + MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *endMBB; F->insert(I, trueMBB); - F->insert(I, endMBB); - // Update machine-CFG edges by transferring all successors of the current - // block to the new block which will contain the Phi node for the select. - endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end()); - endMBB->transferSuccessorsAndUpdatePHIs(MBB); - // Next, add the true and fallthrough blocks as its successors. - auto JumpMBB = MBBIter->getOperand(3).getMBB(); + + if (need_splice) { + endMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(I, endMBB); + // Update machine-CFG edges by transferring all successors of the current + // block to the new block which will contain the Phi node for the select. 
+ endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end()); + endMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(JumpMBB); + endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true); + } else { + endMBB = FTMBB; + MBB->removeSuccessor(endMBB, /* NormalizeSuccProbs = */ true); + } + MBB->addSuccessor(trueMBB); - MBB->addSuccessor(JumpMBB); trueMBB->addSuccessor(JumpMBB); trueMBB->addSuccessor(endMBB); @@ -257,12 +371,35 @@ static void resolveJneq64(MachineBasicBlock *MBB, .addReg(MsbOp1Reg) .addReg(MsbOp2Reg) .addMBB(JumpMBB); + + trueMBB->addLiveIn(MsbOp1Reg); + trueMBB->addLiveIn(MsbOp2Reg); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** need_splice: " << need_splice << "\n"; + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "** trueMBB: "; trueMBB->dump(); + dbgs() << "** endMBB: "; endMBB->dump(); + if (canFallThrough) { + dbgs() << "** FTMBB: "; FTMBB->dump(); + } + dbgs() << "** JumpMBB: "; JumpMBB->dump(); + dbgs() << "****** \n"; + }); } static void resolveJcc64AsSub64(MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo, DPUAsmCondition::Condition Cond) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + unsigned int Op1Reg = MBBIter->getOperand(1).getReg(); unsigned int Op2Reg = MBBIter->getOperand(2).getReg(); auto JumpMBB = MBBIter->getOperand(3).getMBB(); @@ -284,11 +421,20 @@ static void resolveJcc64AsSub64(MachineBasicBlock *MBB, .addReg(MsbOp2Reg) .addImm(Cond) .addMBB(JumpMBB); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** MBB: "; MBB->dump(); + }); } static void resolveJcc64(MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << 
__LINE__ << " " << __func__ << "\n"; + }); + switch (MBBIter->getOperand(0).getImm()) { default: llvm_unreachable("invalid condition"); @@ -304,45 +450,214 @@ static void resolveJcc64(MachineBasicBlock *MBB, break; case ISD::SETOGT: case ISD::SETGT: + LLVM_DEBUG({ dbgs() << "GT " << ISD::SETOGT << " " << ISD::SETGT << "\n"; }); resolveJcc64AsSub64(MBB, MBBIter, InstrInfo, DPUAsmCondition::Condition::ExtendedGreaterThanSigned); break; case ISD::SETOGE: case ISD::SETGE: + LLVM_DEBUG({ dbgs() << "GE " << ISD::SETOGE << " " << ISD::SETGE << "\n"; }); resolveJcc64AsSub64(MBB, MBBIter, InstrInfo, DPUAsmCondition::Condition::GreaterOrEqualSigned); break; case ISD::SETOLT: case ISD::SETLT: + LLVM_DEBUG({ dbgs() << "LT " << ISD::SETOLT << " " << ISD::SETLT << "\n"; }); resolveJcc64AsSub64(MBB, MBBIter, InstrInfo, DPUAsmCondition::Condition::LessThanSigned); break; case ISD::SETOLE: case ISD::SETLE: + LLVM_DEBUG({ dbgs() << "LE " << ISD::SETOLE << " " << ISD::SETLE << "\n"; }); resolveJcc64AsSub64(MBB, MBBIter, InstrInfo, DPUAsmCondition::Condition::ExtendedLessOrEqualSigned); break; case ISD::SETUGT: - resolveJcc64AsSub64( - MBB, MBBIter, InstrInfo, - DPUAsmCondition::Condition::ExtendedGreaterThanUnsigned); + LLVM_DEBUG({ dbgs() << "UGT " << ISD::SETUGT << "\n"; }); + resolveJcc64AsSub64(MBB, MBBIter, InstrInfo, + DPUAsmCondition::Condition::ExtendedGreaterThanUnsigned); + break; case ISD::SETUGE: + LLVM_DEBUG({ dbgs() << "UGE " << ISD::SETUGE << "\n"; }); resolveJcc64AsSub64(MBB, MBBIter, InstrInfo, DPUAsmCondition::Condition::GreaterOrEqualUnsigned); break; case ISD::SETULT: + LLVM_DEBUG({ dbgs() << "ULT " << ISD::SETULT << "\n"; }); resolveJcc64AsSub64(MBB, MBBIter, InstrInfo, DPUAsmCondition::Condition::LessThanUnsigned); break; case ISD::SETULE: - resolveJcc64AsSub64( - MBB, MBBIter, InstrInfo, - DPUAsmCondition::Condition::ExtendedLessOrEqualUnsigned); + LLVM_DEBUG({ dbgs() << "ULE " << ISD::SETULE << "\n"; }); + resolveJcc64AsSub64(MBB, MBBIter, InstrInfo, + 
DPUAsmCondition::Condition::ExtendedLessOrEqualUnsigned); break; } } +static void resolveJcci64(MachineBasicBlock *MBB, + MachineBasicBlock::iterator MBBIter, + const DPUInstrInfo &InstrInfo) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + unsigned int OpCode = + findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), true); + const MachineInstrBuilder &MIB = + BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode)); + MIB.add(MBBIter->getOperand(1)).add(MBBIter->getOperand(2)); + + for (unsigned int i = MBBIter->getNumOperands() - 1; i >= 3; --i) { + MachineOperand &Operand = MBBIter->getOperand(i); + + if (Operand.isMBB()) { + MIB.add(Operand); + break; + } + } + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** MBB: "; MBB->dump(); + }); +} + +static void resolveMOVE64ri(MachineBasicBlock *MBB, + MachineBasicBlock::iterator MBBIter, + const DPUInstrInfo &InstrInfo) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + MachineFunction *MF = MBB->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + unsigned int DestReg = MBBIter->getOperand(0).getReg(); + int64_t Op1Imm = MBBIter->getOperand(1).getImm(); + + int64_t LSBOp1Imm = Op1Imm & 0xFFFFFFFFl; + int64_t MSBOp1Imm = (Op1Imm >> 32) & 0xFFFFFFFFl; + unsigned int LSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit); + unsigned int MSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit_hi); + + BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri), + LSBDestReg) + .addImm(LSBOp1Imm); + BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri), + MSBDestReg) + 
.addImm(MSBOp1Imm); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** MBB: "; MBB->dump(); + }); +} + +static void resolveSET64cc(MachineBasicBlock *MBB, + MachineBasicBlock::iterator MBBIter, + const DPUInstrInfo &InstrInfo) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + MachineFunction *MF = MBB->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + unsigned int DestReg = MBBIter->getOperand(0).getReg(); + auto ImmCond = static_cast(MBBIter->getOperand(1).getImm()); + unsigned int Op1Reg = MBBIter->getOperand(2).getReg(); + unsigned int Op2Reg = MBBIter->getOperand(3).getReg(); + + DPUAsmCondition::Condition SetCondition = + findSelect64SetConditionFor(ImmCond); + + unsigned int LSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit); + unsigned int MSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit_hi); + + unsigned int LSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit); + unsigned int MSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit_hi); + + BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::SUBzrr)) + .addReg(DPU::ZERO) + .addReg(LSBDOp1Reg) + .addReg(LSBOp2Reg); + BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), + InstrInfo.get(DPU::SUBCrrrc), DestReg) + .addReg(MSBDOp1Reg) + .addReg(MSBOp2Reg) + .addImm(SetCondition); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** MBB: "; MBB->dump(); + }); +} + +static void resolveJcc(MachineBasicBlock *MBB, + MachineBasicBlock::iterator MBBIter, + const DPUInstrInfo &InstrInfo) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + unsigned int OpCode = + 
findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), false); + BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode)) + .add(MBBIter->getOperand(1)) + .add(MBBIter->getOperand(2)) + .add(MBBIter->getOperand(3)); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** MBB: "; MBB->dump(); + }); +} + +static void resolveJcci(MachineBasicBlock *MBB, + MachineBasicBlock::iterator MBBIter, + const DPUInstrInfo &InstrInfo) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + unsigned int OpCode = + findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), true); + const MachineInstrBuilder &MIB = + BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode)); + MIB.add(MBBIter->getOperand(1)).add(MBBIter->getOperand(2)); + + for (unsigned int i = MBBIter->getNumOperands() - 1; i >= 3; --i) { + MachineOperand &Operand = MBBIter->getOperand(i); + + if (Operand.isMBB()) { + MIB.add(Operand); + break; + } + } + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** MBB: "; MBB->dump(); + }); +} + static bool resolveMacroInstructionsInMBB(MachineBasicBlock *MBB, const DPUInstrInfo &InstrInfo) { bool Modified = false; @@ -355,130 +670,77 @@ static bool resolveMacroInstructionsInMBB(MachineBasicBlock *MBB, default: InstrModified = false; break; - case DPU::Jcc: { - unsigned int OpCode = - findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), false); - BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode)) - .add(MBBIter->getOperand(1)) - .add(MBBIter->getOperand(2)) - .add(MBBIter->getOperand(3)); + + case DPU::Jcc: + resolveJcc(MBB, MBBIter, InstrInfo); break; - } - case DPU::TmpJcci: - case DPU::Jcci: { - unsigned int OpCode = - 
findJumpOpcodeForCondition(MBBIter->getOperand(0).getImm(), true); - const MachineInstrBuilder &MIB = - BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(OpCode)); - MIB.add(MBBIter->getOperand(1)).add(MBBIter->getOperand(2)); - - for (unsigned int i = MBBIter->getNumOperands() - 1; i >= 3; --i) { - MachineOperand &Operand = MBBIter->getOperand(i); - - if (Operand.isMBB()) { - MIB.add(Operand); - break; - } - } + // case DPU::TmpJcci: + case DPU::Jcci: + resolveJcci(MBB, MBBIter, InstrInfo); break; - } + case DPU::Jcc64: resolveJcc64(MBB, MBBIter, InstrInfo); break; - case DPU::SET64cc: { - MachineFunction *MF = MBB->getParent(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - - unsigned int DestReg = MBBIter->getOperand(0).getReg(); - auto ImmCond = static_cast( - MBBIter->getOperand(1).getImm()); - unsigned int Op1Reg = MBBIter->getOperand(2).getReg(); - unsigned int Op2Reg = MBBIter->getOperand(3).getReg(); - - DPUAsmCondition::Condition SetCondition = - findSelect64SetConditionFor(ImmCond); - - unsigned int LSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit); - unsigned int MSBDOp1Reg = TRI->getSubReg(Op1Reg, DPU::sub_32bit_hi); - - unsigned int LSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit); - unsigned int MSBOp2Reg = TRI->getSubReg(Op2Reg, DPU::sub_32bit_hi); - - BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::SUBzrr)) - .addReg(DPU::ZERO) - .addReg(LSBDOp1Reg) - .addReg(LSBOp2Reg); - BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), - InstrInfo.get(DPU::SUBCrrrc), DestReg) - .addReg(MSBDOp1Reg) - .addReg(MSBOp2Reg) - .addImm(SetCondition); - break; - } - case DPU::MOVE64ri: { - MachineFunction *MF = MBB->getParent(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - - unsigned int DestReg = MBBIter->getOperand(0).getReg(); - int64_t Op1Imm = MBBIter->getOperand(1).getImm(); - - int64_t LSBOp1Imm = Op1Imm & 0xFFFFFFFFl; - int64_t MSBOp1Imm = (Op1Imm >> 32) & 0xFFFFFFFFl; - 
unsigned int LSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit); - unsigned int MSBDestReg = TRI->getSubReg(DestReg, DPU::sub_32bit_hi); - - BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri), - LSBDestReg) - .addImm(LSBOp1Imm); - BuildMI(*MBB, MBBIter, MBBIter->getDebugLoc(), InstrInfo.get(DPU::MOVEri), - MSBDestReg) - .addImm(MSBOp1Imm); - break; - } - case DPU::ADD64rr: - resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrrr, - DPU::ADDCrrr); - break; - case DPU::ADD64ri: - resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrri, - DPU::ADDCrri); - break; - case DPU::SUB64rr: - resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::SUBrrr, - DPU::SUBCrrr); - break; - case DPU::OR64rr: - resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrrr, - DPU::ORrrr); - break; - case DPU::OR64ri: - resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrri, - DPU::ORrri); - break; - case DPU::AND64rr: - resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrrr, - DPU::ANDrrr); - break; - case DPU::AND64ri: - resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrri, - DPU::ANDrri); - break; - case DPU::XOR64rr: - resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrrr, - DPU::XORrrr); - break; - case DPU::XOR64ri: - resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrri, - DPU::XORrri); - break; + // case DPU::Jcci64: + // resolveJcci64(MBB, MBBIter, InstrInfo); + // break; + + // case DPU::SET64cc: + // resolveSET64cc(MBB, MBBIter, InstrInfo); + // break; + + // case DPU::MOVE64ri: + // resolveMOVE64ri(MBB, MBBIter, InstrInfo); + // break; + + // case DPU::ADD64rr: + // resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrrr, + // DPU::ADDCrrr); + // break; + // case DPU::AND64rr: + // resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrrr, + // DPU::ANDrrr); + // break; + // case 
DPU::OR64rr: + // resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrrr, + // DPU::ORrrr); + // break; + // case DPU::SUB64rr: + // resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::SUBrrr, + // DPU::SUBCrrr); + // break; + // case DPU::XOR64rr: + // resolve64BitRegisterAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrrr, + // DPU::XORrrr); + // break; + + // case DPU::AND64ri: + // resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ANDrri, + // DPU::ANDrri); + // break; + // case DPU::ADD64ri: + // resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ADDrri, + // DPU::ADDCrri); + // break; + // case DPU::OR64ri: + // resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::ORrri, + // DPU::ORrri); + // break; + // case DPU::XOR64ri: + // resolve64BitImmediateAluInstruction(MBB, MBBIter, InstrInfo, DPU::XORrri, + // DPU::XORrri); + // break; + } if (InstrModified) { MBB->erase(MBBIter++); - Modified = true; - } else { + Modified |= true; + } + else { ++MBBIter; } } @@ -499,5 +761,7 @@ bool DPUResolveMacroInstrPass::runOnMachineFunction(MachineFunction &MF) { changeMade |= resolveMacroInstructionsInMBB(MBB, InstrInfo); } + LLVM_DEBUG(dbgs() << "********** DPU/ResolveMacroInstrPass: " << MF.getName() + << " done: changeMade = " << changeMade << " **********\n\n"); return changeMade; } diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp index 95ed30c7086ec..0b220529df968 100644 --- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp +++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp @@ -89,13 +89,14 @@ DPUTargetLowering::DPUTargetLowering(const TargetMachine &TM, DPUSubtarget &STI) PredictableSelectIsExpensive = true; setJumpIsExpensive(false); - setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); - setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); - setLibcallName(RTLIB::SDIV_I32, "__div32"); - setLibcallName(RTLIB::UDIV_I32, "__udiv32"); + // 
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); + // setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); + // setLibcallName(RTLIB::SDIV_I32, "__div32"); + // setLibcallName(RTLIB::UDIV_I32, "__udiv32"); // Set up the register classes. addRegisterClass(MVT::i32, &DPU::GP_REGRegClass); + // addRegisterClass(MVT::i32, &DPU::CONST_REGRegClass); addRegisterClass(MVT::i64, &DPU::GP64_REGRegClass); // Compute derived properties from the register classes @@ -226,7 +227,7 @@ DPUTargetLowering::DPUTargetLowering(const TargetMachine &TM, DPUSubtarget &STI) setOperationAction(ISD::BR_CC, MVT::i16, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); - + setOperationAction(ISD::ADDC, MVT::i8, Expand); setOperationAction(ISD::ADDC, MVT::i16, Expand); setOperationAction(ISD::ADDC, MVT::i32, Expand); @@ -382,23 +383,23 @@ SDValue DPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerVAARG(Op, DAG); default: { - const char *NodeName = getTargetNodeName(Op.getOpcode()); LLVM_DEBUG({ - dbgs() << "FAIL: "; - Op.dump(&DAG); - }); - if (NodeName != nullptr) { - LLVM_DEBUG(dbgs() << "\tnode name = " << NodeName << "\n"); - } - for (unsigned eachOp = 0; eachOp < Op.getNumOperands(); eachOp++) { - LLVM_DEBUG({ - dbgs() << "\toperand #" << std::to_string(eachOp) << " = "; - Op.getOperand(eachOp).dump(&DAG); + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "FAIL: "; + Op.dump(&DAG); + dbgs() << "\n"; + const char *NodeName = getTargetNodeName(Op.getOpcode()); + if (NodeName != nullptr) { + dbgs() << "\tnode name = " << NodeName << "\n"; + } + for (unsigned eachOp = 0; eachOp < Op.getNumOperands(); eachOp++) { + dbgs() << "\toperand #" << std::to_string(eachOp) << " = "; + Op.getOperand(eachOp).dump(&DAG); + } }); - } - } report_fatal_error("NOT implemented: lowering of such a type of SDValue"); } + } } const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) 
const { @@ -433,18 +434,18 @@ const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const { return "DPUISD::BrCC"; case DPUISD::BrCCi: return "DPUISD::BrCCi"; - case DPUISD::BrCCZero: - return "DPUISD::BrCCZero"; - case DPUISD::OrJCCZero: - return "DPUISD::OrJCCZero"; - case DPUISD::AndJCCZero: - return "DPUISD::AndJCCZero"; - case DPUISD::XorJCCZero: - return "DPUISD::XorJCCZero"; - case DPUISD::AddJCCZero: - return "DPUISD::AddJCCZero"; - case DPUISD::SubJCCZero: - return "DPUISD::SubJCCZero"; + // case DPUISD::BrCCZero: + // return "DPUISD::BrCCZero"; + // case DPUISD::OrJCCZero: + // return "DPUISD::OrJCCZero"; + // case DPUISD::AndJCCZero: + // return "DPUISD::AndJCCZero"; + // case DPUISD::XorJCCZero: + // return "DPUISD::XorJCCZero"; + // case DPUISD::AddJCCZero: + // return "DPUISD::AddJCCZero"; + // case DPUISD::SubJCCZero: + // return "DPUISD::SubJCCZero"; case DPUISD::Wrapper: return "DPUISD::Wrapper"; case DPUISD::TRUNC64: @@ -491,12 +492,12 @@ const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const { return "DPUISD::MUL16_SU"; case DPUISD::MUL16_SS: return "DPUISD::MUL16_SS"; - case DPUISD::Addc: - return "DPUISD::Addc"; - case DPUISD::Subc: - return "DPUISD::Subc"; - case DPUISD::Rsubc: - return "DPUISD::Rsubc"; + // case DPUISD::Addc: + // return "DPUISD::Addc"; + // case DPUISD::Subc: + // return "DPUISD::Subc"; + // case DPUISD::Rsubc: + // return "DPUISD::Rsubc"; case DPUISD::Clo: return "DPUISD::Clo"; case DPUISD::Cls: @@ -515,154 +516,154 @@ const char *DPUTargetLowering::getTargetNodeName(unsigned Opcode) const { return "DPUISD::Lsr1x"; case DPUISD::LslAdd: return "DPUISD::LslAdd"; - case DPUISD::AddJcc: - return "DPUISD::AddJcc"; - case DPUISD::AddNullJcc: - return "DPUISD::AddNullJcc"; - case DPUISD::AddcJcc: - return "DPUISD::AddcJcc"; - case DPUISD::AddcNullJcc: - return "DPUISD::AddcNullJcc"; - case DPUISD::AndJcc: - return "DPUISD::AndJcc"; - case DPUISD::AndNullJcc: - return "DPUISD::AndNullJcc"; - case 
DPUISD::OrJcc: - return "DPUISD::OrJcc"; - case DPUISD::OrNullJcc: - return "DPUISD::OrNullJcc"; - case DPUISD::XorJcc: - return "DPUISD::XorJcc"; - case DPUISD::XorNullJcc: - return "DPUISD::XorNullJcc"; - case DPUISD::NandJcc: - return "DPUISD::NandJcc"; - case DPUISD::NandNullJcc: - return "DPUISD::NandNullJcc"; - case DPUISD::NorJcc: - return "DPUISD::NorJcc"; - case DPUISD::NorNullJcc: - return "DPUISD::NorNullJcc"; - case DPUISD::NxorJcc: - return "DPUISD::NxorJcc"; - case DPUISD::NxorNullJcc: - return "DPUISD::NxorNullJcc"; - case DPUISD::AndnJcc: - return "DPUISD::AndnJcc"; - case DPUISD::AndnNullJcc: - return "DPUISD::AndnNullJcc"; - case DPUISD::OrnJcc: - return "DPUISD::OrnJcc"; - case DPUISD::OrnNullJcc: - return "DPUISD::OrnNullJcc"; - case DPUISD::LslJcc: - return "DPUISD::LslJcc"; - case DPUISD::LslNullJcc: - return "DPUISD::LslNullJcc"; - case DPUISD::LslxJcc: - return "DPUISD::LslxJcc"; - case DPUISD::LslxNullJcc: - return "DPUISD::LslxNullJcc"; - case DPUISD::Lsl1Jcc: - return "DPUISD::Lsl1Jcc"; - case DPUISD::Lsl1NullJcc: - return "DPUISD::Lsl1NullJcc"; - case DPUISD::Lsl1xJcc: - return "DPUISD::Lsl1xJcc"; - case DPUISD::Lsl1xNullJcc: - return "DPUISD::Lsl1xNullJcc"; - case DPUISD::LsrJcc: - return "DPUISD::LsrJcc"; - case DPUISD::LsrNullJcc: - return "DPUISD::LsrNullJcc"; - case DPUISD::LsrxJcc: - return "DPUISD::LsrxJcc"; - case DPUISD::LsrxNullJcc: - return "DPUISD::LsrxNullJcc"; - case DPUISD::Lsr1Jcc: - return "DPUISD::Lsr1Jcc"; - case DPUISD::Lsr1NullJcc: - return "DPUISD::Lsr1NullJcc"; - case DPUISD::Lsr1xJcc: - return "DPUISD::Lsr1xJcc"; - case DPUISD::Lsr1xNullJcc: - return "DPUISD::Lsr1xNullJcc"; - case DPUISD::AsrJcc: - return "DPUISD::AsrJcc"; - case DPUISD::AsrNullJcc: - return "DPUISD::AsrNullJcc"; - case DPUISD::RolJcc: - return "DPUISD::RolJcc"; - case DPUISD::RolNullJcc: - return "DPUISD::RolNullJcc"; - case DPUISD::RorJcc: - return "DPUISD::RorJcc"; - case DPUISD::RorNullJcc: - return "DPUISD::RorNullJcc"; - case 
DPUISD::MUL8_UUJcc: - return "DPUISD::MUL8_UUJcc"; - case DPUISD::MUL8_UUNullJcc: - return "DPUISD::MUL8_UUNullJcc"; - case DPUISD::MUL8_SUJcc: - return "DPUISD::MUL8_SUJcc"; - case DPUISD::MUL8_SUNullJcc: - return "DPUISD::MUL8_SUNullJcc"; - case DPUISD::MUL8_SSJcc: - return "DPUISD::MUL8_SSJcc"; - case DPUISD::MUL8_SSNullJcc: - return "DPUISD::MUL8_SSNullJcc"; - case DPUISD::SubJcc: - return "DPUISD::SubJcc"; - case DPUISD::SubNullJcc: - return "DPUISD::SubNullJcc"; - case DPUISD::RsubJcc: - return "DPUISD::RsubJcc"; - case DPUISD::RsubNullJcc: - return "DPUISD::RsubNullJcc"; - case DPUISD::SubcJcc: - return "DPUISD::SubcJcc"; - case DPUISD::SubcNullJcc: - return "DPUISD::SubcNullJcc"; - case DPUISD::RsubcJcc: - return "DPUISD::RsubcJcc"; - case DPUISD::RsubcNullJcc: - return "DPUISD::RsubcNullJcc"; - case DPUISD::CaoJcc: - return "DPUISD::CaoJcc"; - case DPUISD::CaoNullJcc: - return "DPUISD::CaoNullJcc"; - case DPUISD::ClzJcc: - return "DPUISD::ClzJcc"; - case DPUISD::ClzNullJcc: - return "DPUISD::ClzNullJcc"; - case DPUISD::CloJcc: - return "DPUISD::CloJcc"; - case DPUISD::CloNullJcc: - return "DPUISD::CloNullJcc"; - case DPUISD::ClsJcc: - return "DPUISD::ClsJcc"; - case DPUISD::ClsNullJcc: - return "DPUISD::ClsNullJcc"; - case DPUISD::MoveJcc: - return "DPUISD::MoveJcc"; - case DPUISD::MoveNullJcc: - return "DPUISD::MoveNullJcc"; - case DPUISD::RolAddJcc: - return "DPUISD::RolAddJcc"; - case DPUISD::RolAddNullJcc: - return "DPUISD::RolAddNullJcc"; - case DPUISD::LsrAddJcc: - return "DPUISD::LsrAddJcc"; - case DPUISD::LsrAddNullJcc: - return "DPUISD::LsrAddNullJcc"; - case DPUISD::LslAddJcc: - return "DPUISD::LslAddJcc"; - case DPUISD::LslAddNullJcc: - return "DPUISD::LslAddNullJcc"; - case DPUISD::LslSubJcc: - return "DPUISD::LslSubJcc"; - case DPUISD::LslSubNullJcc: - return "DPUISD::LslSubNullJcc"; + // case DPUISD::AddJcc: + // return "DPUISD::AddJcc"; + // case DPUISD::AddNullJcc: + // return "DPUISD::AddNullJcc"; + // case DPUISD::AddcJcc: + // return 
"DPUISD::AddcJcc"; + // case DPUISD::AddcNullJcc: + // return "DPUISD::AddcNullJcc"; + // case DPUISD::AndJcc: + // return "DPUISD::AndJcc"; + // case DPUISD::AndNullJcc: + // return "DPUISD::AndNullJcc"; + // case DPUISD::OrJcc: + // return "DPUISD::OrJcc"; + // case DPUISD::OrNullJcc: + // return "DPUISD::OrNullJcc"; + // case DPUISD::XorJcc: + // return "DPUISD::XorJcc"; + // case DPUISD::XorNullJcc: + // return "DPUISD::XorNullJcc"; + // case DPUISD::NandJcc: + // return "DPUISD::NandJcc"; + // case DPUISD::NandNullJcc: + // return "DPUISD::NandNullJcc"; + // case DPUISD::NorJcc: + // return "DPUISD::NorJcc"; + // case DPUISD::NorNullJcc: + // return "DPUISD::NorNullJcc"; + // case DPUISD::NxorJcc: + // return "DPUISD::NxorJcc"; + // case DPUISD::NxorNullJcc: + // return "DPUISD::NxorNullJcc"; + // case DPUISD::AndnJcc: + // return "DPUISD::AndnJcc"; + // case DPUISD::AndnNullJcc: + // return "DPUISD::AndnNullJcc"; + // case DPUISD::OrnJcc: + // return "DPUISD::OrnJcc"; + // case DPUISD::OrnNullJcc: + // return "DPUISD::OrnNullJcc"; + // case DPUISD::LslJcc: + // return "DPUISD::LslJcc"; + // case DPUISD::LslNullJcc: + // return "DPUISD::LslNullJcc"; + // case DPUISD::LslxJcc: + // return "DPUISD::LslxJcc"; + // case DPUISD::LslxNullJcc: + // return "DPUISD::LslxNullJcc"; + // case DPUISD::Lsl1Jcc: + // return "DPUISD::Lsl1Jcc"; + // case DPUISD::Lsl1NullJcc: + // return "DPUISD::Lsl1NullJcc"; + // case DPUISD::Lsl1xJcc: + // return "DPUISD::Lsl1xJcc"; + // case DPUISD::Lsl1xNullJcc: + // return "DPUISD::Lsl1xNullJcc"; + // case DPUISD::LsrJcc: + // return "DPUISD::LsrJcc"; + // case DPUISD::LsrNullJcc: + // return "DPUISD::LsrNullJcc"; + // case DPUISD::LsrxJcc: + // return "DPUISD::LsrxJcc"; + // case DPUISD::LsrxNullJcc: + // return "DPUISD::LsrxNullJcc"; + // case DPUISD::Lsr1Jcc: + // return "DPUISD::Lsr1Jcc"; + // case DPUISD::Lsr1NullJcc: + // return "DPUISD::Lsr1NullJcc"; + // case DPUISD::Lsr1xJcc: + // return "DPUISD::Lsr1xJcc"; + // case 
DPUISD::Lsr1xNullJcc: + // return "DPUISD::Lsr1xNullJcc"; + // case DPUISD::AsrJcc: + // return "DPUISD::AsrJcc"; + // case DPUISD::AsrNullJcc: + // return "DPUISD::AsrNullJcc"; + // case DPUISD::RolJcc: + // return "DPUISD::RolJcc"; + // case DPUISD::RolNullJcc: + // return "DPUISD::RolNullJcc"; + // case DPUISD::RorJcc: + // return "DPUISD::RorJcc"; + // case DPUISD::RorNullJcc: + // return "DPUISD::RorNullJcc"; + // case DPUISD::MUL8_UUJcc: + // return "DPUISD::MUL8_UUJcc"; + // case DPUISD::MUL8_UUNullJcc: + // return "DPUISD::MUL8_UUNullJcc"; + // case DPUISD::MUL8_SUJcc: + // return "DPUISD::MUL8_SUJcc"; + // case DPUISD::MUL8_SUNullJcc: + // return "DPUISD::MUL8_SUNullJcc"; + // case DPUISD::MUL8_SSJcc: + // return "DPUISD::MUL8_SSJcc"; + // case DPUISD::MUL8_SSNullJcc: + // return "DPUISD::MUL8_SSNullJcc"; + // case DPUISD::SubJcc: + // return "DPUISD::SubJcc"; + // case DPUISD::SubNullJcc: + // return "DPUISD::SubNullJcc"; + // case DPUISD::RsubJcc: + // return "DPUISD::RsubJcc"; + // case DPUISD::RsubNullJcc: + // return "DPUISD::RsubNullJcc"; + // case DPUISD::SubcJcc: + // return "DPUISD::SubcJcc"; + // case DPUISD::SubcNullJcc: + // return "DPUISD::SubcNullJcc"; + // case DPUISD::RsubcJcc: + // return "DPUISD::RsubcJcc"; + // case DPUISD::RsubcNullJcc: + // return "DPUISD::RsubcNullJcc"; + // case DPUISD::CaoJcc: + // return "DPUISD::CaoJcc"; + // case DPUISD::CaoNullJcc: + // return "DPUISD::CaoNullJcc"; + // case DPUISD::ClzJcc: + // return "DPUISD::ClzJcc"; + // case DPUISD::ClzNullJcc: + // return "DPUISD::ClzNullJcc"; + // case DPUISD::CloJcc: + // return "DPUISD::CloJcc"; + // case DPUISD::CloNullJcc: + // return "DPUISD::CloNullJcc"; + // case DPUISD::ClsJcc: + // return "DPUISD::ClsJcc"; + // case DPUISD::ClsNullJcc: + // return "DPUISD::ClsNullJcc"; + // case DPUISD::MoveJcc: + // return "DPUISD::MoveJcc"; + // case DPUISD::MoveNullJcc: + // return "DPUISD::MoveNullJcc"; + // case DPUISD::RolAddJcc: + // return "DPUISD::RolAddJcc"; + // case 
DPUISD::RolAddNullJcc: + // return "DPUISD::RolAddNullJcc"; + // case DPUISD::LsrAddJcc: + // return "DPUISD::LsrAddJcc"; + // case DPUISD::LsrAddNullJcc: + // return "DPUISD::LsrAddNullJcc"; + // case DPUISD::LslAddJcc: + // return "DPUISD::LslAddJcc"; + // case DPUISD::LslAddNullJcc: + // return "DPUISD::LslAddNullJcc"; + // case DPUISD::LslSubJcc: + // return "DPUISD::LslSubJcc"; + // case DPUISD::LslSubNullJcc: + // return "DPUISD::LslSubNullJcc"; case DPUISD::TEST_NODE: return "DPUISD::TEST_NODE"; } @@ -1737,11 +1738,24 @@ SDValue DPUTargetLowering::LowerBrCc(SDValue Op, SelectionDAG &DAG) const { // First, let's determine if there is a constant operand we can keep as // immediate. + ConstantSDNode *LC = dyn_cast(leftOp); ConstantSDNode *C = dyn_cast(rightOp); - + LLVM_DEBUG({ + dbgs() << "leftOp "; leftOp->dump(); + dbgs() << "rightOp "; rightOp->dump(); + if (LC) { + dbgs() << "a const: "; LC->dump(); + } + + if (C) { + dbgs() << "a const: "; C->dump(); + } + }); + // todo: handle 64bit compare with immediate - if (!(C && isLegalICmpImmediate(C->getSExtValue())) || - (rightOp.getValueType().getSimpleVT().SimpleTy == MVT::i64)) { + if (!(C && isLegalICmpImmediate(C->getSExtValue())) + || (rightOp.getValueType().getSimpleVT().SimpleTy == MVT::i64) + ) { // No suitable constant found. We cannot do anything special. 
SDValue Chain = Op.getOperand(0); SDLoc dl(Op); @@ -2029,6 +2043,12 @@ static MachineBasicBlock * EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, unsigned MulLL, unsigned MulHL, unsigned MulHL2, unsigned MulHH) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MI.dump(); + dbgs() << "** BB: "; BB->dump(); + dbgs() << "****** \n"; + }); const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -2061,12 +2081,26 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, unsigned int LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); unsigned int LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); - BuildMI(BB, dl, TII.get(MulLL), LLDest) + LLVMContext &Context = F->getFunction().getContext(); + MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata")); + // BuildMI(BB, dl, TII.get(MulLL), LLDest) + // .addReg(Op1) + // .addReg(Op2) + // .addImm(DPUAsmCondition::Small) + // .addMBB(fastMBB) + // // .addMetadata(N) + // ; + + BuildMI(BB, dl, TII.get(DPU::MUL_UL_ULrrr), LLDest) .addReg(Op1) .addReg(Op2) - .addImm(DPUAsmCondition::Small) - .addMBB(fastMBB); - + .addMetadata(N); + BuildMI(BB, dl, TII.get(DPU::JLTUrii)) + .addReg(LLDest) + .addImm(0x100) + .addMBB(fastMBB) + .addMetadata(N); + BuildMI(slowMBB, dl, TII.get(MulHL), HLDest).addReg(Op1).addReg(Op2); BuildMI(slowMBB, dl, TII.get(DPU::LSL_ADDrrri), LSL1Dest) .addReg(LLDest) @@ -2092,19 +2126,35 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, .addMBB(slowMBB); MI.eraseFromParent(); // The pseudo instruction is gone now. 
+ + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction replaced\n"; + dbgs() << "** BB: "; BB->dump(); + dbgs() << "** slowMBB: "; slowMBB->dump(); + dbgs() << "** fastMBB: "; fastMBB->dump(); + dbgs() << "****** \n"; + }); + return fastMBB; } static MachineBasicBlock *EmitSelectWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MI.dump(); + dbgs() << "** BB: "; BB->dump(); + dbgs() << "****** \n"; + }); const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator I = ++BB->getIterator(); MachineFunction *F = BB->getParent(); - MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *falseMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(I, trueMBB); + F->insert(I, falseMBB); F->insert(I, endMBB); // Update machine-CFG edges by transferring all successors of the current // block to the new block which will contain the Phi node for the select. @@ -2112,82 +2162,97 @@ static MachineBasicBlock *EmitSelectWithCustomInserter(MachineInstr &MI, std::next(MachineBasicBlock::iterator(MI)), BB->end()); endMBB->transferSuccessorsAndUpdatePHIs(BB); // Next, add the true and fallthrough blocks as its successors. 
- BB->addSuccessor(trueMBB); + BB->addSuccessor(falseMBB); BB->addSuccessor(endMBB); - trueMBB->addSuccessor(endMBB); + falseMBB->addSuccessor(endMBB); unsigned int Dest = MI.getOperand(0).getReg(); unsigned int CondReg = MI.getOperand(1).getReg(); unsigned int TrueReg = MI.getOperand(2).getReg(); unsigned int FalseReg = MI.getOperand(3).getReg(); - MachineRegisterInfo &RI = F->getRegInfo(); - unsigned FalseResultReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - - BuildMI(BB, dl, TII.get(DPU::ORrrr), FalseResultReg) + BuildMI(BB, dl, TII.get(DPU::JEQrii)) .addReg(CondReg) - .addReg(FalseReg); - - BuildMI(BB, dl, TII.get(DPU::TmpJcci)) - .addImm(ISD::CondCode::SETEQ) - .addReg(CondReg) - .addImm(0) - .addReg(FalseResultReg) + .addImm(1) .addMBB(endMBB); - BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest) - .addReg(TrueReg) - .addMBB(trueMBB) - .addReg(FalseResultReg) - .addMBB(BB); - - MI.eraseFromParent(); // The pseudo instruction is gone now. - return endMBB; -} - -static MachineBasicBlock * -EmitSelect64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { - const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); - DebugLoc dl = MI.getDebugLoc(); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = ++BB->getIterator(); - MachineFunction *F = BB->getParent(); - MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(I, trueMBB); - F->insert(I, endMBB); - // Update machine-CFG edges by transferring all successors of the current - // block to the new block which will contain the Phi node for the select. - endMBB->splice(endMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - endMBB->transferSuccessorsAndUpdatePHIs(BB); - // Next, add the true and fallthrough blocks as its successors. 
- BB->addSuccessor(trueMBB); - BB->addSuccessor(endMBB); - - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int CondReg = MI.getOperand(1).getReg(); - unsigned int TrueReg = MI.getOperand(2).getReg(); - unsigned int FalseReg = MI.getOperand(3).getReg(); - - BuildMI(BB, dl, TII.get(DPU::Jcci)) - .addImm(ISD::CondCode::SETEQ) - .addReg(CondReg) - .addImm(0) + BuildMI(falseMBB, dl, TII.get(DPU::JUMPi)) .addMBB(endMBB); - - trueMBB->addSuccessor(endMBB); - + BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest) - .addReg(TrueReg) - .addMBB(trueMBB) - .addReg(FalseReg) - .addMBB(BB); + .addReg(TrueReg).addMBB(BB) + .addReg(FalseReg).addMBB(falseMBB); MI.eraseFromParent(); // The pseudo instruction is gone now. + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction replaced\n"; + dbgs() << "** BB: "; BB->dump(); + dbgs() << "** falseMBB: "; falseMBB->dump(); + dbgs() << "** endMBB: "; endMBB->dump(); + dbgs() << "****** \n"; + }); return endMBB; } +// static MachineBasicBlock * +// EmitSelect64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { +// LLVM_DEBUG({ +// dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; +// dbgs() << "instruction to replace: "; MI.dump(); +// dbgs() << "** BB: "; BB->dump(); +// dbgs() << "****** \n"; +// }); +// const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); +// DebugLoc dl = MI.getDebugLoc(); +// const BasicBlock *LLVM_BB = BB->getBasicBlock(); +// MachineFunction::iterator I = ++BB->getIterator(); +// MachineFunction *F = BB->getParent(); +// MachineBasicBlock *falseMBB = F->CreateMachineBasicBlock(LLVM_BB); +// MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); +// F->insert(I, falseMBB); +// F->insert(I, endMBB); +// // Update machine-CFG edges by transferring all successors of the current +// // block to the new block which will contain the Phi node for the select. 
+// endMBB->splice(endMBB->begin(), BB, +// std::next(MachineBasicBlock::iterator(MI)), BB->end()); +// endMBB->transferSuccessorsAndUpdatePHIs(BB); +// // Next, add the true and fallthrough blocks as its successors. +// BB->addSuccessor(trueMBB); +// BB->addSuccessor(endMBB); +// falseMBB->addSuccessor(endMBB); + +// unsigned int Dest = MI.getOperand(0).getReg(); +// unsigned int CondReg = MI.getOperand(1).getReg(); +// unsigned int TrueReg = MI.getOperand(2).getReg(); +// unsigned int FalseReg = MI.getOperand(3).getReg(); + +// BuildMI(BB, dl, TII.get(DPU::Jcci)) +// .addImm(ISD::CondCode::SETEQ) +// .addReg(CondReg) +// .addImm(1) +// .addMBB(endMBB); + +// BuildMI(falseBB, dl, TII.get(DPU::Jumpi)) +// .addMBB(endMBB); + +// BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest) +// .addReg(TrueReg).addMBB(BB) +// .addReg(FalseReg).addMBB(falseMBB); + +// MI.eraseFromParent(); // The pseudo instruction is gone now. +// LLVM_DEBUG({ +// dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; +// dbgs() << "instruction replaced\n"; +// dbgs() << "** BB: "; BB->dump(); +// dbgs() << "** falseMBB: "; falseMBB->dump(); +// dbgs() << "** endMBB: "; endMBB->dump(); +// dbgs() << "****** \n"; +// }); +// return endMBB; +// } + static MachineBasicBlock * EmitMramSubStoreWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, unsigned int Mask, unsigned int Store) { @@ -2373,6 +2438,13 @@ EmitMramLoadDoubleWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { static MachineBasicBlock * EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MI.dump(); + dbgs() << "** BB: "; BB->dump(); + dbgs() << "****** \n"; + }); + /* What we want to generate (with dc.h != rb in that example): lslx __R0, da.l, rb, ?sh32 @+4 @@ -2406,9 +2478,10 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, 
MachineBasicBlock *BB) { MachineRegisterInfo &RI = F->getRegInfo(); unsigned LsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); unsigned MsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - + // unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + // unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned ShiftReg_check = RI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned BigShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); unsigned BigShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); @@ -2426,20 +2499,53 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); unsigned Undef2Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg) - .addReg(Op1Reg, 0, DPU::sub_32bit); - - BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg) - .addReg(LsbOp1Reg) - .addReg(ShiftReg) - .addImm(DPUAsmCondition::Condition::Shift32) - .addMBB(bigShiftMBB); - - BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg) - .addReg(Op1Reg, 0, DPU::sub_32bit_hi); + LLVMContext &Context = F->getFunction().getContext(); + MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata")); + + // BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg) + // .addReg(Op1Reg, 0, DPU::sub_32bit); + + // unsigned DummyReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + + /// faulty + // BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg) + // // .addReg(LsbOp1Reg) + // .addReg(Op1Reg, 0, DPU::sub_32bit) + // .addReg(ShiftReg) + // .addImm(DPUAsmCondition::Condition::Shift32) + // .addMBB(bigShiftMBB) + // // .addMetadata(N) + // ; + + /// good, but + // could increase quite a bit the code size + // 
because MachineSinking will sink the lslxrrr to other places + // and we will not be able to merge those three + // though, with shouldSink false for this + // on a few example, I can keep them adjacent + // but I may kill other optimization stuff in other code + // that use it genuinelly + BuildMI(BB, dl, TII.get(DPU::LSLXrrr), LsbToMsbPartReg) + // .addReg(LsbOp1Reg) + .addReg(Op1Reg, 0, DPU::sub_32bit) + .addReg(ShiftReg) + .addMetadata(N); + BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check) + .addReg(ShiftReg) + .addImm(0x20) + .addMetadata(N); + BuildMI(BB, dl, TII.get(DPU::JEQrii)) + .addReg(ShiftReg_check) + .addImm(0x20) + .addMBB(bigShiftMBB) + .addMetadata(N); + + // BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg) + // .addReg(Op1Reg, 0, DPU::sub_32bit_hi); BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), MsbToMsbPartReg) - .addReg(MsbOp1Reg) + // .addReg(MsbOp1Reg) + .addReg(Op1Reg, 0, DPU::sub_32bit_hi) .addReg(ShiftReg); BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftMsbReg) @@ -2447,13 +2553,13 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { .addReg(LsbToMsbPartReg); BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), SmallShiftLsbReg) - .addReg(LsbOp1Reg) + // .addReg(LsbOp1Reg) + .addReg(Op1Reg, 0, DPU::sub_32bit) .addReg(ShiftReg); BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), Undef2Reg); - BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), - SmallShiftResultPart0Reg) + BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), SmallShiftResultPart0Reg) .addReg(Undef2Reg) .addReg(SmallShiftLsbReg) .addImm(DPU::sub_32bit); @@ -2466,7 +2572,8 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { BuildMI(smallShiftMBB, dl, TII.get(DPU::JUMPi)).addMBB(endMBB); BuildMI(bigShiftMBB, dl, TII.get(DPU::LSLrrr), BigShiftMsbReg) - .addReg(LsbOp1Reg) + // .addReg(LsbOp1Reg) + .addReg(Op1Reg, 0, DPU::sub_32bit) .addReg(ShiftReg); BuildMI(bigShiftMBB, dl, 
TII.get(DPU::MOVEri), BigShiftLsbReg).addImm(0); @@ -2495,6 +2602,16 @@ EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { .addMBB(smallShiftMBB); MI.eraseFromParent(); // The pseudo instruction is gone now. + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction replaced\n"; + dbgs() << "** BB: "; BB->dump(); + dbgs() << "** smallShiftMBB: "; smallShiftMBB->dump(); + dbgs() << "** bigShiftMBB: "; bigShiftMBB->dump(); + dbgs() << "** endMBB: "; endMBB->dump(); + dbgs() << "****** \n"; + }); return endMBB; } @@ -2618,6 +2735,13 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB, unsigned int shiftRight, unsigned int shiftRightExtended) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MI.dump(); + dbgs() << "** BB: "; BB->dump(); + dbgs() << "****** \n"; + }); + /* What we want to generate (with dc.l != rb in that example): lsrx __R0, da.h, rb, ?sh32 @+4 @@ -2652,6 +2776,7 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter( unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass); unsigned MsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); unsigned LsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned ShiftReg_check = RI.createVirtualRegister(&DPU::GP_REGRegClass); unsigned SmallShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); unsigned SmallShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); @@ -2661,14 +2786,33 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter( RI.createVirtualRegister(&DPU::GP64_REGRegClass); unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + 
LLVMContext &Context = F->getFunction().getContext(); + MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata")); + BuildMI(BB, dl, TII.get(DPU::COPY), MsbOp1Reg) .addReg(Op1Reg, 0, DPU::sub_32bit_hi); - BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg) - .addReg(MsbOp1Reg) - .addReg(ShiftReg) - .addImm(DPUAsmCondition::Condition::Shift32) - .addMBB(bigShiftMBB); + // BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg) + // .addReg(MsbOp1Reg) + // .addReg(ShiftReg) + // .addImm(DPUAsmCondition::Condition::Shift32) + // .addMBB(bigShiftMBB) + // // .addMetadata(N) + // ; + + BuildMI(BB, dl, TII.get(DPU::LSRXrrr), MsbToLsbPartReg) + .addReg(MsbOp1Reg) + .addReg(ShiftReg) + .addMetadata(N); + BuildMI(BB, dl, TII.get(DPU::ANDrri), ShiftReg_check) + .addReg(ShiftReg) + .addImm(0x20) + .addMetadata(N); + BuildMI(BB, dl, TII.get(DPU::JEQrii)) + .addReg(ShiftReg_check) + .addImm(0x20) + .addMBB(bigShiftMBB) + .addMetadata(N); BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), LsbOp1Reg) .addReg(Op1Reg, 0, DPU::sub_32bit); @@ -2716,6 +2860,17 @@ static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter( .addMBB(smallShiftMBB); MI.eraseFromParent(); // The pseudo instruction is gone now. 
+ + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction replaced\n"; + dbgs() << "** BB: "; BB->dump(); + dbgs() << "** smallShiftMBB: "; smallShiftMBB->dump(); + dbgs() << "** bigShiftMBB: "; bigShiftMBB->dump(); + dbgs() << "** endMBB: "; endMBB->dump(); + dbgs() << "****** \n"; + }); + return endMBB; } @@ -2877,6 +3032,7 @@ EmitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, BuildMI(*BB, MI, dl, TII.get(lsN), Op1MsbShift) .addReg(Op1Msb) .addReg(ShiftReg); + // should be checked BuildMI(*BB, MI, dl, TII.get(lsNJump), Op1LsbShift) .addReg(Op1Lsb) .addReg(ShiftReg) @@ -3063,6 +3219,12 @@ EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MI.dump(); + dbgs() << "** BB: "; BB->dump(); + dbgs() << "****** \n"; + }); /* What we want to generate (with dc != da in that example): clz.u dc, da.h ?nmax @+3 @@ -3094,120 +3256,607 @@ static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI, MachineRegisterInfo &RI = F->getRegInfo(); unsigned FastResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); unsigned SlowResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SlowResultPart1Reg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SlowResultPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + + // unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + // unsigned SlowResultPart1Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + // unsigned SlowResultPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - BuildMI(BB, dl, 
TII.get(DPU::CLZ_Urrci), FastResultReg) - .addReg(Op1Reg, 0, DPU::sub_32bit_hi) - .addImm(DPUAsmCondition::Condition::NotMaximum) - .addMBB(endMBB); + unsigned SlowResultReg_step = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + + unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned LsbAddReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + + LLVMContext &Context = F->getFunction().getContext(); + MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata")); + + // BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg) + // .addReg(Op1Reg, 0, DPU::sub_32bit_hi) + // .addImm(DPUAsmCondition::Condition::NotMaximum) + // .addMBB(endMBB) + // // .addMetadata(N) + // ; + + BuildMI(BB, dl, TII.get(DPU::CLZ_Urr), FastResultReg) + .addReg(Op1Reg, 0, DPU::sub_32bit_hi) + .addMetadata(N) + ; + BuildMI(BB, dl, TII.get(DPU::JNEQrii)) + .addReg(FastResultReg, 0, DPU::sub_32bit) + .addImm(32) + .addMBB(endMBB) + .addMetadata(N) + ; BuildMI(msbAreZerosMBB, dl, TII.get(DPU::CLZrr), LsbClzReg) .addReg(Op1Reg, 0, DPU::sub_32bit); - BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), SlowResultPartReg) - .addReg(LsbClzReg) - .addImm(32); + // This + // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), SlowResultPartReg) + // .addReg(LsbClzReg) + // .addImm(32); - BuildMI(msbAreZerosMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg); + // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg); - BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg) - .addReg(UndefReg) - .addReg(SlowResultPartReg) - .addImm(DPU::sub_32bit); + // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg) + // .addReg(UndefReg) + // .addReg(SlowResultPartReg) + // .addImm(DPU::sub_32bit); + + // BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg) + // .addReg(SlowResultPart1Reg) + // .addReg(FastResultReg, 0, DPU::sub_32bit_hi) + // .addImm(DPU::sub_32bit_hi); + // or + 
BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), LsbAddReg) + .addReg(LsbClzReg) + .addImm(32); + + BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg_step) + .addReg(SlowResultReg_step, RegState::Undef) + .addReg(LsbAddReg) + .addImm(DPU::sub_32bit); + BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg) - .addReg(SlowResultPart1Reg) + .addReg(SlowResultReg_step) .addReg(FastResultReg, 0, DPU::sub_32bit_hi) .addImm(DPU::sub_32bit_hi); - + BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest) .addReg(FastResultReg) .addMBB(BB) .addReg(SlowResultReg) .addMBB(msbAreZerosMBB); + MI.eraseFromParent(); // The pseudo instruction is gone now. + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction replaced\n"; + dbgs() << "** BB: "; BB->dump(); + dbgs() << "** msbAreZerosMBB: "; msbAreZerosMBB->dump(); + dbgs() << "** endMBB: "; endMBB->dump(); + dbgs() << "****** \n"; + }); + return endMBB; } -static MachineBasicBlock *EmitSeqreadGet(MachineInstr &MI, - MachineBasicBlock *BB, bool IsIncCst) { - const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); - DebugLoc dl = MI.getDebugLoc(); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = ++BB->getIterator(); - MachineFunction *F = BB->getParent(); - MachineBasicBlock *slowMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *fastMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(I, slowMBB); - F->insert(I, fastMBB); - // Update machine-CFG edges by transferring all successors of the current - // block to the new block which will contain the Phi node for the select. - fastMBB->splice(fastMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - fastMBB->transferSuccessorsAndUpdatePHIs(BB); - // Next, add the true and fallthrough blocks as its successors. 
- BB->addSuccessor(slowMBB); - BB->addSuccessor(fastMBB); - slowMBB->addSuccessor(fastMBB); +// static MachineBasicBlock *EmitSeqreadGet(MachineInstr &MI, +// MachineBasicBlock *BB, bool IsIncCst) { +// LLVM_DEBUG({ +// dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; +// dbgs() << "instruction to replace: "; MI.dump(); +// dbgs() << "IsIncCst: " << IsIncCst << "\n"; +// dbgs() << "** BB: "; BB->dump(); +// dbgs() << "****** \n"; +// }); + +// const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); +// DebugLoc dl = MI.getDebugLoc(); +// const BasicBlock *LLVM_BB = BB->getBasicBlock(); +// MachineFunction::iterator I = ++BB->getIterator(); +// MachineFunction *F = BB->getParent(); +// MachineBasicBlock *slowMBB = F->CreateMachineBasicBlock(LLVM_BB); +// MachineBasicBlock *fastMBB = F->CreateMachineBasicBlock(LLVM_BB); +// F->insert(I, slowMBB); +// F->insert(I, fastMBB); +// // Update machine-CFG edges by transferring all successors of the current +// // block to the new block which will contain the Phi node for the select. +// fastMBB->splice(fastMBB->begin(), BB, +// std::next(MachineBasicBlock::iterator(MI)), BB->end()); +// fastMBB->transferSuccessorsAndUpdatePHIs(BB); +// // Next, add the true and fallthrough blocks as its successors. 
+// BB->addSuccessor(slowMBB); +// BB->addSuccessor(fastMBB); +// slowMBB->addSuccessor(fastMBB); + +// unsigned int Dest = MI.getOperand(0).getReg(); +// unsigned int PtrInit = MI.getOperand(1).getReg(); +// unsigned int Reader = MI.getOperand(3).getReg(); +// unsigned int Cond = MI.getOperand(4).getImm(); +// unsigned int PageSize = MI.getOperand(5).getImm(); + +// MachineRegisterInfo &RI = F->getRegInfo(); +// unsigned int PtrIncremented = RI.createVirtualRegister(&DPU::GP_REGRegClass); + +// if (IsIncCst) { +// BuildMI(BB, dl, TII.get(DPU::ADDrrici), PtrIncremented) +// .addReg(PtrInit) +// .addImm(MI.getOperand(2).getImm()) +// .addImm(Cond) +// .addMBB(fastMBB); +// } else { +// BuildMI(BB, dl, TII.get(DPU::ADDrrrci), PtrIncremented) +// .addReg(PtrInit) +// .addReg(MI.getOperand(2).getReg()) +// .addImm(Cond) +// .addMBB(fastMBB); +// } + +// unsigned int WramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass); +// unsigned int MramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass); +// unsigned int MramCacheUpdated = +// RI.createVirtualRegister(&DPU::GP_REGRegClass); +// unsigned int PtrUpdated = RI.createVirtualRegister(&DPU::GP_REGRegClass); +// BuildMI(slowMBB, dl, TII.get(DPU::LWrri), MramCache).addReg(Reader).addImm(4); +// BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), MramCacheUpdated) +// .addReg(MramCache) +// .addImm(PageSize); +// BuildMI(slowMBB, dl, TII.get(DPU::SWrir)) +// .addReg(Reader) +// .addImm(4) +// .addReg(MramCacheUpdated); +// BuildMI(slowMBB, dl, TII.get(DPU::LWrri), WramCache).addReg(Reader).addImm(0); +// BuildMI(slowMBB, dl, TII.get(DPU::LDMArri)) +// .addReg(WramCache) +// .addReg(MramCacheUpdated) +// .addImm(FormatDMASize(PageSize * 2)); +// BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), PtrUpdated) +// .addReg(PtrIncremented) +// .addImm(-PageSize); + +// BuildMI(*fastMBB, fastMBB->begin(), dl, TII.get(TargetOpcode::PHI), Dest) +// .addReg(PtrIncremented) +// .addMBB(BB) +// .addReg(PtrUpdated) +// .addMBB(slowMBB); + +// 
MI.eraseFromParent(); // The pseudo instruction is gone now. + +// LLVM_DEBUG({ +// dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; +// dbgs() << "instruction replaced\n"; +// dbgs() << "** BB: "; BB->dump(); +// dbgs() << "** slowMBB: "; slowMBB->dump(); +// dbgs() << "** fastMBB: "; fastMBB->dump(); +// dbgs() << "****** \n"; +// }); + +// return fastMBB; +// } + +static MachineBasicBlock *EmitAlu64BitRRWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB, + unsigned LsbOpcode, + unsigned MsbOpcode) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MI.dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int PtrInit = MI.getOperand(1).getReg(); - unsigned int Reader = MI.getOperand(3).getReg(); - unsigned int Cond = MI.getOperand(4).getImm(); - unsigned int PageSize = MI.getOperand(5).getImm(); + + const DebugLoc &DL = MI.getDebugLoc(); + MachineFunction &MF = *MBB->getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + LLVMContext &Context = MF.getFunction().getContext(); + MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata")); + + // Get the virtual registers + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned LHSReg = MI.getOperand(1).getReg(); + unsigned RHSReg = MI.getOperand(2).getReg(); + + // Create new virtual registers for the lower and upper halves + unsigned LHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned LHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned RHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned RHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned Dst_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + + // Split 
the 64-bit operands into 32-bit halves + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Lo).addReg(LHSReg, 0, DPU::sub_32bit); + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Hi).addReg(LHSReg, 0, DPU::sub_32bit_hi); + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Lo).addReg(RHSReg, 0, DPU::sub_32bit); + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Hi).addReg(RHSReg, 0, DPU::sub_32bit_hi); + + // Perform the lower 32-bit subtraction + MachineInstrBuilder MIBLsb = BuildMI(*MBB, MI, DL, TII.get(LsbOpcode), Dst_Lo) + .addReg(LHS_Lo) + .addReg(RHS_Lo) + .addMetadata(N) + ; + + // Perform the upper 32-bit subtraction with carry + MachineInstrBuilder MIBMsb = BuildMI(*MBB, MI, DL, TII.get(MsbOpcode), Dst_Hi) + .addReg(LHS_Hi) + .addReg(RHS_Hi) + .addMetadata(N) + ; + + // Combine the result into the 64-bit destination register + unsigned Dstp0 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + unsigned Dstp1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + unsigned UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg); + + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp0) + .addReg(UndefReg) + .addReg(Dst_Lo) + .addImm(DPU::sub_32bit); + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp1) + .addReg(Dstp0) + .addReg(Dst_Hi) + .addImm(DPU::sub_32bit_hi); - MachineRegisterInfo &RI = F->getRegInfo(); - unsigned int PtrIncremented = RI.createVirtualRegister(&DPU::GP_REGRegClass); - - if (IsIncCst) { - BuildMI(BB, dl, TII.get(DPU::ADDrrici), PtrIncremented) - .addReg(PtrInit) - .addImm(MI.getOperand(2).getImm()) - .addImm(Cond) - .addMBB(fastMBB); - } else { - BuildMI(BB, dl, TII.get(DPU::ADDrrrci), PtrIncremented) - .addReg(PtrInit) - .addReg(MI.getOperand(2).getReg()) - .addImm(Cond) - .addMBB(fastMBB); + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), DstReg).addReg(Dstp1); + + for (unsigned i = 1; i < 3; i++) { + if (MI.getOperand(i).isKill()) { + MIBLsb->getOperand(i).setIsKill(); + 
MIBMsb->getOperand(i).setIsKill(); + } } + + // Remove the pseudo instruction + MI.eraseFromParent(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction replaced\n"; + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); - unsigned int WramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int MramCache = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int MramCacheUpdated = - RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int PtrUpdated = RI.createVirtualRegister(&DPU::GP_REGRegClass); - BuildMI(slowMBB, dl, TII.get(DPU::LWrri), MramCache).addReg(Reader).addImm(4); - BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), MramCacheUpdated) - .addReg(MramCache) - .addImm(PageSize); - BuildMI(slowMBB, dl, TII.get(DPU::SWrir)) - .addReg(Reader) - .addImm(4) - .addReg(MramCacheUpdated); - BuildMI(slowMBB, dl, TII.get(DPU::LWrri), WramCache).addReg(Reader).addImm(0); - BuildMI(slowMBB, dl, TII.get(DPU::LDMArri)) - .addReg(WramCache) - .addReg(MramCacheUpdated) - .addImm(FormatDMASize(PageSize * 2)); - BuildMI(slowMBB, dl, TII.get(DPU::ADDrri), PtrUpdated) - .addReg(PtrIncremented) - .addImm(-PageSize); + return MBB; +} - BuildMI(*fastMBB, fastMBB->begin(), dl, TII.get(TargetOpcode::PHI), Dest) - .addReg(PtrIncremented) - .addMBB(BB) - .addReg(PtrUpdated) - .addMBB(slowMBB); +static MachineBasicBlock *EmitAlu64BitRIWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB, + unsigned LsbOpcode, + unsigned MsbOpcode) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MI.dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); - MI.eraseFromParent(); // The pseudo instruction is gone now. 
- return fastMBB; + const DebugLoc &DL = MI.getDebugLoc(); + const TargetInstrInfo &TII = *MBB->getParent()->getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineFunction &MF = *MBB->getParent(); + + LLVMContext &Context = MF.getFunction().getContext(); + MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata")); + + // Get the virtual registers + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned LHSReg = MI.getOperand(1).getReg(); + int64_t RHSImm = MI.getOperand(2).getImm(); + + // Create new virtual registers for the lower and upper halves + unsigned LHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned LHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned Dst_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + + // Split the 64-bit operands into 32-bit halves + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Lo).addReg(LHSReg, 0, DPU::sub_32bit); + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Hi).addReg(LHSReg, 0, DPU::sub_32bit_hi); + + int64_t RHSImmLo = RHSImm & 0xFFFFFFFFl; + int64_t RHSImmHi = (RHSImm >> 32) & 0xFFFFFFFFl; + + // // what if value is zero??? 
+ // // probably optimizable :) + // switch (RHSImmLo) { + // case 0: + // case 1: + // case 0xffffffff: + // case 0x80000000: + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "RHSImmLo = " << RHSImmLo << " could be optimized\n"; + // }); + // } + + // switch (RHSImmHi) { + // case 0: + // case 1: + // case 0xffffffff: + // case 0x80000000: + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "RHSImmHi = " << RHSImmHi << " could be optimized\n"; + // }); + // } + + // Perform the lower 32-bit subtraction + MachineInstrBuilder MIBLsb = BuildMI(*MBB, MI, DL, TII.get(LsbOpcode), Dst_Lo) + .addReg(LHS_Lo) + .addImm(RHSImmLo) + .addMetadata(N); + + // Perform the upper 32-bit subtraction with carry + MachineInstrBuilder MIBMsb = BuildMI(*MBB, MI, DL, TII.get(MsbOpcode), Dst_Hi) + .addReg(LHS_Hi) + .addImm(RHSImmHi) + .addMetadata(N); + + // Combine the result into the 64-bit destination register + unsigned Dstp0 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + unsigned Dstp1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + unsigned UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg); + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp0) + .addReg(UndefReg) + .addReg(Dst_Lo) + .addImm(DPU::sub_32bit); + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp1) + .addReg(Dstp0) + .addReg(Dst_Hi) + .addImm(DPU::sub_32bit_hi); + + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), DstReg).addReg(Dstp1); + + if (MI.getOperand(1).isKill()) { + MIBLsb->getOperand(1).setIsKill(); + MIBMsb->getOperand(1).setIsKill(); + } + + // Remove the pseudo instruction + MI.eraseFromParent(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction replaced\n"; + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + return MBB; +} + 
+static MachineBasicBlock *EmitMove64RiWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MI.dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + const DebugLoc &DL = MI.getDebugLoc(); + const TargetInstrInfo &TII = *MBB->getParent()->getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + // Get the virtual registers + unsigned DstReg = MI.getOperand(0).getReg(); + int64_t RHSImm = MI.getOperand(1).getImm(); + + // Create new virtual registers for the lower and upper halves + unsigned Dst_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + + int64_t RHSImmLo = RHSImm & 0xFFFFFFFFl; + int64_t RHSImmHi = (RHSImm >> 32) & 0xFFFFFFFFl; + + // // what if value is zero??? + // // probably optimizable :) + switch (RHSImmLo) { + case 0: + case 1: + case 0xffffffff: + case 0x80000000: + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "RHSImmLo = " << RHSImmLo << " could be optimized\n"; + }); + } + + switch (RHSImmHi) { + case 0: + case 1: + case 0xffffffff: + case 0x80000000: + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "RHSImmHi = " << RHSImmHi << " could be optimized\n"; + }); + } + + // Perform the lower 32-bit subtraction + MachineInstrBuilder MIBLsb; + // switch (RHSImmLo) { + // default: { + MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::MOVEri), Dst_Lo).addImm(RHSImmLo); + // break; + // } + // case 0: { + // MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::ZERO); + // break; + // } + // case 1: { + // MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::ONE); + // break; + // } + // case 0xffffffff: { + // MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), 
Dst_Lo).addReg(DPU::LNEG); + // break; + // } + // case 0x80000000: { + // MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Lo).addReg(DPU::MNEG); + // break; + // } + // } + + // Perform the upper 32-bit subtraction with carry + MachineInstrBuilder MIBMsb; + // switch (RHSImmHi) { + // default: { + MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::MOVEri), Dst_Hi).addImm(RHSImmHi); + // break; + // } + // case 0: { + // MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::ZERO); + // break; + // } + // case 1: { + // MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::ONE); + // break; + // } + // case 0xffffffff: { + // MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::LNEG); + // break; + // } + // case 0x80000000: { + // MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Dst_Hi).addReg(DPU::MNEG); + // break; + // } + // } + + // Combine the result into the 64-bit destination register + unsigned Dstp0 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + unsigned Dstp1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + unsigned UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg); + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp0) + .addReg(UndefReg) + .addReg(Dst_Lo) + .addImm(DPU::sub_32bit); + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dstp1) + .addReg(Dstp0) + .addReg(Dst_Hi) + .addImm(DPU::sub_32bit_hi); + + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), DstReg).addReg(Dstp1); + + // Remove the pseudo instruction + MI.eraseFromParent(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction replaced\n"; + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + return MBB; +} + +static DPUAsmCondition::Condition +findSelect64SetConditionFor(DPUAsmCondition::Condition cond) { + switch (cond) { + default: + llvm_unreachable("invalid condition"); + 
case DPUAsmCondition::Condition::Zero: + case DPUAsmCondition::Condition::Equal: + return DPUAsmCondition::Condition::ExtendedZero; + case DPUAsmCondition::Condition::NotZero: + case DPUAsmCondition::Condition::NotEqual: + return DPUAsmCondition::Condition::ExtendedNotZero; + case DPUAsmCondition::Condition::GreaterThanSigned: + return DPUAsmCondition::Condition::ExtendedGreaterThanSigned; + case DPUAsmCondition::Condition::GreaterOrEqualSigned: + return DPUAsmCondition::Condition::GreaterOrEqualSigned; + case DPUAsmCondition::Condition::LessThanSigned: + return DPUAsmCondition::Condition::LessThanSigned; + case DPUAsmCondition::Condition::LessOrEqualSigned: + return DPUAsmCondition::Condition::ExtendedLessOrEqualSigned; + case DPUAsmCondition::Condition::GreaterThanUnsigned: + return DPUAsmCondition::Condition::ExtendedGreaterThanUnsigned; + case DPUAsmCondition::Condition::GreaterOrEqualUnsigned: + return DPUAsmCondition::Condition::GreaterOrEqualUnsigned; + case DPUAsmCondition::Condition::LessThanUnsigned: + return DPUAsmCondition::Condition::LessThanUnsigned; + case DPUAsmCondition::Condition::LessOrEqualUnsigned: + return DPUAsmCondition::Condition::ExtendedLessOrEqualUnsigned; + } +} + +static MachineBasicBlock *EmitSetCC64WithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MI.dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + const DebugLoc &DL = MI.getDebugLoc(); + const TargetInstrInfo &TII = *MBB->getParent()->getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineFunction &MF = *MBB->getParent(); + + LLVMContext &Context = MF.getFunction().getContext(); + MDNode *N = MDNode::get(Context, MDString::get(Context, "MySpecialMetadata")); + + // Get the virtual registers + unsigned DstReg = MI.getOperand(0).getReg(); + auto ImmCond = 
static_cast(MI.getOperand(1).getImm()); + unsigned LHSReg = MI.getOperand(2).getReg(); + unsigned RHSReg = MI.getOperand(3).getReg(); + + unsigned LHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned LHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned RHS_Lo = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + unsigned RHS_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + // unsigned Dst_Hi = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Lo).addReg(LHSReg, 0, DPU::sub_32bit); + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), LHS_Hi).addReg(LHSReg, 0, DPU::sub_32bit_hi); + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Lo).addReg(RHSReg, 0, DPU::sub_32bit); + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), RHS_Hi).addReg(RHSReg, 0, DPU::sub_32bit_hi); + + DPUAsmCondition::Condition SetCondition = + findSelect64SetConditionFor(ImmCond); + + MachineInstrBuilder MIBLsb = BuildMI(*MBB, MI, DL, TII.get(DPU::SUBzrr)) + .addReg(DPU::ZERO) + .addReg(LHS_Lo) + .addReg(RHS_Lo) + .addMetadata(N); + MachineInstrBuilder MIBMsb = BuildMI(*MBB, MI, DL, TII.get(DPU::SUBCrrrc), DstReg) + .addReg(LHS_Hi) + .addReg(RHS_Hi) + .addImm(SetCondition) + .addMetadata(N); + + for (unsigned i = 2; i < 4; i++) { + if (MI.getOperand(i).isKill()) { + MIBLsb->getOperand(i - 1).setIsKill(); + MIBMsb->getOperand(i - 1).setIsKill(); + } + } + + // Remove the pseudo instruction + MI.eraseFromParent(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction replaced\n"; + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "****** \n"; + }); + + return MBB; } MachineBasicBlock * @@ -3215,27 +3864,39 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { default: + MI.print(errs()); llvm_unreachable("Unexpected instr type to insert"); - case DPU::SEQREAD_GET: - return EmitSeqreadGet(MI, BB, false); - case 
DPU::SEQREAD_GET_CST: - return EmitSeqreadGet(MI, BB, true); + // case DPU::SEQREAD_GET: + // return EmitSeqreadGet(MI, BB, false); + // case DPU::SEQREAD_GET_CST: + // return EmitSeqreadGet(MI, BB, true); case DPU::Mul16UUrr: + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Mul16UUrr\n"; + }); return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci, DPU::MUL_UH_ULrrr, DPU::MUL_UH_ULrrr, DPU::MUL_UH_UHrrr); case DPU::Mul16SUrr: + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Mul16SUrr\n"; + }); return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci, DPU::MUL_SH_ULrrr, DPU::MUL_UH_ULrrr, DPU::MUL_SH_UHrrr); case DPU::Mul16SSrr: + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << " Mul16SSrr\n"; + }); return EmitMul16WithCustomInserter(MI, BB, DPU::MUL_UL_ULrrrci, DPU::MUL_SH_ULrrr, DPU::MUL_SH_ULrrr, DPU::MUL_SH_SHrrr); case DPU::SELECTrr: return EmitSelectWithCustomInserter(MI, BB); case DPU::SELECT64rr: - return EmitSelect64WithCustomInserter(MI, BB); + // return EmitSelect64WithCustomInserter(MI, BB); + return EmitSelectWithCustomInserter(MI, BB); + case DPU::MRAM_STORE_BYTErm: return EmitMramSubStoreWithCustomInserter(MI, BB, 7, DPU::SBrir); case DPU::MRAM_STORE_HALFrm: @@ -3275,6 +3936,7 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return EmitMramSubLoadWithCustomInserter(MI, BB, 4, DPU::LW_Srri); case DPU::MRAM_LOAD_DOUBLEmr: return EmitMramLoadDoubleWithCustomInserter(MI, BB); + case DPU::LSL64rr: return EmitLsl64RegisterWithCustomInserter(MI, BB); case DPU::LSL64ri: @@ -3305,5 +3967,42 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, DPU::LSR_ADDrrri); case DPU::CLZ64r: return EmitClz64WithCustomInserter(MI, BB); + + // RR + // case DPU::ADD64rr: + // return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::ADDrrr, DPU::ADDCrrr); + + // case DPU::AND64rr: + // return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::ANDrrr, 
DPU::ANDrrr); + + // case DPU::OR64rr: + // return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::ORrrr, DPU::ORrrr); + + // case DPU::SUB64rr: + // return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::SUBrrr, DPU::SUBCrrr); + + // case DPU::XOR64rr: + // return EmitAlu64BitRRWithCustomInserter(MI, BB, DPU::XORrrr, DPU::XORrrr); + + // // RI + // case DPU::ADD64ri: + // return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::ADDrri, DPU::ADDCrri); + + // case DPU::AND64ri: + // return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::ANDrri, DPU::ANDrri); + + // case DPU::OR64ri: + // return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::ORrri, DPU::ORrri); + + // case DPU::XOR64ri: + // return EmitAlu64BitRIWithCustomInserter(MI, BB, DPU::XORrri, DPU::XORrri); + + + case DPU::MOVE64ri: + return EmitMove64RiWithCustomInserter(MI, BB); + + case DPU::SET64cc: + return EmitSetCC64WithCustomInserter(MI, BB); + } } diff --git a/llvm/lib/Target/DPU/DPUTargetMachine.cpp b/llvm/lib/Target/DPU/DPUTargetMachine.cpp index 5815b161c6ce9..734bc0d541ff3 100644 --- a/llvm/lib/Target/DPU/DPUTargetMachine.cpp +++ b/llvm/lib/Target/DPU/DPUTargetMachine.cpp @@ -54,6 +54,7 @@ DPUTargetMachine::DPUTargetMachine(const Target &T, const Triple &TT, getEffectiveCodeModel(CM, CodeModel::Small), OL), TLOF(std::make_unique()), Subtarget(TT, CPU, FS, *this) { + // setRequiresStructuredCFG(true); initAsmInfo(); } @@ -84,6 +85,7 @@ class DPUPassConfig : public TargetPassConfig { bool addInstSelector() override; + void addPostRegAlloc() override; void addPreEmitPass() override; void addPreEmitPass2() override; }; @@ -103,6 +105,11 @@ bool DPUPassConfig::addInstSelector() { return false; } +void DPUPassConfig::addPostRegAlloc() { + DPUTargetMachine &TM = getDPUTargetMachine(); + addPass(createDPUPostRAFusionPass(TM)); +} + void DPUPassConfig::addPreEmitPass() { DPUTargetMachine &TM = getDPUTargetMachine(); addPass(createDPUMergeComboInstrPass(TM)); diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp 
b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 8bd3036f1fc34..336a990a046ca 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -367,13 +367,19 @@ struct ScopedSaveAliaseesAndUsed { } ~ScopedSaveAliaseesAndUsed() { + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); appendToUsed(M, std::vector(Used.begin(), Used.end())); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); appendToCompilerUsed(M, std::vector(CompilerUsed.begin(), CompilerUsed.end())); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); - for (auto P : FunctionAliases) + for (auto P : FunctionAliases) { + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); P.first->setIndirectSymbol( ConstantExpr::getBitCast(P.second, P.first->getType())); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); + } } }; diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index ef9f18a2289e9..26ced977d52fc 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -75,38 +75,52 @@ static void appendToUsedList(Module &M, StringRef Name, ArrayRef GlobalVariable *GV = M.getGlobalVariable(Name); SmallPtrSet InitAsSet; SmallVector Init; + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); if (GV) { + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); auto *CA = cast(GV->getInitializer()); for (auto &Op : CA->operands()) { + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); Constant *C = cast_or_null(Op); if (InitAsSet.insert(C).second) Init.push_back(C); } GV->eraseFromParent(); } - + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext()); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); for (auto *V : 
Values) { - Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); + // V->dump(); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); + // Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy); + Constant *C = ConstantExpr::getPointerBitCastOrAddrSpaceCast(V, Int8PtrTy); if (InitAsSet.insert(C).second) Init.push_back(C); } - + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); if (Init.empty()) return; - + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size()); GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, ConstantArray::get(ATy, Init), Name); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); GV->setSection("llvm.metadata"); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); } void llvm::appendToUsed(Module &M, ArrayRef Values) { + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); appendToUsedList(M, "llvm.used", Values); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); } void llvm::appendToCompilerUsed(Module &M, ArrayRef Values) { + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); appendToUsedList(M, "llvm.compiler.used", Values); + // LLVM_DEBUG({ dbgs() << __FILE__ << __LINE__ << __func__ << "\n"; }); } FunctionCallee