diff --git a/compiler-rt/dpu/CMakeLists.txt b/compiler-rt/dpu/CMakeLists.txt
new file mode 100644
index 0000000000000..3a81885225652
--- /dev/null
+++ b/compiler-rt/dpu/CMakeLists.txt
@@ -0,0 +1,313 @@
+cmake_minimum_required(VERSION 3.13)
+
+project(librt C ASM)
+
+set(CMAKE_AR llvm-ar)
+set(CMAKE_LINKER llvm-ld)
+set(CMAKE_NM llvm-nm)
+set(CMAKE_OBJDUMP llvm-objdump)
+set(CMAKE_RANLIB llvm-ranlib)
+set(OBJCOPY llvm-objcopy)
+set(CLANGFORMAT clang-format)
+
+set(COMPILER_RT_BUILTINS_DIR ../lib/builtins)
+
+set(GENERIC_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/mul32.S
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/mulsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/muldi3.c
+
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.S
+  # ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv32.c optimized above
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/div32.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/divsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/modsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/umodsi3.c
+
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udiv64.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/divdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/moddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/udivdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/dpu/umoddi3.c
+
+  ${COMPILER_RT_BUILTINS_DIR}/absvdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/absvsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/adddf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashldi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashrdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/bswapdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/bswapsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/cmpdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparedf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparesf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/divdf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmoddi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendsfdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendhfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffsdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffssi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatdidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatdisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatundidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatundisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/fp_mode.c
+  ${COMPILER_RT_BUILTINS_DIR}/int_util.c
+  ${COMPILER_RT_BUILTINS_DIR}/lshrdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/moddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/modsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulodi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulosi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/negdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/paritydi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/paritysi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountsi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powidf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powisf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/subdf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subsf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncdfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncdfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/truncsfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ucmpdi2.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivdi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmoddi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmodsi4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivsi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umoddi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umodsi3.c
+  )
+
+set(GENERIC_TF_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/addtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/addvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/absvti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ashrti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/comparetf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/clzti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/cmpti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ctzti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/divtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divmodti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/divti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/extenddftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendhftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/extendsftf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ffsti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixdfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixsfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixtfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunsdfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunssfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfdi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfsi.c
+  ${COMPILER_RT_BUILTINS_DIR}/fixunstfti.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatditf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatsitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floattitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunditf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatunsitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntidf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntisf.c
+  ${COMPILER_RT_BUILTINS_DIR}/floatuntitf.c
+  ${COMPILER_RT_BUILTINS_DIR}/lshrti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/modti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muloti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/multf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/multi3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/negti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/negvti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/popcountti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/powitf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/subtf3.c
+  ${COMPILER_RT_BUILTINS_DIR}/subvti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfdf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfhf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/trunctfsf2.c
+  ${COMPILER_RT_BUILTINS_DIR}/ucmpti2.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivmodti4.c
+  ${COMPILER_RT_BUILTINS_DIR}/udivti3.c
+  ${COMPILER_RT_BUILTINS_DIR}/umodti3.c
+  )
+
+set(GENERIC_COMPLEX_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/divdc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsc3.c
+  )
+
+set(GENERIC_COMPLEX_TF_SOURCES
+  ${COMPILER_RT_BUILTINS_DIR}/divdc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/divtc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/muldc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/mulsc3.c
+  ${COMPILER_RT_BUILTINS_DIR}/multc3.c
+  )
+
+set(SOURCES ${GENERIC_SOURCES}
+  # ${GENERIC_TF_SOURCES}
+  # ${GENERIC_COMPLEX_SOURCES}
+  # ${GENERIC_COMPLEX_TF_SOURCES}
+  )
+
+# add_dpu_library(TARGET <name> [OPT_LEVEL <-Ox>] [LTO <-flto[=thin]>]
+#                 [PROFILING] SOURCES <file>...)
+#
+# Declares and installs one static-library variant of the builtins. The CMake
+# target is named <name>_<opt>_<lto>[_pg], e.g. rt_O2_ltothin, or rt_O2_ (with a
+# trailing underscore) when LTO is not requested. The archive is installed
+# under <opt>/<lto>, or under <opt>/no_lto when LTO is not requested.
+function(add_dpu_library)
+  set(options PROFILING)
+  set(oneValueArgs TARGET OPT_LEVEL LTO)
+  set(multiValueArgs SOURCES)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  message("ARGN: ${ARGN}")
+
+  message("options: ${options}")
+  message("oneValueArgs: ${oneValueArgs}")
+  message("multiValueArgs: ${multiValueArgs}")
+
+  message("TARGET: ${arg_TARGET}")
+  message("OPT_LEVEL: ${arg_OPT_LEVEL}")
+  message("PROFILING: ${arg_PROFILING}")
+  message("LTO: ${arg_LTO}")
+  message("UNPARSED: ${arg_UNPARSED_ARGUMENTS}")
+
+  set(LOCAL_TARGET ${arg_TARGET})
+
+  set(OTHER_FLAGS)
+  list(APPEND OTHER_FLAGS -Wall)
+  list(APPEND OTHER_FLAGS -Wextra)
+
+  if (arg_OPT_LEVEL)
+    list(APPEND OTHER_FLAGS ${arg_OPT_LEVEL})
+    string(REPLACE "-" "" arg_OPT_LEVEL ${arg_OPT_LEVEL})
+    string(APPEND LOCAL_TARGET "_${arg_OPT_LEVEL}")
+  endif()
+  if (arg_LTO)
+    list(APPEND OTHER_FLAGS ${arg_LTO})
+    string(REPLACE "-f" "" arg_LTO ${arg_LTO})
+    string(REPLACE "=" "" arg_LTO ${arg_LTO})
+    string(APPEND LOCAL_TARGET "_${arg_LTO}")
+  else()
+    string(APPEND LOCAL_TARGET "_")
+  endif()
+  if (arg_PROFILING)
+    list(APPEND OTHER_FLAGS -pg)
+    string(APPEND LOCAL_TARGET "_pg")
+  endif()
+
+  list(APPEND OTHER_FLAGS -g0)
+  list(APPEND OTHER_FLAGS -mllvm -verify-machineinstrs)
+  # list(APPEND OTHER_FLAGS -mllvm -debug) --> deduped
+
+  message("LOCAL_TARGET: ${LOCAL_TARGET}")
+  message("OTHER_FLAGS: ${OTHER_FLAGS}")
+
+  add_library(${LOCAL_TARGET} STATIC "${arg_SOURCES}")
+
+  target_include_directories(${LOCAL_TARGET} PRIVATE
+    ${COMPILER_RT_BUILTINS_DIR}
+    ${COMPILER_RT_BUILTINS_DIR}/dpu)
+
+  target_compile_options(${LOCAL_TARGET}
+    PRIVATE ${NOSTDLIB_FLAGS} ${STRICT_FLAGS} ${COMPILER_TIMESTAMP_DEF} ${OTHER_FLAGS})
+
+  # set_target_properties(${LOCAL_TARGET} PROPERTIES OUTPUT_NAME "rt")
+
+  if (arg_LTO)
+    install(
+      TARGETS ${LOCAL_TARGET}
+      ARCHIVE
+      DESTINATION ${arg_OPT_LEVEL}/${arg_LTO}
+      )
+  else()
+    install(
+      TARGETS ${LOCAL_TARGET}
+      ARCHIVE
+      DESTINATION ${arg_OPT_LEVEL}/no_lto
+      )
+  endif()
+endfunction()
+
+# add_dpu_library(
+#   TARGET rt
+#   OPT_LEVEL -O3
+#   # LTO -flto
+#   # PROFILING
+#   SOURCES ${SOURCES}
+#   )
+
+foreach(OPT_LEVEL -O0;-O1;-O2;-O3;-Os)
+  add_dpu_library(
+    TARGET rt
+    OPT_LEVEL ${OPT_LEVEL}
+    SOURCES ${SOURCES}
+    )
+  # add_dpu_library(
+  #   TARGET rt
+  #   OPT_LEVEL ${OPT_LEVEL}
+  #   PROFILING
+  #   SOURCES ${SOURCES}
+  #   )
+  foreach(LTO -flto;-flto=thin)
+    add_dpu_library(
+      TARGET rt
+      OPT_LEVEL ${OPT_LEVEL}
+      LTO ${LTO}
+      SOURCES ${SOURCES}
+      )
+    # add_dpu_library(
+    #   TARGET rt
+    #   OPT_LEVEL ${OPT_LEVEL}
+    #   LTO ${LTO}
+    #   PROFILING
+    #   SOURCES ${SOURCES}
+    #   )
+  endforeach()
+endforeach()
diff --git a/compiler-rt/dpu/Toolchain.cmake b/compiler-rt/dpu/Toolchain.cmake
new file mode 100644
index 0000000000000..ae09a95e9b705
--- /dev/null
+++ b/compiler-rt/dpu/Toolchain.cmake
@@ -0,0 +1,12 @@
+include(CMakeForceCompiler)
+
+# set(CMAKE_ASM_SOURCE_FILE_EXTENSIONS s;S;asm)
+
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_CROSSCOMPILING 1)
+set(CMAKE_ASM_COMPILER dpu-clang)
+set(CMAKE_C_COMPILER dpu-clang)
+set(CMAKE_CXX_COMPILER dpu-clang)
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+set(CMAKE_C_COMPILER_WORKS 1)
+set(CMAKE_CXX_COMPILER_WORKS 1)
diff --git a/compiler-rt/dpu/compiler_rt_tests.sh b/compiler-rt/dpu/compiler_rt_tests.sh
new file mode 100644
index 0000000000000..83912da93aec8
--- /dev/null
+++ b/compiler-rt/dpu/compiler_rt_tests.sh
@@ -0,0 +1,269 @@
+#!/bin/bash
+
+set -eux
+
+LLVM_SOURCE=${LLVM_SOURCE:-~/work/dpu_tools_llvm_cleanup_20240710_2/llvm-project}
+COMPILER_RT=${LLVM_SOURCE}/compiler-rt/lib/builtins
+COMPILER_RT_TESTS=${LLVM_SOURCE}/compiler-rt/test/builtins/Unit
+
+COMPILER_RT_BUILD=$(pwd)
+
+# not supported
+# declare -a TESTS_=(
+    # absvti2_test.c
+    # adddf3vfp_test.c
+    # addsf3vfp_test.c
+    # addtf3_test.c
+    # addvti3_test.c
+    # ashlti3_test.c
+    # ashrti3_test.c
+    # clzti2_test.c
+    # cmpti2_test.c
+    # compiler_rt_logb_test.c
+    # compiler_rt_logbf_test.c
+    # compiler_rt_logbl_test.c
+    # ctzti2_test.c
+    # divdc3_test.c
+    # divdf3vfp_test.c
+    # divmodti4_test.c
+    # divsf3vfp_test.c
+    # divsc3_test.c
+    # divtc3_test.c
+    # divtf3_test.c
+    # divti3_test.c
+    # divxc3_test.c
+    # eqdf2vfp_test.c
+    # eqsf2vfp_test.c
+    # eqtf2_test.c
+    # extenddftf2_test.c
+    # extendhftf2_test.c
+    # extendsfdf2vfp_test.c
+    # extendsftf2_test.c
+    # ffsti2_test.c
+    # fixdfsivfp_test.c
+    # fixdfti_test.c
+    # fixsfsivfp_test.c
+    # fixsfti_test.c
+    # fixtfti_test.c
+    # fixunsdfsivfp_test.c
+    # fixunsdfti_test.c
+    # fixunssfsivfp_test.c
+    # fixunssfti_test.c
+    # floatditf_test.c
+    # floatsidfvfp_test.c
+    # 
floatsisfvfp_test.c
+    # floatunditf_test.c
+    # floatunssidfvfp_test.c
+    # floatunssisfvfp_test.c
+    # muldc3_test.c
+    # ltdf2vfp_test.c
+    # ltsf2vfp_test.c
+    # gedf2vfp_test.c
+    # gesf2vfp_test.c
+    # gtdf2vfp_test.c
+    # gtsf2vfp_test.c
+    # ledf2vfp_test.c
+    # lesf2vfp_test.c
+    # muldf3vfp_test.c
+    # mulsf3vfp_test.c
+    # nedf2vfp_test.c
+    # negdf2vfp_test.c
+    # negsf2vfp_test.c
+    # nesf2vfp_test.c
+    # subdf3vfp_test.c
+    # subsf3vfp_test.c
+    # truncdfsf2vfp_test.c
+    # unorddf2vfp_test.c
+    # unordsf2vfp_test.c
+    # mulsc3_test.c
+    # mulxc3_test.c
+    # powixf2_test.c
+    # subvti3_test.c
+    # ucmpti2_test.c
+    # udivmodti4_test.c
+    # udivti3_test.c
+    # umodti3_test.c
+    # subtf3_test.c
+    # powitf2_test.c
+    # negvti2_test.c
+    # modti3_test.c
+    # muloti4_test.c
+    # multc3_test.c
+    # multi3_test.c
+    # mulvti3_test.c
+    # negti2_test.c
+    # netf2_test.c
+    # parityti2_test.c
+    # popcountti2_test.c
+    # fixtfdi_test.c
+    # fixtfsi_test.c
+    # fixunstfdi_test.c
+    # fixunstfsi_test.c
+    # fixunstfti_test.c
+    # fixunsxfdi_test.c
+    # fixunsxfsi_test.c
+    # fixunsxfti_test.c
+    # fixxfti_test.c
+    # floatdixf_test.c
+    # floatsitf_test.c
+    # floattidf_test.c
+    # floattisf_test.c
+    # floattitf_test.c
+    # floattixf_test.c
+    # floatundixf_test.c
+    # floatunsitf_test.c
+    # floatuntidf_test.c
+    # floatuntisf_test.c
+    # floatuntitf_test.c
+    # floatuntixf_test.c
+    # getf2_test.c
+    # gttf2_test.c
+    # letf2_test.c
+    # lshrti3_test.c
+    # lttf2_test.c
+    # multf3_test.c
+    # unordtf2_test.c
+    # trunctfdf2_test.c
+    # trunctfhf2_test.c
+    # trunctfsf2_test.c
+    # fixxfdi_test.c
+    # udivmoddi4_test.c # too big :)
+# )
+
+declare -a TESTS=(
+    # test.c
+    absvdi2_test.c
+    absvsi2_test.c
+    addvdi3_test.c
+    addvsi3_test.c
+    ashldi3_test.c
+    ashrdi3_test.c
+    bswapdi2_test.c
+    bswapsi2_test.c
+    clzdi2_test.c
+    clzsi2_test.c
+    cmpdi2_test.c
+    comparedf2_test.c
+    comparesf2_test.c
+    ctzdi2_test.c
+    ctzsi2_test.c
+    divdf3_test.c
+    divdi3_test.c
+    divmodsi4_test.c
+    divsf3_test.c
+    divsi3_test.c
+    extendhfsf2_test.c
+    ffsdi2_test.c
+    ffssi2_test.c
+    fixdfdi_test.c
+    fixsfdi_test.c
+    fixunsdfdi_test.c
+    fixunsdfsi_test.c
+    fixunssfdi_test.c
+    fixunssfsi_test.c
+    floatdidf_test.c
+    floatdisf_test.c
+    floatundidf_test.c
+    floatundisf_test.c
+    lshrdi3_test.c
+    moddi3_test.c
+    modsi3_test.c
+    muldi3_test.c
+    mulodi4_test.c
+    mulosi4_test.c
+    mulsi3_test.c
+    mulvdi3_test.c
+    mulvsi3_test.c
+    negdi2_test.c
+    negvdi2_test.c
+    negvsi2_test.c
+    paritydi2_test.c
+    paritysi2_test.c
+    popcountdi2_test.c
+    popcountsi2_test.c
+    powidf2_test.c
+    powisf2_test.c
+    subvdi3_test.c
+    subvsi3_test.c
+    truncdfhf2_test.c
+    truncdfsf2_test.c
+    truncsfhf2_test.c
+    ucmpdi2_test.c
+    udivdi3_test.c
+    udivmodsi4_test.c
+    udivsi3_test.c
+    umoddi3_test.c
+    umodsi3_test.c
+)
+
+declare -a OPT_LEVELS=(
+    O0
+    O1
+    O2
+    O3
+    Os
+)
+
+declare -a COMPILER_OPTIONS=(
+    no_lto
+    lto
+    ltothin
+)
+
+MYPWD=$(pwd)
+
+mkdir -p test
+cd test
+
+for COMPILER_OPTION in "${COMPILER_OPTIONS[@]}"
+do
+    mkdir -p "${COMPILER_OPTION}"
+    cd "${COMPILER_OPTION}"
+
+    case "$COMPILER_OPTION" in
+        "no_lto") COMPILER_OPTION_="";;
+        "lto") COMPILER_OPTION_="-flto";;
+        "ltothin") COMPILER_OPTION_="-flto=thin";;
+    esac
+
+    case "$COMPILER_OPTION" in
+        "no_lto") COMPILER_OPTION_LIB="";;
+        "lto") COMPILER_OPTION_LIB="lto";;
+        "ltothin") COMPILER_OPTION_LIB="ltothin";;
+    esac
+
+    for OPT_LEVEL in "${OPT_LEVELS[@]}"
+    do
+        mkdir -p "${OPT_LEVEL}"
+        cd "${OPT_LEVEL}"
+
+        for TEST in "${TESTS[@]}"
+        do
+            clang --target=dpu-upmem-dpurte -mcpu=v1A \
+                  -I"${COMPILER_RT}" \
+                  -g0 \
+                  -v \
+                  -save-temps \
+                  -I "${MYPWD}" \
+                  ${COMPILER_OPTION_} \
+                  "-${OPT_LEVEL}" \
+                  "${COMPILER_RT_TESTS}/${TEST}" \
+                  -o "$(basename "${TEST}" .c)" \
+                  -L "${COMPILER_RT_BUILD}" "-lrt_${OPT_LEVEL}_${COMPILER_OPTION_LIB}" \
+                  -mllvm -debug -mllvm -print-after-all -mllvm -verify-machineinstrs \
+                  &> "$(basename "${TEST}")_compiler_log.txt"
+
+            # dpu-lldb --batch --one-line run -- $(basename "${TEST}" .c)
+            python3 "${LLVM_SOURCE}/compiler-rt/dpu/lldb_python.py" "$(basename "${TEST}" .c)"
+        done
+        cd 
..
+    done
+
+    cd ..
+done
+cd ..
+
+    # -L ~/scratch/dpu_tools/share/upmem/include/built-in/v1A -lrt_v1A \
+    # -save-temps \
+    # -mllvm -debug -mllvm -print-after-all -mllvm -verify-machineinstrs \
+    # --thinlto-jobs=1
diff --git a/compiler-rt/dpu/lldb_python.py b/compiler-rt/dpu/lldb_python.py
new file mode 100644
index 0000000000000..e333723af601e
--- /dev/null
+++ b/compiler-rt/dpu/lldb_python.py
@@ -0,0 +1,42 @@
+"""Run a DPU test binary under LLDB, print its stdout and exit with its status."""
+import sys
+import os
+import subprocess
+import dpu
+import lldb
+import tempfile
+
+binary = sys.argv[1]
+
+# SBDebugger.Create() is a static factory; no throw-away instance is needed.
+debugger = lldb.SBDebugger.Create()
+debugger.SetAsync(False)
+
+target = debugger.CreateTarget(binary)
+assert target.IsValid(), "cannot create an LLDB target for " + binary
+
+launch_info = lldb.SBLaunchInfo(None)
+launch_info.SetWorkingDirectory(os.getcwd())
+
+# The inferior's stdout (fd 1) is redirected to a temporary file.
+with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+    stdout_path = tmp_file.name
+
+try:
+    launch_info.AddOpenFileAction(1, stdout_path, False, True)
+    error = lldb.SBError()
+    process = target.Launch(launch_info, error)
+    # Keep the SBError so that a failed launch reports why it failed.
+    assert error.Success() and process.IsValid(), error.GetCString()
+    with open(stdout_path, 'r') as file:
+        stdout_data = file.read()
+finally:
+    # Remove the capture file even when the launch or the read fails.
+    os.remove(stdout_path)
+
+print(stdout_data)
+
+# Cleanup LLDB
+# lldb.SBDebugger.Terminate()
+# Exit with the inferior's exit status.
+sys.exit(process.GetExitStatus())
diff --git a/compiler-rt/lib/builtins/dpu/div32.c b/compiler-rt/lib/builtins/dpu/div32.c
new file mode 100644
index 0000000000000..df25bbbdaf9d4
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/div32.c
@@ -0,0 +1,97 @@
+/* Copyright 2024 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+
+extern uint64_t __udiv32(uint32_t dividend, uint32_t divider);
+
+/* Signed 32-bit division: stores the quotient in *p_q and the remainder in *p_rem. */
+void
+__div32(int32_t dividend, int32_t divider
+        , int32_t *p_q, int32_t *p_rem
+        )
+{
+    uint64_t res;
+    uint32_t q;
+    uint32_t rem;
+
+    __asm__ goto("clo zero, %[dividend], z, %l[__div32_pos_dividend]\n\t"
+                 "clo zero, %[divider], z, %l[__div32_neg_dividend_pos_divider]\n\t"
+                 :
+                 : [dividend] "r"(dividend), [divider] "r"(divider)
+                 :
+                 : __div32_pos_dividend, __div32_neg_dividend_pos_divider);
+
+    /* The signs of the quotient and of the remainder depend on the signs of the dividend and of */
+    /* the divider. After a few tries, branching on the four cases looked like the quickest way. */
+
+    /* __div32_neg_dividend_neg_divider: */
+    /* As a result, the quotient is positive and the remainder negative */
+    /* Operands are negated as uint32_t: "0 - x" on an int32_t is undefined behavior */
+    /* for INT32_MIN, while the unsigned subtraction wraps to the expected magnitude. */
+    res = __udiv32(0u - (uint32_t)dividend, 0u - (uint32_t)divider);
+    q = (uint32_t)(res >> 32);
+    rem = (uint32_t)res;
+    rem = 0 - rem;
+    goto recombine;
+
+    /* *p_q = q; */
+    /* *p_rem = rem; */
+    /* return; */
+
+
+__div32_neg_dividend_pos_divider:
+    /* As a result, the quotient is negative and the remainder negative */
+    /* The dividend is negated as uint32_t (see the first case). */
+    res = __udiv32(0u - (uint32_t)dividend, (uint32_t)divider);
+    q = (uint32_t)(res >> 32);
+    q = 0 - q;
+    rem = (uint32_t)res;
+    rem = 0 - rem;
+    goto recombine;
+    /* *p_q = q; */
+    /* *p_rem = rem; */
+    /* return; */
+
+__div32_pos_dividend:
+    __asm__ goto("clo zero, %[divider], z, %l[__div32_pos_dividend_pos_divider]"
+                 :
+                 : [divider] "r"(divider)
+                 :
+                 : __div32_pos_dividend_pos_divider);
+    /* As a result, the quotient is negative and the remainder positive */
+    /* The divider is negated as uint32_t (see the first case). */
+    res = __udiv32((uint32_t)dividend, 0u - (uint32_t)divider);
+    q = (uint32_t)(res >> 32);
+    q = 0 - q;
+    rem = (uint32_t)res;
+    goto recombine;
+    /* *p_q = q; */
+    /* *p_rem = rem; */
+    /* return; */
+
+__div32_pos_dividend_pos_divider:
+    /* The dividend and divider are both positive */
+    res = __udiv32(dividend, divider);
+    /* goto last_exit; */
+    q = (uint32_t) (res >> 32);
+    rem = (uint32_t) res;
+    /* goto recombine; */
+
+    /* *p_q = q; */
+    /* *p_rem = rem; */
+
+/* recombine: */
+/* res = q; */
+/* res <<= 32; */
+/* res |= rem; */
+/* last_exit: */
+/* return res; */
+
+ recombine:
+    *p_q = q;
+    *p_rem = rem;
+    return;
+}
diff --git a/compiler-rt/lib/builtins/dpu/divdi3.c b/compiler-rt/lib/builtins/dpu/divdi3.c
new file mode 100644
index 0000000000000..178cbf35fd2ee
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/divdi3.c
@@ -0,0 +1,31 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 signed division.
+ * Negative operands are negated as uint64_t so that INT64_MIN does not overflow.
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+
+extern uint64_t __udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+int64_t
+__divdi3(int64_t dividend, int64_t divider)
+{
+    if (dividend >= 0) {
+        if (divider >= 0) {
+            return __udiv64(dividend, divider, 0);
+        } else {
+            return -__udiv64(dividend, -(uint64_t)divider, 0);
+        }
+    } else if (divider >= 0) {
+        // Negative dividend, positive divider
+        return -__udiv64(-(uint64_t)dividend, divider, 0);
+    } else {
+        // Negative dividend, negative divider
+        return __udiv64(-(uint64_t)dividend, -(uint64_t)divider, 0);
+    }
+}
diff --git a/compiler-rt/lib/builtins/dpu/divsi3.c b/compiler-rt/lib/builtins/dpu/divsi3.c
new file mode 100644
index 0000000000000..8ec97468aaf83
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/divsi3.c
@@ -0,0 +1,23 @@
+/* Copyright 2024 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <stdint.h>
+
+/* Stores the quotient in *q and the remainder in *rem. */
+extern void __div32(int32_t dividend, int32_t divider, int32_t *q, int32_t *rem);
+
+#include "int_lib.h"
+
+COMPILER_RT_ABI si_int
+__divsi3(si_int a, si_int b)
+{
+    /* Returns: a / b */
+    /* __div32 also produces the remainder, which is not needed here. */
+
+    int32_t q;
+    int32_t rem;
+    __div32(a, b, &q, &rem);
+    return q;
+}
diff --git a/compiler-rt/lib/builtins/dpu/moddi3.c b/compiler-rt/lib/builtins/dpu/moddi3.c
new file mode 100644
index 0000000000000..dad11e699f87c
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/moddi3.c
@@ -0,0 +1,31 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 signed modulo (remainder); the result takes the sign of the dividend.
+ * Negative operands are negated as uint64_t so that INT64_MIN does not overflow.
+ * This is the actual libcall implementation, as requested by the compiler.
+ */
+#include <stdint.h>
+extern uint64_t
+__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder);
+
+int64_t
+__moddi3(int64_t dividend, int64_t divider)
+{
+    if (dividend >= 0) {
+        if (divider >= 0) {
+            return __udiv64(dividend, divider, 1);
+        } else {
+            return __udiv64(dividend, -(uint64_t)divider, 1);
+        }
+    } else if (divider >= 0) {
+        // Negative dividend, positive divider
+        return -__udiv64(-(uint64_t)dividend, divider, 1);
+    } else {
+        // Negative dividend, negative divider
+        return -__udiv64(-(uint64_t)dividend, -(uint64_t)divider, 1);
+    }
+}
diff --git a/compiler-rt/lib/builtins/dpu/modsi3.c b/compiler-rt/lib/builtins/dpu/modsi3.c
new file mode 100644
index 0000000000000..c0cc59e8c92f9
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/modsi3.c
@@ -0,0 +1,34 @@
+/* ===-- modsi3.c - Implement __modsi3 -------------------------------------===
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is dual licensed under the MIT and the University of Illinois Open
+ * Source Licenses. See LICENSE_LLVM.TXT for details.
+ *
+ * ===----------------------------------------------------------------------===
+ *
+ * This file implements __modsi3 for the compiler_rt library.
+ *
+ * ===----------------------------------------------------------------------===
+ */
+
+#include <stdint.h>
+
+/* Stores the quotient in *q and the remainder in *rem. */
+extern void __div32(int32_t dividend, int32_t divider, int32_t *q, int32_t *rem);
+
+#include "int_lib.h"
+
+/* Returns: a % b */
+
+COMPILER_RT_ABI si_int
+__modsi3(si_int a, si_int b)
+{
+    /* __div32 also produces the quotient, which is not needed here; */
+    /* only the remainder is returned. */
+
+    int32_t q;
+    int32_t rem;
+    __div32(a, b, &q, &rem);
+    return rem;
+}
diff --git a/compiler-rt/lib/builtins/dpu/mul32.S b/compiler-rt/lib/builtins/dpu/mul32.S
new file mode 100644
index 0000000000000..fe735ab5b328f
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mul32.S
@@ -0,0 +1,48 @@
+    .text
+    .globl __mul32
+    .type __mul32,@function
+__mul32:
+    jgtu r1, r0, .Ltmp0 // r1 > r0 (unsigned): r0 is already the smaller operand
+    move r2, r0 // otherwise r2 takes r0 ...
+    move r0, r1, true, .Ltmp1 // ... and r0 takes r1, so r0 <= r2 in both paths
+.Ltmp0:
+    move r2, r1
+    // move r0, r0
+.Ltmp1:
+    move r1, zero // r1 is returned at the end; presumably the accumulator half of d0
+    mul_step d0, r2, d0, 0, z, .Ltmp2 // NOTE(review): z presumably exits early once no multiplier bit is left - confirm
+    mul_step d0, r2, d0, 1, z, .Ltmp2
+    mul_step d0, r2, d0, 2, z, .Ltmp2
+    mul_step d0, r2, d0, 3, z, .Ltmp2
+    mul_step d0, r2, d0, 4, z, .Ltmp2
+    mul_step d0, r2, d0, 5, z, .Ltmp2
+    mul_step d0, r2, d0, 6, z, .Ltmp2
+    mul_step d0, r2, d0, 7, z, .Ltmp2
+    mul_step d0, r2, d0, 8, z, .Ltmp2
+    mul_step d0, r2, d0, 9, z, .Ltmp2
+    mul_step d0, r2, d0, 10, z, .Ltmp2
+    mul_step d0, r2, d0, 11, z, .Ltmp2
+    mul_step d0, r2, d0, 12, z, .Ltmp2
+    mul_step d0, r2, d0, 13, z, .Ltmp2
+    mul_step d0, r2, d0, 14, z, .Ltmp2
+    mul_step d0, r2, d0, 15, z, .Ltmp2
+    mul_step d0, r2, d0, 16, z, .Ltmp2
+    mul_step d0, r2, d0, 17, z, .Ltmp2
+    mul_step d0, r2, d0, 18, z, .Ltmp2
+    mul_step d0, r2, d0, 19, z, .Ltmp2
+    mul_step d0, r2, d0, 20, z, .Ltmp2
+    mul_step d0, r2, d0, 21, z, .Ltmp2
+    mul_step d0, r2, d0, 22, z, .Ltmp2
+    mul_step d0, r2, d0, 23, z, .Ltmp2
+    mul_step d0, r2, d0, 24, z, .Ltmp2
+    mul_step d0, r2, d0, 
25, z, .Ltmp2
+    mul_step d0, r2, d0, 26, z, .Ltmp2
+    mul_step d0, r2, d0, 27, z, .Ltmp2
+    mul_step d0, r2, d0, 28, z, .Ltmp2
+    mul_step d0, r2, d0, 29, z, .Ltmp2
+    mul_step d0, r2, d0, 30, z, .Ltmp2
+    mul_step d0, r2, d0, 31, z, .Ltmp2
+.Ltmp2:
+    move r0, r1
+
+    jump r23
diff --git a/compiler-rt/lib/builtins/dpu/mul32.c b/compiler-rt/lib/builtins/dpu/mul32.c
new file mode 100644
index 0000000000000..cc6be09b64847
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mul32.c
@@ -0,0 +1,59 @@
+#include <stdint.h>
+
+int32_t __mulsi3(int32_t a, int32_t b)
+{
+    int32_t dest;
+
+    int32_t temp0;
+    uint64_t temp1;
+
+#error "mul32.c is not working yet: temp1.hi/temp1.lo is not yet supported"
+    /* The inline asm below addresses the two halves of temp1 as %[temp1.hi] and %[temp1.lo]. */
+
+    __asm__ volatile(" jgtu %[b], %[a], 1f\n"
+                     " move %[temp0], %[a]\n"
+                     " move %[temp1.hi], %[b], true, 2f\n"
+                     "1:\n"
+                     " move %[temp0], %[b]\n"
+                     " move %[temp1.hi], %[a]\n"
+                     "2:\n"
+                     " move r1, zero\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 0 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 1 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 2 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 3 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 4 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 5 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 6 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 7 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 8 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 9 , z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 10, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 11, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 12, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 13, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 14, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 15, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 16, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 17, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 18, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 19, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 20, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 21, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 22, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 23, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 24, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 25, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 26, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 27, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 28, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 29, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 30, z, 3f\n"
+                     " mul_step %[temp1], %[temp0], %[temp1], 31, z, 3f\n"
+                     "3:\n"
+                     " move %[dest], %[temp1.lo]\n"
+                     : [dest] "=&r"(dest), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1)
+                     : [a]"r"(a), [b]"r"(b)
+                     : );
+    return dest;
+}
diff --git a/compiler-rt/lib/builtins/dpu/muldi3.c b/compiler-rt/lib/builtins/dpu/muldi3.c
new file mode 100644
index 0000000000000..2d5a28b1dc260
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/muldi3.c
@@ -0,0 +1,171 @@
+/* Copyright 2020 UPMEM. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/*
+ * 64x64 multiplication emulation.
+ *
+ * A relatively fast emulation of 64x64 multiplication using byte multipliers.
+ * Basically, the two operands X and Y are seen as byte polynomials:
+ *  - X = X0.2^0 + X1.2^8 + X2.2^16 + X3.2^24 + X4.2^32 + X5.2^40 + X6.2^48 + X7.2^56
+ *  - Y = Y0.2^0 + Y1.2^8 + Y2.2^16 + Y3.2^24 + Y4.2^32 + Y5.2^40 + Y6.2^48 + Y7.2^56
+ *
+ * The product Z is expressed as a similar polynomial. Since the result is 64 bits,
+ * the function drops any coefficient for a power greater than 56, hence the following
+ * formula:
+ *   Z = (X0.Y0).2^0
+ *     + (X0.Y1 + X1.Y0).2^8
+ *     + (X0.Y2 + X2.Y0 + X1.Y1).2^16
+ *     + (X0.Y3 + X1.Y2 + X2.Y1 + X3.Y0).2^24
+ *     + (X0.Y4 + X1.Y3 + X2.Y2 + X3.Y1 + X4.Y0).2^32
+ *     etc.
+ *
+ * Each individual product is computed with the native built-in 8x8 instructions.
+ * Resulting processing time is in the magnitude of 150 instructions.
+ *
+ * The two operands are found in __D0 and the first kernel nano-stack entry.
+ * The result goes into __R0 (lsbits) and __R1 (msbits).
+ * Also, __R2 contains the return address register, instead of __RET__.
+ */
+#include <stdint.h>
+
+static inline __attribute__((always_inline)) uint16_t /* byte 0 of a times byte 0 of b */
+_mul00(uint32_t a, uint32_t b)
+{
+#ifndef DPU
+    return (a & 0xff) * (b & 0xff);
+#else
+    uint32_t r;
+    __asm__ volatile("mul_ul_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return r;
+#endif
+}
+
+static inline __attribute__((always_inline)) uint16_t /* byte 0 of a times byte 1 of b */
+_mul01(uint32_t a, uint32_t b)
+{
+#ifndef DPU
+    return (a & 0xff) * ((b >> 8) & 0xff);
+#else
+    uint32_t r;
+    __asm__ volatile("mul_ul_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return r;
+#endif
+}
+
+#define _mul02(a, b) _mul00((a), ((b) >> 16))
+#define _mul03(a, b) _mul01((a), ((b) >> 16))
+
+static inline __attribute__((always_inline)) uint16_t /* byte 1 of a times byte 1 of b */
+_mul11(uint32_t a, uint32_t b)
+{
+#ifndef DPU
+    return ((a >> 8) & 0xff) * ((b >> 8) & 0xff);
+#else
+    uint32_t r;
+    __asm__ volatile("mul_uh_uh %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(b) :);
+    return r;
+#endif
+}
+
+static inline __attribute__((always_inline)) uint16_t /* byte 1 of a times byte 2 of b */
+_mul12(uint32_t a, uint32_t b)
+{
+#ifndef DPU
+    return ((a >> 8) & 0xff) * ((b >> 16) & 0xff);
+#else
+    uint32_t r = (b >> 16);
+    __asm__ volatile("mul_uh_ul %[rc_wr32], %[ra_r32], %[rb_wr32]" : [rc_wr32] "=r"(r) : [ra_r32] "r"(a), [rb_wr32] "r"(r) :);
+    return r;
+#endif
+}
+
+#define _mul13(a, b) _mul11((a), ((b) >> 16))
+#define _mul22(a, b) _mul00(((a) >> 16), ((b) >> 16))
+#define _mul23(a, b) _mul01(((a) >> 16), ((b) >> 16))
+#define _mul33(a, b) _mul11(((a) >> 16), ((b) >> 16))
+
+#define mulx0y0(xl, yl) _mul00(xl, yl)
+#define mulx0y1(xl, yl) _mul01(xl, yl)
+#define mulx0y2(xl, yl) _mul02(xl, yl)
+#define mulx0y3(xl, yl) _mul03(xl, yl)
+#define mulx0y4(xl, yh) _mul00(xl, yh)
+#define mulx0y5(xl, yh) _mul01(xl, yh)
+#define mulx0y6(xl, yh) _mul02(xl, yh)
+#define mulx0y7(xl, yh) _mul03(xl, yh)
+
+#define mulx1y1(xl, yl) _mul11(xl, yl)
+#define mulx1y2(xl, yl) _mul12(xl, yl)
+#define mulx1y3(xl, yl) _mul13(xl, yl)
+#define mulx1y4(xl, yh) _mul01(yh, xl)
+#define mulx1y5(xl, yh) _mul11(xl, yh)
+#define mulx1y6(xl, yh) _mul12(xl, yh)
+
+#define mulx2y2(xl, yl) _mul22(xl, yl)
+#define mulx2y3(xl, yl) _mul23(xl, yl)
+#define mulx2y4(xl, yh) _mul02(yh, xl)
+#define mulx2y5(xl, yh) _mul12(yh, xl)
+
+#define mulx3y3(xl, yl) _mul33(xl, yl)
+#define mulx3y4(xl, yh) _mul03(yh, xl)
+
+// Symmetry: mulxIyJ(x, y) is obtained from mulxJyI with the operands swapped.
+#define mulx1y0(xl, yl) mulx0y1(yl, xl)
+#define mulx2y0(xl, yl) mulx0y2(yl, xl)
+#define mulx2y1(xl, yl) mulx1y2(yl, xl)
+#define mulx3y0(xl, yl) mulx0y3(yl, xl)
+#define mulx3y1(xl, yl) mulx1y3(yl, xl)
+#define mulx3y2(xl, yl) mulx2y3(yl, xl)
+#define mulx4y0(xh, yl) mulx0y4(yl, xh)
+#define mulx4y1(xh, yl) mulx1y4(yl, xh)
+#define mulx4y2(xh, yl) mulx2y4(yl, xh)
+#define mulx4y3(xh, yl) mulx3y4(yl, xh)
+#define mulx5y0(xh, yl) mulx0y5(yl, xh)
+#define mulx5y1(xh, yl) mulx1y5(yl, xh)
+#define mulx5y2(xh, yl) mulx2y5(yl, xh)
+#define mulx6y0(xh, yl) mulx0y6(yl, xh)
+#define mulx6y1(xh, yl) mulx1y6(yl, xh)
+#define mulx7y0(xh, yl) mulx0y7(yl, xh)
+
+uint64_t
+__muldi3(uint64_t x, uint64_t y)
+{
+    uint32_t xl = x;
+    uint32_t xh = ((uint64_t)x >> 32);
+    uint32_t yl = y;
+    uint32_t yh = ((uint64_t)y >> 32);
+
+    // Each fragment of the product.
+    uint32_t p0, p1, p2, p3, p4, p5, p6, p7, rh;
+    uint64_t rl; // 64-bit accumulator: carries out of the low 32 bits are kept
+
+    p0 = mulx0y0(xl, yl);
+    rl = (uint64_t)p0;
+
+    p1 = mulx0y1(xl, yl) + mulx1y0(xl, yl);
+    rl += ((uint64_t)p1 << 8);
+
+    p2 = mulx0y2(xl, yl) + mulx2y0(xl, yl) + mulx1y1(xl, yl);
+    rl += ((uint64_t)p2 << 16);
+
+    p3 = mulx0y3(xl, yl) + mulx3y0(xl, yl) + mulx1y2(xl, yl) + mulx2y1(xl, yl);
+    rl += ((uint64_t)p3 << 24);
+
+    p4 = mulx0y4(xl, yh) + mulx4y0(xh, yl) + mulx1y3(xl, yl) + mulx3y1(xl, yl) + mulx2y2(xl, yl);
+    rh = p4; // coefficients of 2^32 and above only contribute to the upper word
+
+    p5 = (mulx0y5(xl, yh) + mulx5y0(xh, yl) + mulx1y4(xl, yh) + mulx4y1(xh, yl)
+        + mulx2y3(xl, yl) + mulx3y2(xl, yl));
+    rh += p5 << 8;
+
+    p6 = (mulx0y6(xl, yh) + mulx6y0(xh, yl) + mulx1y5(xl, yh) + mulx5y1(xh, yl)
+        + mulx2y4(xl, yh) + mulx4y2(xh, yl) + mulx3y3(xl, yl));
+    rh += p6 << 16;
+
+    p7 = (mulx0y7(xl, yh) + mulx7y0(xh, yl) + mulx1y6(xl, yh) + mulx6y1(xh, yl)
+        + mulx2y5(xl, yh) + mulx5y2(xh, yl) + mulx3y4(xl, yh) + mulx4y3(xh, yl));
+    rh += p7 << 24;
+
+    return rl + (((uint64_t)rh) << 32);
+}
diff --git a/compiler-rt/lib/builtins/dpu/mulsi3.c b/compiler-rt/lib/builtins/dpu/mulsi3.c
new file mode 100644
index 0000000000000..f41210acd79cd
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/mulsi3.c
@@ -0,0 +1,8 @@
+#include <stdint.h>
+
+extern int32_t __mul32(int32_t a, int32_t b);
+
+int32_t __mulsi3(int32_t a, int32_t b)
+{
+    return __mul32(a, b);
+}
diff --git a/compiler-rt/lib/builtins/dpu/udiv32.S b/compiler-rt/lib/builtins/dpu/udiv32.S
new file mode 100644
index 0000000000000..8298d37dd8a0e
--- /dev/null
+++ b/compiler-rt/lib/builtins/dpu/udiv32.S
@@ -0,0 +1,49 @@
+    .text
+    .globl __udiv32
+    .type __udiv32,@function
+__udiv32:
+    clz r2, r1, max, 1f // r2 = by how many the divider can be shifted on 32-bit
+    clz r3, r0 // r3 = number of useless bits of the dividend
+    sub r2, r3, r2, gtu, 2f// r2 = the maximal shift to be done
+    move r3, r1
+    move.u d0, r0
+    jump r2, 3f // As we will jump backward relatively to label 3 forward
+    div_step d0, r3, d0, 31
+    div_step d0, r3, d0, 30
+    
div_step d0, r3, d0, 29 + div_step d0, r3, d0, 28 + div_step d0, r3, d0, 27 + div_step d0, r3, d0, 26 + div_step d0, r3, d0, 25 + div_step d0, r3, d0, 24 + div_step d0, r3, d0, 23 + div_step d0, r3, d0, 22 + div_step d0, r3, d0, 21 + div_step d0, r3, d0, 20 + div_step d0, r3, d0, 19 + div_step d0, r3, d0, 18 + div_step d0, r3, d0, 17 + div_step d0, r3, d0, 16 + div_step d0, r3, d0, 15 + div_step d0, r3, d0, 14 + div_step d0, r3, d0, 13 + div_step d0, r3, d0, 12 + div_step d0, r3, d0, 11 + div_step d0, r3, d0, 10 + div_step d0, r3, d0, 9 + div_step d0, r3, d0, 8 + div_step d0, r3, d0, 7 + div_step d0, r3, d0, 6 + div_step d0, r3, d0, 5 + div_step d0, r3, d0, 4 + div_step d0, r3, d0, 3 + div_step d0, r3, d0, 2 + div_step d0, r3, d0, 1 +3: + div_step d0, r3, d0, 0 +4: + jump r23 +2: + move.u d0, r0, true, 4b +1: + fault 2 diff --git a/compiler-rt/lib/builtins/dpu/udiv32.c b/compiler-rt/lib/builtins/dpu/udiv32.c new file mode 100644 index 0000000000000..22f617e14fd71 --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udiv32.c @@ -0,0 +1,63 @@ +#include + +uint64_t +__udiv32(uint32_t dividend, uint32_t divider) +{ + uint64_t dest; + + uint32_t temp0; + uint32_t temp1; + + /* clang-format off */ + __asm__ volatile(" clz %[temp0], %[divider], max, 1f\n" // %[temp0] = by how many the divider can be shifted on 32-bit + " clz %[temp1], %[dividend]\n" // %[temp1] = number of useless bits of the dividend + " sub %[temp0], %[temp1], %[temp0], gtu, 2f\n" // %[temp0] = the maximal shift to be done + " move %[temp1], %[divider]\n" + " move.u %[dest], %[dividend]\n" + " jump %[temp0], 3f\n" // As we will jump backward relatively to label 3 forward + " div_step %[dest], %[temp1], %[dest], 31\n" + " div_step %[dest], %[temp1], %[dest], 30\n" + " div_step %[dest], %[temp1], %[dest], 29\n" + " div_step %[dest], %[temp1], %[dest], 28\n" + " div_step %[dest], %[temp1], %[dest], 27\n" + " div_step %[dest], %[temp1], %[dest], 26\n" + " div_step %[dest], %[temp1], %[dest], 25\n" + " div_step 
%[dest], %[temp1], %[dest], 24\n" + " div_step %[dest], %[temp1], %[dest], 23\n" + " div_step %[dest], %[temp1], %[dest], 22\n" + " div_step %[dest], %[temp1], %[dest], 21\n" + " div_step %[dest], %[temp1], %[dest], 20\n" + " div_step %[dest], %[temp1], %[dest], 19\n" + " div_step %[dest], %[temp1], %[dest], 18\n" + " div_step %[dest], %[temp1], %[dest], 17\n" + " div_step %[dest], %[temp1], %[dest], 16\n" + " div_step %[dest], %[temp1], %[dest], 15\n" + " div_step %[dest], %[temp1], %[dest], 14\n" + " div_step %[dest], %[temp1], %[dest], 13\n" + " div_step %[dest], %[temp1], %[dest], 12\n" + " div_step %[dest], %[temp1], %[dest], 11\n" + " div_step %[dest], %[temp1], %[dest], 10\n" + " div_step %[dest], %[temp1], %[dest], 9\n" + " div_step %[dest], %[temp1], %[dest], 8\n" + " div_step %[dest], %[temp1], %[dest], 7\n" + " div_step %[dest], %[temp1], %[dest], 6\n" + " div_step %[dest], %[temp1], %[dest], 5\n" + " div_step %[dest], %[temp1], %[dest], 4\n" + " div_step %[dest], %[temp1], %[dest], 3\n" + " div_step %[dest], %[temp1], %[dest], 2\n" + " div_step %[dest], %[temp1], %[dest], 1\n" + "3:\n" + " div_step %[dest], %[temp1], %[dest], 0\n" + "4:\n" + " jump 5f\n" + "2:\n" + " move.u %[dest], %[dividend], true, 4b\n" + "1:\n" + " fault 2\n" + "5:\n" + : [dest] "=r"(dest), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1) + : [dividend] "r"(dividend), [divider] "r"(divider)); + /* clang-format on */ + + return dest; +} diff --git a/compiler-rt/lib/builtins/dpu/udiv64.c b/compiler-rt/lib/builtins/dpu/udiv64.c new file mode 100644 index 0000000000000..e55b3ffe9904c --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udiv64.c @@ -0,0 +1,59 @@ +/* Copyright 2020 UPMEM. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +/* + * 64x64 multiplication unsigned division. 
+ */ +#include <stdint.h> + +static unsigned int +__clz__(uint64_t x) +{ + return __builtin_clzll(x); /* unsigned long long is at least 64 bits wide on every target; x is never 0 here */ +} + +uint64_t +__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder) +{ + uint64_t dxo = dividend, dxe = 0; + + if (divider == 0) { + __asm__ volatile("fault 2"); + /* unreachable(); */ + __builtin_unreachable(); + } + if (divider > dividend) { + if (ask_remainder == 0) + return 0; + else + return dividend; + } + + // Mimic the div_step. + /// div_step functionality: + // if (Dxo >= (Ra<< #u5)) { + // Dxo = Dxo - (Ra<< #u5); + // Dxe = (Dxe << 1) | 1; + // } else { + // Dxe = Dxe << 1; + // } + int dividerl0 = __clz__(divider), dividendl0 = __clz__(dividend); + + int i = dividerl0 - dividendl0; + + for (; i >= 0; i--) { + uint64_t pivot = ((uint64_t)divider << i); + if (dxo >= pivot) { + dxo = dxo - pivot; + dxe = ((uint64_t)dxe << 1) | 1L; + } else { + dxe = (uint64_t)dxe << 1; + } + } + if (ask_remainder != 0) /* any non-zero selector means remainder, same as the early return above */ + return dxo; + else + return dxe; +} diff --git a/compiler-rt/lib/builtins/dpu/udivdi3.c b/compiler-rt/lib/builtins/dpu/udivdi3.c new file mode 100644 index 0000000000000..1b60b934b85f4 --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udivdi3.c @@ -0,0 +1,19 @@ +/* Copyright 2020 UPMEM. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +/* + * 64x64 unsigned division. + * + * This is the actual libcall implementation, as requested by the compiler. 
+ */ +#include <stdint.h> +extern uint64_t +__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder); + +uint64_t +__udivdi3(uint64_t dividend, uint64_t divider) +{ + return __udiv64(dividend, divider, 0); +} diff --git a/compiler-rt/lib/builtins/dpu/udivmodsi4.c b/compiler-rt/lib/builtins/dpu/udivmodsi4.c new file mode 100644 index 0000000000000..3a3f3902b6f61 --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udivmodsi4.c @@ -0,0 +1,29 @@ +/*===-- udivmodsi4.c - Implement __udivmodsi4 ------------------------------=== + * + * The LLVM Compiler Infrastructure + * + * This file is dual licensed under the MIT and the University of Illinois Open + * Source Licenses. See LICENSE_LLVM.TXT for details. + * + * ===----------------------------------------------------------------------=== + * + * This file implements __udivmodsi4 for the compiler_rt library. + * + * ===----------------------------------------------------------------------=== + */ + +#include <stdint.h> + +extern uint64_t __udiv32(uint32_t dividend, uint32_t divider); + +#include "../int_lib.h" /* resolved relative to this dpu/ directory, as udivsi3.c does */ + +/* Returns: a / b, *rem = a % b */ + +COMPILER_RT_ABI su_int +__udivmodsi4(su_int a, su_int b, su_int *rem) +{ + uint64_t res = __udiv32(a, b); + *rem = (su_int)res; + return (su_int) (res >> 32); +} diff --git a/compiler-rt/lib/builtins/dpu/udivsi3.c b/compiler-rt/lib/builtins/dpu/udivsi3.c new file mode 100644 index 0000000000000..dcc1d9fcf672f --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/udivsi3.c @@ -0,0 +1,15 @@ +#include <stdint.h> + +extern uint64_t __udiv32(uint32_t dividend, uint32_t divider); + +#include "../int_lib.h" + +typedef su_int fixuint_t; +typedef si_int fixint_t; + +// Returns: a / b + +COMPILER_RT_ABI su_int __udivsi3(su_int a, su_int b) { + uint64_t res = __udiv32(a, b); + return (su_int) (res >> 32); +} diff --git a/compiler-rt/lib/builtins/dpu/umoddi3.c b/compiler-rt/lib/builtins/dpu/umoddi3.c new file mode 100644 index 0000000000000..4b3a82b01eb98 --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/umoddi3.c @@ -0,0 
+1,19 @@ +/* Copyright 2020 UPMEM. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +/* + * 64x64 unsigned remainder. + * + * This is the actual libcall implementation, as requested by the compiler. + */ +#include <stdint.h> +extern uint64_t +__udiv64(uint64_t dividend, uint64_t divider, int ask_remainder); + +uint64_t +__umoddi3(uint64_t dividend, uint64_t divider) +{ + return __udiv64(dividend, divider, 1); /* third argument 1 selects the remainder */ +} diff --git a/compiler-rt/lib/builtins/dpu/umodsi3.c b/compiler-rt/lib/builtins/dpu/umodsi3.c new file mode 100644 index 0000000000000..c85cd8a4d9aed --- /dev/null +++ b/compiler-rt/lib/builtins/dpu/umodsi3.c @@ -0,0 +1,27 @@ +/* ===-- umodsi3.c - Implement __umodsi3 -----------------------------------=== + * + * The LLVM Compiler Infrastructure + * + * This file is dual licensed under the MIT and the University of Illinois Open + * Source Licenses. See LICENSE_LLVM.TXT for details. + * + * ===----------------------------------------------------------------------=== + * + * This file implements __umodsi3 for the compiler_rt library. 
+ * + * ===----------------------------------------------------------------------=== + */ + +#include "int_lib.h" + +/* Returns: a % b */ + +extern unsigned long +__udiv32(unsigned int, unsigned int); + +COMPILER_RT_ABI su_int +__umodsi3(su_int a, su_int b) +{ + unsigned long res = __udiv32(a, b); + return (unsigned int)res; +} diff --git a/compiler-rt/test/builtins/Unit/comparedf2_test.c b/compiler-rt/test/builtins/Unit/comparedf2_test.c index 27666e2ad689b..d606ae7eff6ca 100644 --- a/compiler-rt/test/builtins/Unit/comparedf2_test.c +++ b/compiler-rt/test/builtins/Unit/comparedf2_test.c @@ -458,7 +458,7 @@ static const struct TestVector vectors[] = { {__builtin_inf(),__builtin_inf(),0,0,0,0,0,0,0}, }; -int main(int argc, char *argv[]) { +int main() { const int numVectors = sizeof vectors / sizeof vectors[0]; int i; for (i = 0; i +#include + +namespace llvm { + +#define POSTRA_FUSION_METADATA_STRING "MySpecialMetadata" + +MDNode *getPostRAFusionMetadata(const MachineFunction *MF) { + LLVMContext &Context = MF->getFunction().getContext(); + return MDNode::get(Context, + MDString::get(Context, POSTRA_FUSION_METADATA_STRING)); +} + +bool hasPostRAFusionMetadata(const MachineInstr *MI) { + for (const MachineOperand &Op : MI->operands()) { + if (!Op.isMetadata()) + continue; + + LLVMContext &Context = MI->getMF()->getFunction().getContext(); + if (Op.getMetadata()->getOperand(0).get() == + MDString::get(Context, POSTRA_FUSION_METADATA_STRING)) { + return true; + } + } + + return false; +} + +MachineInstr * +getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I, + MachineBasicBlock::reverse_iterator REnd) { + // Skip all the debug instructions. 
+ while (I != REnd && + (I->isDebugValue() || I->getOpcode() == TargetOpcode::DBG_VALUE)) { + ++I; + } + if (I == REnd) { + return NULL; + } + return &*I; +} + +} // namespace llvm diff --git a/llvm/lib/Target/DPU/DPUHelper.h b/llvm/lib/Target/DPU/DPUHelper.h new file mode 100644 index 0000000000000..9b3436bcad68c --- /dev/null +++ b/llvm/lib/Target/DPU/DPUHelper.h @@ -0,0 +1,20 @@ +#ifndef LLVM_LIB_TARGET_DPU_DPUHELPER_H +#define LLVM_LIB_TARGET_DPU_DPUHELPER_H + +#include +#include +#include +#include + +namespace llvm { + +MDNode *getPostRAFusionMetadata(const MachineFunction *MF); +bool hasPostRAFusionMetadata(const MachineInstr *MI); + +MachineInstr * +getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I, + MachineBasicBlock::reverse_iterator REnd); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_DPU_DPUHELPER_H diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.cpp b/llvm/lib/Target/DPU/DPUInstrInfo.cpp index db957f97bcaa9..f55e72108145c 100644 --- a/llvm/lib/Target/DPU/DPUInstrInfo.cpp +++ b/llvm/lib/Target/DPU/DPUInstrInfo.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "DPUHelper.h" #include "DPUInstrInfo.h" #include "DPUTargetMachine.h" @@ -106,14 +107,14 @@ bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::JUMPr)).addReg(DPU::R23); break; case DPU::CALLi: - BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri)) - .addReg(DPU::R23) - .add(MI.getOperand(0)); + BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLri), DPU::R23) + .add(MI.getOperand(0)) + .copyImplicitOps(MI); break; case DPU::CALLr: - BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr)) - .addReg(DPU::R23) - .add(MI.getOperand(0)); + BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::CALLrr), DPU::R23) + .add(MI.getOperand(0)) + .copyImplicitOps(MI); break; case DPU::ADD_VAStart: { // Get the first index in stack where the first // vaargs is stored @@ -122,8 +123,7 @@ 
bool DPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { StackSize = MF->getFrameInfo().getStackSize(); } unsigned int ResultReg = MI.getOperand(0).getReg(); - BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif)) - .addReg(ResultReg) + BuildMI(MBB, MI, MI.getDebugLoc(), get(DPU::SUBrrif), ResultReg) .addReg(DPU::R22) .addImm(StackSize + STACK_SIZE_FOR_D22) .addImm(DPUAsmCondition::Condition::False); @@ -301,10 +301,20 @@ static void fetchConditionalBranchInfo(MachineInstr *Inst, Cond.push_back(operand); } } + + for (const MachineOperand &Op : Inst->operands()) { + if (Op.isMetadata()) { + Cond.push_back(Op); + } + } } static inline bool isAnalyzableBranch(MachineInstr *Inst) { - return Inst->isBranch() && !Inst->isIndirectBranch(); + return (Inst->isBranch() && !Inst->isIndirectBranch() + // We intentionally know that those will be optimized by us + // during DPUPostRAFusion, don't let split the critical edge + // && !hasPostRAFusionMetadata(Inst) + ); } bool DPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB, @@ -451,15 +461,22 @@ void DPUInstrInfo::buildConditionalBranch(MachineBasicBlock &MBB, MIB = BuildMI(&MBB, DL, get(Opc)); for (unsigned i = 1; i < Cond.size(); ++i) { - if (Cond[i].isReg()) - MIB.addReg(Cond[i].getReg()); - else if (Cond[i].isImm()) - MIB.addImm(Cond[i].getImm()); - else + if (Cond[i].isReg() || Cond[i].isImm()) { + MIB->addOperand(Cond[i]); + } else if (Cond[i].isMetadata()) { + // skip + } else { assert(false && "Cannot copy operand"); + } } MIB.addMBB(TBB); + + for (unsigned i = 0; i < Cond.size(); ++i) { + if (Cond[i].isMetadata()) { + MIB.addMetadata(Cond[i].getMetadata()); + } + } } unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB, @@ -494,3 +511,10 @@ unsigned DPUInstrInfo::insertBranch(MachineBasicBlock &MBB, *BytesAdded = nrOfInsertedMachineInstr; return nrOfInsertedMachineInstr; } + +bool DPUInstrInfo::shouldSink(const MachineInstr &MI) const { + if (hasPostRAFusionMetadata(&MI)) + return false; + + return 
TargetInstrInfo::shouldSink(MI); +} diff --git a/llvm/lib/Target/DPU/DPUInstrInfo.h b/llvm/lib/Target/DPU/DPUInstrInfo.h index e9c2a3b920a05..14c199c9160e8 100644 --- a/llvm/lib/Target/DPU/DPUInstrInfo.h +++ b/llvm/lib/Target/DPU/DPUInstrInfo.h @@ -65,6 +65,8 @@ class DPUInstrInfo : public DPUGenInstrInfo { void buildConditionalBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL, ArrayRef Cond) const; + + bool shouldSink(const MachineInstr &MI) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/DPU/DPUMCInstLower.cpp b/llvm/lib/Target/DPU/DPUMCInstLower.cpp index 311c64f86b142..954f3834cc138 100644 --- a/llvm/lib/Target/DPU/DPUMCInstLower.cpp +++ b/llvm/lib/Target/DPU/DPUMCInstLower.cpp @@ -102,6 +102,7 @@ void DPUMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; case MachineOperand::MO_RegisterMask: + case MachineOperand::MO_Metadata: continue; case MachineOperand::MO_GlobalAddress: diff --git a/llvm/lib/Target/DPU/DPUMacroFusion.cpp b/llvm/lib/Target/DPU/DPUMacroFusion.cpp index a606c017d7cfb..2cec6c8ea4ccd 100644 --- a/llvm/lib/Target/DPU/DPUMacroFusion.cpp +++ b/llvm/lib/Target/DPU/DPUMacroFusion.cpp @@ -7,8 +7,10 @@ // //===----------------------------------------------------------------------===// +#include "DPUHelper.h" #include "DPUMacroFusion.h" #include "DPUSubtarget.h" + #include "llvm/CodeGen/MacroFusion.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/Support/Debug.h" @@ -28,14 +30,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, // We are mainly interested in merging a simple operation with a simple // conditional/unconditional branch LLVM_DEBUG({ - dbgs() << "DPU/Merge: checking macro fusion:\n\t"; - if (!FirstMI) - dbgs() << ""; - else - FirstMI->dump(); - dbgs() << "\n\t"; - SecondMI.dump(); - dbgs() << "\n"; + dbgs() << "DPU/Merge: checking macro fusion:\n"; + if (!FirstMI) { + dbgs() << "\t\n"; + } else { + dbgs() << "\t"; FirstMI->dump(); + } + dbgs() << "\t"; 
SecondMI.dump(); }); if (!FirstMI) { @@ -45,6 +46,26 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, return true; } + // check if they are candidate for PostRAFusion + if (hasPostRAFusionMetadata(FirstMI) + && hasPostRAFusionMetadata(&SecondMI)) { + // and if they share operands + for (auto &FirstMIOperands : FirstMI->operands()) { + if (!FirstMIOperands.isReg()) + continue; + + for (auto &SecondMIOperands : SecondMI.operands()) { + if (!SecondMIOperands.isReg()) + continue; + + if (FirstMIOperands.getReg() == SecondMIOperands.getReg()) { + LLVM_DEBUG({ dbgs() << "DPU/Merge: the two instructions can be fused in PostRA\n"; }); + return true; + } + } + } + } + unsigned firstOpc = FirstMI->getOpcode(); unsigned secondOpc = SecondMI.getOpcode(); diff --git a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp index 998d4f0d4bcc5..c774c236490ed 100644 --- a/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp +++ b/llvm/lib/Target/DPU/DPUMergeComboInstrPass.cpp @@ -6,12 +6,14 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +#include "DPU.h" +#include "DPUHelper.h" #include "DPUTargetMachine.h" -#include -#include -#include "DPU.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include + +#include #define GET_INSTRINFO_ENUM @@ -188,20 +190,6 @@ static const ISD::CondCode sourceConditions[] = { ISD::SETOEQ, ISD::SETOGE, ISD::SETOLT, ISD::SETONE, ISD::SETUEQ, ISD::SETEQ, ISD::SETGE, ISD::SETLT, ISD::SETNE}; -static MachineInstr * -getLastNonDebugInstrFrom(MachineBasicBlock::reverse_iterator &I, - MachineBasicBlock::reverse_iterator REnd) { - // Skip all the debug instructions. 
- while (I != REnd && - (I->isDebugValue() || I->getOpcode() == TargetOpcode::DBG_VALUE)) { - ++I; - } - if (I == REnd) { - return NULL; - } - return &*I; -} - static bool mergeComboInstructionsInMBB(MachineBasicBlock *MBB, const DPUInstrInfo &InstrInfo) { MachineBasicBlock::reverse_iterator I = MBB->rbegin(), REnd = MBB->rend(); diff --git a/llvm/lib/Target/DPU/DPUPostRAFusion.cpp b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp new file mode 100644 index 0000000000000..135cb730a443c --- /dev/null +++ b/llvm/lib/Target/DPU/DPUPostRAFusion.cpp @@ -0,0 +1,252 @@ +#include "DPU.h" +#include "DPUHelper.h" +#include "DPUTargetMachine.h" + +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +#define GET_INSTRINFO_ENUM + +#include "DPUCondCodes.h" +#include "DPUGenInstrInfo.inc" +#include "DPUISelLowering.h" +#include "MCTargetDesc/DPUAsmCondition.h" + +#define GET_REGINFO_ENUM +#include "DPUGenRegisterInfo.inc" + +#define DEBUG_TYPE "dpu-postra-fusion" + +using namespace llvm; + +namespace { +class DPUPostRAFusionPass : public MachineFunctionPass { +public: + const DPUInstrInfo *TII = nullptr; /* assigned by runOnMachineFunction before any block is visited */ + static char ID; + + explicit DPUPostRAFusionPass(DPUTargetMachine &TM) + : MachineFunctionPass(ID), TM(TM) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + llvm::StringRef getPassName() const override { + return "DPU PostRA Fusion"; + } + +private: + const DPUTargetMachine &TM; + bool runOnMachineBB(MachineBasicBlock &MBB); +}; + +char DPUPostRAFusionPass::ID = 0; +} // namespace + +FunctionPass *llvm::createDPUPostRAFusionPass(DPUTargetMachine &TM) { + return new DPUPostRAFusionPass(TM); +} + +bool DPUPostRAFusionPass::runOnMachineBB(MachineBasicBlock &MBB) { + MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend(); + MachineInstr *LastInst, *SecondLastInst; + unsigned int LastOpc, SecondLastOpc; + + LastInst = getLastNonDebugInstrFrom(I, REnd); + if (LastInst == NULL) { + return false; + } + I++; + SecondLastInst 
= getLastNonDebugInstrFrom(I, REnd); + if (SecondLastInst == NULL) { + return false; + } + + LastOpc = LastInst->getOpcode(); + SecondLastOpc = SecondLastInst->getOpcode(); + + if (!hasPostRAFusionMetadata(LastInst) + || !hasPostRAFusionMetadata(SecondLastInst)) { + return false; + } + + DebugLoc DL = SecondLastInst->getDebugLoc(); + + // attempt to merge lsl/r variant; and XX 32; jeq XX 32; instructions + // that has a special metadata + // TODO: implement more generic situation without the metadata + // TODO: split-critical-edge could break BB and reverse cond+branch + if ((LastOpc == DPU::JEQrii // || LastOpc == DPU::JNEQrii + ) + && SecondLastOpc == DPU::ANDrri) { + I++; + MachineInstr *ThirdLastInst = getLastNonDebugInstrFrom(I, REnd); + if (ThirdLastInst == NULL) { + // LLVM_DEBUG(dbgs() << "KO: I++ == REnd\n"); + return false; + } + + if (!hasPostRAFusionMetadata(ThirdLastInst)) { + // This should not happen AFAIK, but I don't know everything yet ... + return false; + } + + unsigned int ThirdLastOpc = ThirdLastInst->getOpcode(); + if (ThirdLastOpc == DPU::LSLXrrr || ThirdLastOpc == DPU::LSRXrrr + || ThirdLastOpc == DPU::LSLrrr || ThirdLastOpc == DPU::LSRrrr) { + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "before change: \n"; + dbgs() << "** MBB "; MBB.dump(); + }); + + unsigned int NewOpcode; + + switch (ThirdLastOpc) { + default: + report_fatal_error("This should not happen. 
Please report to UPMEM."); + break; + + case DPU::LSLXrrr: + NewOpcode = DPU::LSLXrrrci; + break; + + case DPU::LSRXrrr: + NewOpcode = DPU::LSRXrrrci; + break; + + case DPU::LSLrrr: + NewOpcode = DPU::LSLrrrci; + break; + + case DPU::LSRrrr: + NewOpcode = DPU::LSRrrrci; + break; + } + + MachineInstrBuilder ComboInst = BuildMI(&MBB, ThirdLastInst->getDebugLoc(), + TII->get(NewOpcode), + ThirdLastInst->getOperand(0).getReg()); + ComboInst.add(ThirdLastInst->getOperand(1)); + ComboInst.add(ThirdLastInst->getOperand(2)); + ComboInst.addImm(DPUAsmCondition::Condition::Shift32); + ComboInst.addMBB(LastInst->getOperand(2).getMBB()); + // ComboInst.addMetadata(N); + + LLVM_DEBUG({ + dbgs() << "OK\n"; + dbgs() << "del "; ThirdLastInst->dump(); + dbgs() << "del "; SecondLastInst->dump(); + dbgs() << "del "; LastInst->dump(); + dbgs() << "fused to\n"; + dbgs() << "add "; ComboInst->dump(); + }); + + LastInst->eraseFromParent(); + SecondLastInst->eraseFromParent(); + ThirdLastInst->eraseFromParent(); + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "after change: \n"; + dbgs() << "** MBB "; MBB.dump(); + }); + return true; + } + } + + // attempt to optimize MUL_UL_ULrrr + comp res 256 + branch + // original code is JLTUrii, but JGEUrii could be introduce by analyzeBranch + // if ((LastOpc == DPU::JLTUrii || LastOpc == DPU::JGEUrii) + // && SecondLastOpc == DPU::MUL_UL_ULrrr) { + + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "before change: \n"; + // dbgs() << "** MBB "; MBB->dump(); + // }); + + // MachineInstrBuilder ComboInst = BuildMI(MBB, SecondLastInst->getDebugLoc(), + // InstrInfo.get(DPU::MUL_UL_ULrrrci), + // SecondLastInst->getOperand(0).getReg()); + // ComboInst.add(SecondLastInst->getOperand(1)); + // ComboInst.add(SecondLastInst->getOperand(1)); + // ComboInst.addImm(DPUAsmCondition::Small); + // ComboInst.addMBB(LastInst->getOperand(2).getMBB()); + // // 
ComboInst.addMetadata(N); + + // LLVM_DEBUG({ + // dbgs() << "OK\n"; + // dbgs() << "del "; SecondLastInst->dump(); + // dbgs() << "del "; LastInst->dump(); + // dbgs() << "fused to\n"; + // dbgs() << "add "; ComboInst->dump(); + // }); + // LastInst->eraseFromParent(); + // SecondLastInst->eraseFromParent(); + + // LLVM_DEBUG({ + // dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + // dbgs() << "after change: \n"; + // dbgs() << "** MBB "; MBB->dump(); + // }); + + // return true; + // } + + // the original code is JNEQrii, but it is possible that split-critical-edge breaks + // the BB and reverse cond+branch + if ((LastOpc == DPU::JNEQrii || LastOpc == DPU::JEQrii) + && SecondLastOpc == DPU::CLZ_Urr) { + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "before change: \n"; + dbgs() << "** MBB "; MBB.dump(); + }); + + MachineInstrBuilder ComboInst = BuildMI(&MBB, DL, TII->get(DPU::CLZ_Urrci), + SecondLastInst->getOperand(0).getReg()); + ComboInst.add(SecondLastInst->getOperand(1)); + ComboInst.addImm((LastOpc == DPU::JNEQrii) ? 
+ DPUAsmCondition::Condition::NotMaximum : DPUAsmCondition::Condition::Maximum); + ComboInst.addMBB(LastInst->getOperand(2).getMBB()); + // ComboInst.addMetadata(N); + + LLVM_DEBUG({ + dbgs() << "OK\n"; + dbgs() << "del "; SecondLastInst->dump(); + dbgs() << "del "; LastInst->dump(); + dbgs() << "fused to\n"; + dbgs() << "add "; ComboInst->dump(); + }); + + LastInst->eraseFromParent(); + SecondLastInst->eraseFromParent(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "after change: \n"; + dbgs() << "** MBB "; MBB.dump(); + }); + + return true; + } + + return false; +} + +bool DPUPostRAFusionPass::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "********** DPU/DPUPostRAFusionPass: " << MF.getName() + << " **********\n\n"); + + TII = static_cast(MF.getSubtarget().getInstrInfo()); + bool Modified = false; + + for (auto &MBB : MF) { + Modified |= runOnMachineBB(MBB); + } + + LLVM_DEBUG(dbgs() << "********** DPU/DPUPostRAFusionPass: " << MF.getName() + << " done: Modified = " << Modified << " **********\n\n"); + return Modified; +} diff --git a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp index bbfb4fec0d67e..9b417cd8f12e2 100644 --- a/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp +++ b/llvm/lib/Target/DPU/DPUResolveMacroInstrPass.cpp @@ -149,6 +149,11 @@ static void resolve64BitRegisterAluInstruction( MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBIter, const DPUInstrInfo &InstrInfo, unsigned int LsbOpcode, unsigned int MsbOpcode) { + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + }); MachineFunction *MF = MBB->getParent(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); @@ -173,6 +178,11 @@ static void resolve64BitRegisterAluInstruction( MSBDestReg) .addReg(MSBDOp1Reg) 
.addReg(MSBOp2Reg); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** MBB: "; MBB->dump(); + }); } static void resolveJeq64(MachineBasicBlock *MBB, @@ -181,6 +191,25 @@ static void resolveJeq64(MachineBasicBlock *MBB, const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); MachineFunction *F = MBB->getParent(); + + bool need_splice = std::next(MBBIter) != MBB->end(); + + MachineBasicBlock *FTMBB = MBB->getFallThrough(); + MachineBasicBlock *JumpMBB = MBBIter->getOperand(3).getMBB(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "** need_splice: " << need_splice << "\n"; + dbgs() << "** canFallThrough: " << MBB->canFallThrough() << "\n"; + if (MBB->canFallThrough()) { + dbgs() << "** FTMBB: "; FTMBB->dump(); + } + dbgs() << "** JumpMBB: "; JumpMBB->dump(); + dbgs() << "****** \n"; + }); + MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(I, trueMBB); @@ -190,12 +219,13 @@ static void resolveJeq64(MachineBasicBlock *MBB, endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end()); endMBB->transferSuccessorsAndUpdatePHIs(MBB); // Next, add the true and fallthrough blocks as its successors. 
- auto JumpMBB = MBBIter->getOperand(3).getMBB(); MBB->addSuccessor(trueMBB); MBB->addSuccessor(endMBB); trueMBB->addSuccessor(JumpMBB); trueMBB->addSuccessor(endMBB); + endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true); + unsigned int Op1Reg = MBBIter->getOperand(1).getReg(); unsigned int Op2Reg = MBBIter->getOperand(2).getReg(); @@ -215,6 +245,19 @@ static void resolveJeq64(MachineBasicBlock *MBB, .addReg(MsbOp1Reg) .addReg(MsbOp2Reg) .addMBB(JumpMBB); + trueMBB->addLiveIn(MsbOp1Reg); + trueMBB->addLiveIn(MsbOp2Reg); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** need_splice: " << need_splice << "\n"; + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "** trueMBB: "; trueMBB->dump(); + dbgs() << "** endMBB: "; endMBB->dump(); + if (FTMBB) { dbgs() << "** FTMBB: "; FTMBB->dump(); } /* getFallThrough() may have returned null; resolveJneq64 guards this dump too */ + dbgs() << "** JumpMBB: "; JumpMBB->dump(); + dbgs() << "****** \n"; + }); } static void resolveJneq64(MachineBasicBlock *MBB, @@ -223,6 +266,25 @@ static void resolveJneq64(MachineBasicBlock *MBB, const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); MachineFunction *F = MBB->getParent(); + + bool need_splice = std::next(MBBIter) != MBB->end(); + bool canFallThrough = MBB->canFallThrough(); + MachineBasicBlock *FTMBB = MBB->getFallThrough(); + MachineBasicBlock * JumpMBB = MBBIter->getOperand(3).getMBB(); + + LLVM_DEBUG({ + dbgs() << __FILE__ << " " << __LINE__ << " " << __func__ << "\n"; + dbgs() << "instruction to replace: "; MBBIter->dump(); + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "** need_splice: " << need_splice << "\n"; + dbgs() << "** canFallThrough: " << canFallThrough << "\n"; + if (canFallThrough) { + dbgs() << "** FTMBB: "; FTMBB->dump(); + } + dbgs() << "** JumpMBB: "; JumpMBB->dump(); + dbgs() << "****** \n"; + }); + MachineBasicBlock *trueMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(I, 
trueMBB); @@ -232,12 +294,13 @@ static void resolveJneq64(MachineBasicBlock *MBB, endMBB->splice(endMBB->begin(), MBB, std::next(MBBIter), MBB->end()); endMBB->transferSuccessorsAndUpdatePHIs(MBB); // Next, add the true and fallthrough blocks as its successors. - auto JumpMBB = MBBIter->getOperand(3).getMBB(); MBB->addSuccessor(trueMBB); MBB->addSuccessor(JumpMBB); trueMBB->addSuccessor(JumpMBB); trueMBB->addSuccessor(endMBB); + endMBB->removeSuccessor(JumpMBB, /* NormalizeSuccProbs = */ true); + unsigned int Op1Reg = MBBIter->getOperand(1).getReg(); unsigned int Op2Reg = MBBIter->getOperand(2).getReg(); @@ -257,6 +320,21 @@ static void resolveJneq64(MachineBasicBlock *MBB, .addReg(MsbOp1Reg) .addReg(MsbOp2Reg) .addMBB(JumpMBB); + trueMBB->addLiveIn(MsbOp1Reg); + trueMBB->addLiveIn(MsbOp2Reg); + + LLVM_DEBUG({ + dbgs() << "** instruction replaced, but still need removal\n"; + dbgs() << "** need_splice: " << need_splice << "\n"; + dbgs() << "** MBB: "; MBB->dump(); + dbgs() << "** trueMBB: "; trueMBB->dump(); + dbgs() << "** endMBB: "; endMBB->dump(); + if (canFallThrough) { + dbgs() << "** FTMBB: "; FTMBB->dump(); + } + dbgs() << "** JumpMBB: "; JumpMBB->dump(); + dbgs() << "****** \n"; + }); } static void resolveJcc64AsSub64(MachineBasicBlock *MBB, @@ -499,5 +577,8 @@ bool DPUResolveMacroInstrPass::runOnMachineFunction(MachineFunction &MF) { changeMade |= resolveMacroInstructionsInMBB(MBB, InstrInfo); } + LLVM_DEBUG(dbgs() << "********** DPU/ResolveMacroInstrPass: " << MF.getName() + << " done: changeMade = " << changeMade << " **********\n\n"); + return changeMade; } diff --git a/llvm/lib/Target/DPU/DPUTargetLowering.cpp b/llvm/lib/Target/DPU/DPUTargetLowering.cpp index beb7d532e2d00..4dd9f7b0fdfe1 100644 --- a/llvm/lib/Target/DPU/DPUTargetLowering.cpp +++ b/llvm/lib/Target/DPU/DPUTargetLowering.cpp @@ -12,10 +12,12 @@ // //===----------------------------------------------------------------------===// -#include "DPUTargetLowering.h" +#include "DPUHelper.h" 
#include "DPUISelLowering.h" #include "DPUMachineFunctionInfo.h" +#include "DPUTargetLowering.h" #include "DPUTargetMachine.h" + #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -24,8 +26,9 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/IntrinsicsDPU.h" -#include -#include +#include "llvm/MC/MCSymbol.h" + +// #include #define GET_REGINFO_ENUM @@ -1971,18 +1974,18 @@ EmitMul16WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, BB->addSuccessor(fastMBB); slowMBB->addSuccessor(fastMBB); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int Op1 = MI.getOperand(1).getReg(); - unsigned int Op2 = MI.getOperand(2).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Op1 = MI.getOperand(1).getReg(); + Register Op2 = MI.getOperand(2).getReg(); MachineRegisterInfo &RI = F->getRegInfo(); - unsigned int LLDest = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int HLDest = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int HL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int HHDest = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int LSL1Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register LLDest = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register HLDest = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register HL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register HHDest = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register LSL1Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register LSL2Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register LSL3Dest = RI.createVirtualRegister(&DPU::GP_REGRegClass); BuildMI(BB, dl, TII.get(MulLL), LLDest) 
.addReg(Op1) @@ -2039,10 +2042,10 @@ static MachineBasicBlock *EmitSelectWithCustomInserter(MachineInstr &MI, BB->addSuccessor(endMBB); trueMBB->addSuccessor(endMBB); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int CondReg = MI.getOperand(1).getReg(); - unsigned int TrueReg = MI.getOperand(2).getReg(); - unsigned int FalseReg = MI.getOperand(3).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register CondReg = MI.getOperand(1).getReg(); + Register TrueReg = MI.getOperand(2).getReg(); + Register FalseReg = MI.getOperand(3).getReg(); MachineRegisterInfo &RI = F->getRegInfo(); unsigned FalseResultReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); @@ -2088,10 +2091,10 @@ EmitSelect64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { BB->addSuccessor(trueMBB); BB->addSuccessor(endMBB); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int CondReg = MI.getOperand(1).getReg(); - unsigned int TrueReg = MI.getOperand(2).getReg(); - unsigned int FalseReg = MI.getOperand(3).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register CondReg = MI.getOperand(1).getReg(); + Register TrueReg = MI.getOperand(2).getReg(); + Register FalseReg = MI.getOperand(3).getReg(); BuildMI(BB, dl, TII.get(DPU::Jcci)) .addImm(ISD::CondCode::SETEQ) @@ -2119,12 +2122,12 @@ EmitMramSubStoreWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, MachineFunction *F = BB->getParent(); MachineRegisterInfo &RI = F->getRegInfo(); - unsigned WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MramAddrMSBReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ExactWramCacheAddrReg = + Register WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register MramAddrMSBReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ExactWramCacheAddrReg = 
RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned int storeRegister = MI.getOperand(0).getReg(); + Register storeRegister = MI.getOperand(0).getReg(); // todo __sw_cache_buffer should have abstract representation @@ -2175,8 +2178,8 @@ EmitMramStoreDoubleWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { MachineFunction *F = BB->getParent(); MachineRegisterInfo &RI = F->getRegInfo(); - unsigned WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); // todo __sw_cache_buffer should have abstract representation @@ -2214,10 +2217,10 @@ EmitMramSubLoadWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, MachineFunction *F = BB->getParent(); MachineRegisterInfo &RI = F->getRegInfo(); - unsigned WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MramAddrMSBReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ExactWramCacheAddrReg = + Register WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register MramAddrMSBReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ExactWramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); // todo __sw_cache_buffer should have abstract representation @@ -2263,8 +2266,8 @@ EmitMramLoadDoubleWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { MachineFunction *F = BB->getParent(); MachineRegisterInfo &RI = F->getRegInfo(); - unsigned WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register WramCacheAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + 
Register MramAddrReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); // todo __sw_cache_buffer should have abstract representation @@ -2294,124 +2297,149 @@ EmitMramLoadDoubleWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { return BB; } -static MachineBasicBlock * -EmitLsl64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { +static MachineBasicBlock *emitLsl64RegisterWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) { /* What we want to generate (with dc.h != rb in that example): - lslx __R0, da.l, rb, ?sh32 @+4 + lslx __R0, da.l, rb, ?sh32 @bigShift + smallShift: lsl dc.h, da.h, rb or dc.h, dc.h, __R0 - lsl dc.l, da.l, rb, ?true @+3 + lsl dc.l, da.l, rb, ?true @end + bigShift: lsl dc.h, da.l, rb move dc.l, 0 + end: */ - const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); - DebugLoc dl = MI.getDebugLoc(); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = ++BB->getIterator(); - MachineFunction *F = BB->getParent(); - MachineBasicBlock *smallShiftMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *bigShiftMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(I, smallShiftMBB); - F->insert(I, bigShiftMBB); - F->insert(I, endMBB); + MachineFunction *MF = MBB->getParent(); + + const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineBasicBlock *smallShiftMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *bigShiftMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *endMBB = MF->CreateMachineBasicBlock(BB); + + MachineFunction::iterator I = ++MBB->getIterator(); + MF->insert(I, smallShiftMBB); + MF->insert(I, bigShiftMBB); + MF->insert(I, endMBB); + + // Move all instructions after the instruction to endMBB. 
+ endMBB->splice(endMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + // Update machine-CFG edges by transferring all successors of the current // block to the new block which will contain the Phi node for the select. - endMBB->splice(endMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - endMBB->transferSuccessorsAndUpdatePHIs(BB); + endMBB->transferSuccessorsAndUpdatePHIs(MBB); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int Op1Reg = MI.getOperand(1).getReg(); - unsigned int ShiftReg = MI.getOperand(2).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Op1Reg = MI.getOperand(1).getReg(); + Register ShiftReg = MI.getOperand(2).getReg(); - MachineRegisterInfo &RI = F->getRegInfo(); - unsigned LsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MsbToMsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register LsbToMsbPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register MsbToMsbPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register LsbOp1Reg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register MsbOp1Reg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ShiftCheckReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + + Register BigShiftMsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register BigShiftLsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned BigShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned BigShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register SmallShiftMsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register SmallShiftLsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned SmallShiftMsbReg = 
RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned SmallShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register BigShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register SmallShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SmallShiftResultReg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register BigShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register SmallShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register Undef2Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned BigShiftResultPart0Reg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SmallShiftResultPart0Reg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned Undef2Reg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + MDNode *MDN = getPostRAFusionMetadata(MF); - BuildMI(BB, dl, TII.get(DPU::COPY), LsbOp1Reg) + BuildMI(MBB, DL, TII.get(DPU::COPY), LsbOp1Reg) .addReg(Op1Reg, 0, DPU::sub_32bit); - BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg) + // BuildMI(BB, dl, TII.get(DPU::LSLXrrrci), LsbToMsbPartReg) + // .addReg(LsbOp1Reg) + // .addReg(ShiftReg) + // .addImm(DPUAsmCondition::Condition::Shift32) + // .addMBB(bigShiftMBB); + + BuildMI(MBB, DL, TII.get(DPU::LSLXrrr), LsbToMsbPartReg) .addReg(LsbOp1Reg) .addReg(ShiftReg) - .addImm(DPUAsmCondition::Condition::Shift32) - .addMBB(bigShiftMBB); + .addMetadata(MDN); - BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), MsbOp1Reg) + BuildMI(MBB, DL, TII.get(DPU::ANDrri), ShiftCheckReg) + .addReg(ShiftReg) + .addImm(0x20) + .addMetadata(MDN); + + BuildMI(MBB, DL, TII.get(DPU::JEQrii)) + .addReg(ShiftCheckReg) + .addImm(0x20) + 
.addMBB(bigShiftMBB) + .addMetadata(MDN); + + BuildMI(smallShiftMBB, DL, TII.get(DPU::COPY), MsbOp1Reg) .addReg(Op1Reg, 0, DPU::sub_32bit_hi); - BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), MsbToMsbPartReg) + BuildMI(smallShiftMBB, DL, TII.get(DPU::LSLrrr), MsbToMsbPartReg) .addReg(MsbOp1Reg) .addReg(ShiftReg); - BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftMsbReg) + BuildMI(smallShiftMBB, DL, TII.get(DPU::ORrrr), SmallShiftMsbReg) .addReg(MsbToMsbPartReg) .addReg(LsbToMsbPartReg); - BuildMI(smallShiftMBB, dl, TII.get(DPU::LSLrrr), SmallShiftLsbReg) + BuildMI(smallShiftMBB, DL, TII.get(DPU::LSLrrr), SmallShiftLsbReg) .addReg(LsbOp1Reg) .addReg(ShiftReg); - BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), Undef2Reg); + BuildMI(smallShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), Undef2Reg); - BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), + BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultPart0Reg) .addReg(Undef2Reg) .addReg(SmallShiftLsbReg) .addImm(DPU::sub_32bit); - BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg) + BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg) .addReg(SmallShiftResultPart0Reg) .addReg(SmallShiftMsbReg) .addImm(DPU::sub_32bit_hi); - BuildMI(smallShiftMBB, dl, TII.get(DPU::JUMPi)).addMBB(endMBB); + BuildMI(smallShiftMBB, DL, TII.get(DPU::JUMPi)).addMBB(endMBB); - BuildMI(bigShiftMBB, dl, TII.get(DPU::LSLrrr), BigShiftMsbReg) + BuildMI(bigShiftMBB, DL, TII.get(DPU::LSLrrr), BigShiftMsbReg) .addReg(LsbOp1Reg) .addReg(ShiftReg); - BuildMI(bigShiftMBB, dl, TII.get(DPU::MOVEri), BigShiftLsbReg).addImm(0); + BuildMI(bigShiftMBB, DL, TII.get(DPU::MOVEri), BigShiftLsbReg).addImm(0); - BuildMI(bigShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg); + BuildMI(bigShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg); - BuildMI(bigShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), BigShiftResultPart0Reg) + BuildMI(bigShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), 
BigShiftResultPart0Reg) .addReg(UndefReg) .addReg(BigShiftLsbReg) .addImm(DPU::sub_32bit); - BuildMI(bigShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), BigShiftResultReg) + BuildMI(bigShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), BigShiftResultReg) .addReg(BigShiftResultPart0Reg) .addReg(BigShiftMsbReg) .addImm(DPU::sub_32bit_hi); - BB->addSuccessor(smallShiftMBB); - BB->addSuccessor(bigShiftMBB); + MBB->addSuccessor(smallShiftMBB); + MBB->addSuccessor(bigShiftMBB); smallShiftMBB->addSuccessor(endMBB); bigShiftMBB->addSuccessor(endMBB); - BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest) + BuildMI(*endMBB, endMBB->begin(), DL, TII.get(DPU::PHI), Dest) .addReg(BigShiftResultReg) .addMBB(bigShiftMBB) .addReg(SmallShiftResultReg) @@ -2428,8 +2456,8 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { MachineFunction *F = BB->getParent(); MachineRegisterInfo &RI = F->getRegInfo(); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int Op1Reg = MI.getOperand(1).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Op1Reg = MI.getOperand(1).getReg(); int64_t ShiftImm = MI.getOperand(2).getImm(); if (ShiftImm < 32) { @@ -2439,13 +2467,13 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { lsl_add dc.h __R0 da.h ShiftImm lsl dc.l da.l ShiftImm */ - unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultMsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register 
ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultMsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass); BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb) .addReg(Op1Reg, 0, DPU::sub_32bit); @@ -2484,10 +2512,10 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { lsl dc.h da.l ${ShiftImm - 32} move dc.l 0 */ - unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass); BuildMI(*BB, MI, dl, TII.get(DPU::MOVEri), ResultLsb).addImm(0); BuildMI(*BB, MI, dl, TII.get(DPU::LSLrri), ResultMsb) @@ -2512,10 +2540,10 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { move dc.h da.l move dc.l 0 */ - unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass); 
BuildMI(*BB, MI, dl, TII.get(DPU::MOVEri), ResultLsb).addImm(0); BuildMI(*BB, MI, dl, TII.get(DPU::COPY), ResultMsb) @@ -2538,101 +2566,128 @@ EmitLsl64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) { return BB; } -static MachineBasicBlock *EmitShiftRight64RegisterWithCustomInserter( - MachineInstr &MI, MachineBasicBlock *BB, unsigned int shiftRight, +static MachineBasicBlock *emitShiftRight64RegisterWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *MBB, unsigned int shiftRight, unsigned int shiftRightExtended) { /* What we want to generate (with dc.l != rb in that example): - lsrx __R0, da.h, rb, ?sh32 @+4 + lsrx __R0, da.h, rb, ?sh32 @bigShift + smallShift: lsr dc.l, da.l, rb or dc.l, dc.l, __R0 - lsr dc.h, da.h, rb, ?true @+2 // asr dc.h, da.h, rb, ?true - @+2 lsr.u dc, da.h, rb // asr.s dc, da.h, rb + lsr dc.h, da.h, rb, ?true @end // asr dc.h, da.h, rb, ?true @end + bigShift: + lsr.u dc, da.h, rb // asr.s dc, da.h, rb + end: */ - const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); - DebugLoc dl = MI.getDebugLoc(); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = ++BB->getIterator(); - MachineFunction *F = BB->getParent(); - MachineBasicBlock *smallShiftMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *bigShiftMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(I, smallShiftMBB); - F->insert(I, bigShiftMBB); - F->insert(I, endMBB); + MachineFunction *MF = MBB->getParent(); + + const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineBasicBlock *smallShiftMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *bigShiftMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *endMBB = MF->CreateMachineBasicBlock(BB); + + MachineFunction::iterator I = ++MBB->getIterator(); + MF->insert(I, 
smallShiftMBB); + MF->insert(I, bigShiftMBB); + MF->insert(I, endMBB); + + // Move all instructions after the instruction to EndMBB. + endMBB->splice(endMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + // Update machine-CFG edges by transferring all successors of the current // block to the new block which will contain the Phi node for the select. - endMBB->splice(endMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - endMBB->transferSuccessorsAndUpdatePHIs(BB); + endMBB->transferSuccessorsAndUpdatePHIs(MBB); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int Op1Reg = MI.getOperand(1).getReg(); - unsigned int ShiftReg = MI.getOperand(2).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Op1Reg = MI.getOperand(1).getReg(); + Register ShiftReg = MI.getOperand(2).getReg(); - MachineRegisterInfo &RI = F->getRegInfo(); - unsigned LsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MsbOp1Reg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned MsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned LsbToLsbPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned SmallShiftLsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned SmallShiftMsbReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SmallShiftResultPart0Reg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SmallShiftResultReg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - - BuildMI(BB, dl, TII.get(DPU::COPY), MsbOp1Reg) - .addReg(Op1Reg, 0, DPU::sub_32bit_hi); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register LsbToLsbPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register MsbToLsbPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register 
LsbOp1Reg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register MsbOp1Reg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ShiftCheckReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); - BuildMI(BB, dl, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg) - .addReg(MsbOp1Reg) - .addReg(ShiftReg) - .addImm(DPUAsmCondition::Condition::Shift32) - .addMBB(bigShiftMBB); + Register SmallShiftLsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register SmallShiftMsbReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + + Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register SmallShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register SmallShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register BigShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + + MDNode *MDN = getPostRAFusionMetadata(MF); - BuildMI(smallShiftMBB, dl, TII.get(DPU::COPY), LsbOp1Reg) + BuildMI(MBB, DL, TII.get(DPU::COPY), MsbOp1Reg) + .addReg(Op1Reg, 0, DPU::sub_32bit_hi); + + // BuildMI(MBB, DL, TII.get(DPU::LSRXrrrci), MsbToLsbPartReg) + // .addReg(MsbOp1Reg) + // .addReg(ShiftReg) + // .addImm(DPUAsmCondition::Condition::Shift32) + // .addMBB(bigShiftMBB); + + BuildMI(MBB, DL, TII.get(DPU::LSRXrrr), MsbToLsbPartReg) + .addReg(MsbOp1Reg) + .addReg(ShiftReg) + .addMetadata(MDN); + + BuildMI(MBB, DL, TII.get(DPU::ANDrri), ShiftCheckReg) + .addReg(ShiftReg) + .addImm(0x20) + .addMetadata(MDN); + + BuildMI(MBB, DL, TII.get(DPU::JEQrii)) + .addReg(ShiftCheckReg) + .addImm(0x20) + .addMBB(bigShiftMBB) + .addMetadata(MDN); + + BuildMI(smallShiftMBB, DL, TII.get(DPU::COPY), LsbOp1Reg) .addReg(Op1Reg, 0, DPU::sub_32bit); - BuildMI(smallShiftMBB, dl, TII.get(DPU::LSRrrr), LsbToLsbPartReg) + BuildMI(smallShiftMBB, DL, TII.get(DPU::LSRrrr), LsbToLsbPartReg) .addReg(LsbOp1Reg) .addReg(ShiftReg); - BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftLsbReg) + BuildMI(smallShiftMBB, DL, 
TII.get(DPU::ORrrr), SmallShiftLsbReg) .addReg(MsbToLsbPartReg) .addReg(LsbToLsbPartReg); - BuildMI(smallShiftMBB, dl, TII.get(shiftRight), SmallShiftMsbReg) + BuildMI(smallShiftMBB, DL, TII.get(shiftRight), SmallShiftMsbReg) .addReg(MsbOp1Reg) .addReg(ShiftReg); - BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg); + BuildMI(smallShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg); - BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), - SmallShiftResultPart0Reg) + BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultPart0Reg) .addReg(UndefReg) .addReg(SmallShiftLsbReg) .addImm(DPU::sub_32bit); - BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg) + BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg) .addReg(SmallShiftResultPart0Reg) .addReg(SmallShiftMsbReg) .addImm(DPU::sub_32bit_hi); - BuildMI(smallShiftMBB, dl, TII.get(DPU::JUMPi)).addMBB(endMBB); + BuildMI(smallShiftMBB, DL, TII.get(DPU::JUMPi)).addMBB(endMBB); - BuildMI(bigShiftMBB, dl, TII.get(shiftRightExtended), BigShiftResultReg) + BuildMI(bigShiftMBB, DL, TII.get(shiftRightExtended), BigShiftResultReg) .addReg(MsbOp1Reg) .addReg(ShiftReg); - BB->addSuccessor(smallShiftMBB); - BB->addSuccessor(bigShiftMBB); + MBB->addSuccessor(smallShiftMBB); + MBB->addSuccessor(bigShiftMBB); smallShiftMBB->addSuccessor(endMBB); bigShiftMBB->addSuccessor(endMBB); - BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest) + BuildMI(*endMBB, endMBB->begin(), DL, TII.get(DPU::PHI), Dest) .addReg(BigShiftResultReg) .addMBB(bigShiftMBB) .addReg(SmallShiftResultReg) @@ -2650,8 +2705,8 @@ static MachineBasicBlock *EmitShiftRight64ImmediateWithCustomInserter( MachineFunction *F = BB->getParent(); MachineRegisterInfo &RI = F->getRegInfo(); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int Op1Reg = MI.getOperand(1).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Op1Reg = MI.getOperand(1).getReg(); int64_t 
ShiftImm = MI.getOperand(2).getImm(); if (ShiftImm < 32) { @@ -2661,13 +2716,13 @@ static MachineBasicBlock *EmitShiftRight64ImmediateWithCustomInserter( lsr_add dc.l __R0 da.l ShiftImm lsr dc.h da.h ShiftImm // asr dc.h da.h ShiftImm */ - unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultLsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultLsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); + Register UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass); BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb) .addReg(Op1Reg, 0, DPU::sub_32bit); @@ -2723,7 +2778,7 @@ static MachineBasicBlock *EmitShiftRight64ImmediateWithCustomInserter( } static MachineBasicBlock * -EmitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, +emitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB, unsigned int lsN, unsigned int lsNJump, unsigned int lsNx) { /* @@ -2732,127 +2787,144 @@ EmitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, lsNx __R0, da.l, rb lsNx __R1, da.h, rb lsN dc.h, da.h, rb - lsN __R2, da.l, rb , ?sh32 @+3 + lsN __R2, da.l, rb , ?sh32 @bigShift or dc.h, dc.h, __R0 - or dc.l, __R2, __R1, ?true @+3 + or dc.l, __R2, __R1, ?true @end + bigShift: or 
dc.l, dc.h, __R0 or dc.h, __R2, __R1 + end: */ - const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); - DebugLoc dl = MI.getDebugLoc(); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = ++BB->getIterator(); - MachineFunction *F = BB->getParent(); - MachineBasicBlock *smallShiftMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *bigShiftMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(I, smallShiftMBB); - F->insert(I, bigShiftMBB); - F->insert(I, endMBB); - // Update machine-CFG edges by transferring all successors of the current - // block to the new block which will contain the Phi node for the select. - endMBB->splice(endMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - endMBB->transferSuccessorsAndUpdatePHIs(BB); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int Op1Reg = MI.getOperand(1).getReg(); - unsigned int ShiftReg = MI.getOperand(2).getReg(); + const BasicBlock *BB = MBB->getBasicBlock(); + MachineBasicBlock *smallShiftMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *bigShiftMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *endMBB = MF->CreateMachineBasicBlock(BB); - MachineRegisterInfo &RI = F->getRegInfo(); - unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned Op1LsbShiftX = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned Op1MsbShiftX = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned Op1LsbShift = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned Op1MsbShift = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned SmallShiftLsbResultReg = - 
RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned SmallShiftMsbResultReg = - RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned BigShiftLsbResultReg = - RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned BigShiftMsbResultReg = - RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned BigShiftResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SmallShiftResultReg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); - - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned UndefReg1 = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SmallShiftResultPart0Reg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned BigShiftResultPart0Reg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); - - BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb) + MachineFunction::iterator I = ++MBB->getIterator(); + MF->insert(I, smallShiftMBB); + MF->insert(I, bigShiftMBB); + MF->insert(I, endMBB); + + // Update machine-CFG edges by transferring all successors of the current + // block to the new block which will contain the Phi node for the select. 
+ endMBB->splice(endMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + endMBB->transferSuccessorsAndUpdatePHIs(MBB); + + Register Dest = MI.getOperand(0).getReg(); + Register Op1Reg = MI.getOperand(1).getReg(); + Register ShiftReg = MI.getOperand(2).getReg(); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register Op1Lsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register Op1Msb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register Op1LsbShiftX = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register Op1MsbShiftX = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register Op1LsbShift = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register Op1MsbShift = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register SmallShiftLsbResultReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register SmallShiftMsbResultReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register BigShiftLsbResultReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register BigShiftMsbResultReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register BigShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register SmallShiftResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + + Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register UndefReg1 = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register SmallShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register BigShiftResultPart0Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + + Register ShiftCheckReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + MDNode *MDN = getPostRAFusionMetadata(MF); + + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Lsb) .addReg(Op1Reg, 0, DPU::sub_32bit); - BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Msb) + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Msb) .addReg(Op1Reg, 0, DPU::sub_32bit_hi); - BuildMI(*BB, MI, dl, TII.get(lsNx), 
Op1MsbShiftX) + BuildMI(*MBB, MI, DL, TII.get(lsNx), Op1MsbShiftX) .addReg(Op1Msb) .addReg(ShiftReg); - BuildMI(*BB, MI, dl, TII.get(lsNx), Op1LsbShiftX) + BuildMI(*MBB, MI, DL, TII.get(lsNx), Op1LsbShiftX) .addReg(Op1Lsb) .addReg(ShiftReg); - BuildMI(*BB, MI, dl, TII.get(lsN), Op1MsbShift) + BuildMI(*MBB, MI, DL, TII.get(lsN), Op1MsbShift) .addReg(Op1Msb) .addReg(ShiftReg); - BuildMI(*BB, MI, dl, TII.get(lsNJump), Op1LsbShift) + + // BuildMI(*MBB, MI, DL, TII.get(lsNJump), Op1LsbShift) + // .addReg(Op1Lsb) + // .addReg(ShiftReg) + // .addImm(DPUAsmCondition::Condition::Shift32) + // .addMBB(bigShiftMBB); + BuildMI(*MBB, MI, DL, TII.get(lsN), Op1LsbShift) .addReg(Op1Lsb) .addReg(ShiftReg) - .addImm(DPUAsmCondition::Condition::Shift32) - .addMBB(bigShiftMBB); - - BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftMsbResultReg) + .addMetadata(MDN); + BuildMI(MBB, DL, TII.get(DPU::ANDrri), ShiftCheckReg) + .addReg(ShiftReg) + .addImm(0x20) + .addMetadata(MDN) + ; + BuildMI(MBB, DL, TII.get(DPU::JEQrii)) + .addReg(ShiftCheckReg) + .addImm(0x20) + .addMBB(bigShiftMBB) + .addMetadata(MDN) + ; + + BuildMI(smallShiftMBB, DL, TII.get(DPU::ORrrr), SmallShiftMsbResultReg) .addReg(Op1MsbShift) .addReg(Op1LsbShiftX); - BuildMI(smallShiftMBB, dl, TII.get(DPU::ORrrr), SmallShiftLsbResultReg) + BuildMI(smallShiftMBB, DL, TII.get(DPU::ORrrr), SmallShiftLsbResultReg) .addReg(Op1LsbShift) .addReg(Op1MsbShiftX); - BuildMI(smallShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg); + BuildMI(smallShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg); - BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), + BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultPart0Reg) .addReg(UndefReg) .addReg(SmallShiftLsbResultReg) .addImm(DPU::sub_32bit); - BuildMI(smallShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg) + BuildMI(smallShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), SmallShiftResultReg) .addReg(SmallShiftResultPart0Reg) .addReg(SmallShiftMsbResultReg) 
.addImm(DPU::sub_32bit_hi); - BuildMI(smallShiftMBB, dl, TII.get(DPU::JUMPi)).addMBB(endMBB); + BuildMI(smallShiftMBB, DL, TII.get(DPU::JUMPi)).addMBB(endMBB); - BuildMI(bigShiftMBB, dl, TII.get(DPU::ORrrr), BigShiftLsbResultReg) + BuildMI(bigShiftMBB, DL, TII.get(DPU::ORrrr), BigShiftLsbResultReg) .addReg(Op1MsbShift) .addReg(Op1LsbShiftX); - BuildMI(bigShiftMBB, dl, TII.get(DPU::ORrrr), BigShiftMsbResultReg) + BuildMI(bigShiftMBB, DL, TII.get(DPU::ORrrr), BigShiftMsbResultReg) .addReg(Op1LsbShift) .addReg(Op1MsbShiftX); - BuildMI(bigShiftMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg1); + BuildMI(bigShiftMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg1); - BuildMI(bigShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), BigShiftResultPart0Reg) + BuildMI(bigShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), BigShiftResultPart0Reg) .addReg(UndefReg1) .addReg(BigShiftLsbResultReg) .addImm(DPU::sub_32bit); - BuildMI(bigShiftMBB, dl, TII.get(DPU::INSERT_SUBREG), BigShiftResultReg) + BuildMI(bigShiftMBB, DL, TII.get(DPU::INSERT_SUBREG), BigShiftResultReg) .addReg(BigShiftResultPart0Reg) .addReg(BigShiftMsbResultReg) .addImm(DPU::sub_32bit_hi); - BB->addSuccessor(smallShiftMBB); - BB->addSuccessor(bigShiftMBB); + MBB->addSuccessor(smallShiftMBB); + MBB->addSuccessor(bigShiftMBB); smallShiftMBB->addSuccessor(endMBB); bigShiftMBB->addSuccessor(endMBB); - BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest) + BuildMI(*endMBB, endMBB->begin(), DL, TII.get(DPU::PHI), Dest) .addReg(BigShiftResultReg) .addMBB(bigShiftMBB) .addReg(SmallShiftResultReg) @@ -2863,15 +2935,15 @@ EmitRot64RegisterWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, } static MachineBasicBlock * -EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, +emitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB, unsigned int lsNx, unsigned int lsN_add) { - const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); - DebugLoc dl = MI.getDebugLoc(); 
- MachineFunction *F = BB->getParent(); - MachineRegisterInfo &RI = F->getRegInfo(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int Op1Reg = MI.getOperand(1).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Op1Reg = MI.getOperand(1).getReg(); int64_t ShiftImm = MI.getOperand(2).getImm(); ShiftImm = ShiftImm % 64; @@ -2884,43 +2956,43 @@ EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, lsN_add dc.l, __R1, da.l, imm lsN_add dc.h, __R0, da.h, imm */ - unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultLsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultMsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned ResultPart = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - - BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb) + Register Op1Lsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register Op1Msb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultLsbPart = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultLsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultMsbPart = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultMsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register ResultPart = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Lsb) .addReg(Op1Reg, 0, 
DPU::sub_32bit); - BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Msb) + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Msb) .addReg(Op1Reg, 0, DPU::sub_32bit_hi); - BuildMI(*BB, MI, dl, TII.get(lsNx), ResultLsbPart) + BuildMI(*MBB, MI, DL, TII.get(lsNx), ResultLsbPart) .addReg(Op1Msb) .addImm(ShiftImm); - BuildMI(*BB, MI, dl, TII.get(lsNx), ResultMsbPart) + BuildMI(*MBB, MI, DL, TII.get(lsNx), ResultMsbPart) .addReg(Op1Lsb) .addImm(ShiftImm); - BuildMI(*BB, MI, dl, TII.get(lsN_add), ResultLsb) + BuildMI(*MBB, MI, DL, TII.get(lsN_add), ResultLsb) .addReg(ResultLsbPart) .addReg(Op1Lsb) .addImm(ShiftImm); - BuildMI(*BB, MI, dl, TII.get(lsN_add), ResultMsb) + BuildMI(*MBB, MI, DL, TII.get(lsN_add), ResultMsb) .addReg(ResultMsbPart) .addReg(Op1Msb) .addImm(ShiftImm); - BuildMI(*BB, MI, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg); + BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg); - BuildMI(*BB, MI, dl, TII.get(DPU::INSERT_SUBREG), ResultPart) + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), ResultPart) .addReg(UndefReg) .addReg(ResultLsb) .addImm(DPU::sub_32bit); - BuildMI(*BB, MI, dl, TII.get(DPU::INSERT_SUBREG), Dest) + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dest) .addReg(ResultPart) .addReg(ResultMsb) .addImm(DPU::sub_32bit_hi); @@ -2932,43 +3004,43 @@ EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, lsN_add dc.h, __R1, da.l, ${ShiftImm - 32} lsN_add dc.l, __R0, da.h, ${ShiftImm - 32} */ - unsigned Op1Lsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned Op1Msb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultLsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultLsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultMsbPart = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned ResultMsb = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned ResultPart = 
RI.createVirtualRegister(&DPU::GP64_REGRegClass); - - BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Lsb) + Register Op1Lsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register Op1Msb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultLsbPart = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultLsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultMsbPart = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register ResultMsb = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register ResultPart = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Lsb) .addReg(Op1Reg, 0, DPU::sub_32bit); - BuildMI(*BB, MI, dl, TII.get(DPU::COPY), Op1Msb) + BuildMI(*MBB, MI, DL, TII.get(DPU::COPY), Op1Msb) .addReg(Op1Reg, 0, DPU::sub_32bit_hi); - BuildMI(*BB, MI, dl, TII.get(lsNx), ResultLsbPart) + BuildMI(*MBB, MI, DL, TII.get(lsNx), ResultLsbPart) .addReg(Op1Lsb) .addImm(ShiftImm - 32); - BuildMI(*BB, MI, dl, TII.get(lsNx), ResultMsbPart) + BuildMI(*MBB, MI, DL, TII.get(lsNx), ResultMsbPart) .addReg(Op1Msb) .addImm(ShiftImm - 32); - BuildMI(*BB, MI, dl, TII.get(lsN_add), ResultLsb) + BuildMI(*MBB, MI, DL, TII.get(lsN_add), ResultLsb) .addReg(ResultLsbPart) .addReg(Op1Msb) .addImm(ShiftImm - 32); - BuildMI(*BB, MI, dl, TII.get(lsN_add), ResultMsb) + BuildMI(*MBB, MI, DL, TII.get(lsN_add), ResultMsb) .addReg(ResultMsbPart) .addReg(Op1Lsb) .addImm(ShiftImm - 32); - BuildMI(*BB, MI, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg); + BuildMI(*MBB, MI, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg); - BuildMI(*BB, MI, dl, TII.get(DPU::INSERT_SUBREG), ResultPart) + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), ResultPart) .addReg(UndefReg) .addReg(ResultLsb) .addImm(DPU::sub_32bit); - BuildMI(*BB, MI, dl, TII.get(DPU::INSERT_SUBREG), Dest) + BuildMI(*MBB, MI, DL, TII.get(DPU::INSERT_SUBREG), Dest) .addReg(ResultPart) 
.addReg(ResultMsb) .addImm(DPU::sub_32bit_hi); @@ -2977,82 +3049,108 @@ EmitRot64ImmediateWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB, /* swapd dc da */ - BuildMI(*BB, MI, dl, TII.get(DPU::SWAPDrr), Dest).addReg(Op1Reg); + BuildMI(*MBB, MI, DL, TII.get(DPU::SWAPDrr), Dest).addReg(Op1Reg); } MI.eraseFromParent(); // The pseudo instruction is gone now. - return BB; + return MBB; } -static MachineBasicBlock *EmitClz64WithCustomInserter(MachineInstr &MI, - MachineBasicBlock *BB) { +static MachineBasicBlock *emitClz64WithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) { /* What we want to generate (with dc != da in that example): - clz.u dc, da.h ?nmax @+3 + clz.u dc, da.h ?nmax @end clz dc.l da.l add dc.l dc.l 32 + end: */ - const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); - DebugLoc dl = MI.getDebugLoc(); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = ++BB->getIterator(); - MachineFunction *F = BB->getParent(); - MachineBasicBlock *msbAreZerosMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *endMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(I, msbAreZerosMBB); - F->insert(I, endMBB); + /* + Though, arithmetic+comparison+branch is difficult to manage, + we break `clz.u dc, da.h ?nmax @end` here, and fuse it back later. + */ + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + const BasicBlock *BB = MBB->getBasicBlock(); + + MachineBasicBlock *msbAreZerosMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *endMBB = MF->CreateMachineBasicBlock(BB); + + MachineFunction::iterator I = ++MBB->getIterator(); + MF->insert(I, msbAreZerosMBB); + MF->insert(I, endMBB); + + // Move all instructions after the instruction to endMBB. 
+ endMBB->splice(endMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + // Update machine-CFG edges by transferring all successors of the current // block to the new block which will contain the Phi node for the select. - endMBB->splice(endMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - endMBB->transferSuccessorsAndUpdatePHIs(BB); + endMBB->transferSuccessorsAndUpdatePHIs(MBB); - BB->addSuccessor(msbAreZerosMBB); - BB->addSuccessor(endMBB); + MBB->addSuccessor(msbAreZerosMBB); + MBB->addSuccessor(endMBB); msbAreZerosMBB->addSuccessor(endMBB); - unsigned int Dest = MI.getOperand(0).getReg(); - unsigned int Op1Reg = MI.getOperand(1).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Op1Reg = MI.getOperand(1).getReg(); - MachineRegisterInfo &RI = F->getRegInfo(); - unsigned FastResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SlowResultReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned UndefReg = RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SlowResultPart1Reg = - RI.createVirtualRegister(&DPU::GP64_REGRegClass); - unsigned SlowResultPartReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - unsigned LsbClzReg = RI.createVirtualRegister(&DPU::GP_REGRegClass); - - BuildMI(BB, dl, TII.get(DPU::CLZ_Urrci), FastResultReg) + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register FastResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register SlowResultReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register UndefReg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register SlowResultPart1Reg = MRI.createVirtualRegister(&DPU::GP64_REGRegClass); + Register SlowResultPartReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + Register LsbClzReg = MRI.createVirtualRegister(&DPU::GP_REGRegClass); + + MDNode *MDN = getPostRAFusionMetadata(MF); + + MachineInstrBuilder cpt1 = BuildMI(MBB, DL, TII.get(DPU::CLZ_Urr), 
FastResultReg) .addReg(Op1Reg, 0, DPU::sub_32bit_hi) - .addImm(DPUAsmCondition::Condition::NotMaximum) - .addMBB(endMBB); + .addMetadata(MDN); + + MachineInstrBuilder cpt2 = BuildMI(MBB, DL, TII.get(DPU::JNEQrii)) + .addReg(FastResultReg, 0, DPU::sub_32bit) + .addImm(32) + .addMBB(endMBB) + .addMetadata(MDN); - BuildMI(msbAreZerosMBB, dl, TII.get(DPU::CLZrr), LsbClzReg) + MachineInstrBuilder cpt3 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::CLZrr), LsbClzReg) .addReg(Op1Reg, 0, DPU::sub_32bit); - BuildMI(msbAreZerosMBB, dl, TII.get(DPU::ADDrri), SlowResultPartReg) + MachineInstrBuilder cpt4 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::ADDrri), SlowResultPartReg) .addReg(LsbClzReg) .addImm(32); - BuildMI(msbAreZerosMBB, dl, TII.get(DPU::IMPLICIT_DEF), UndefReg); + MachineInstrBuilder cpt5 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::IMPLICIT_DEF), UndefReg); - BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg) + MachineInstrBuilder cpt6 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::INSERT_SUBREG), SlowResultPart1Reg) .addReg(UndefReg) .addReg(SlowResultPartReg) .addImm(DPU::sub_32bit); - BuildMI(msbAreZerosMBB, dl, TII.get(DPU::INSERT_SUBREG), SlowResultReg) + MachineInstrBuilder cpt7 = BuildMI(msbAreZerosMBB, DL, TII.get(DPU::INSERT_SUBREG), SlowResultReg) .addReg(SlowResultPart1Reg) .addReg(FastResultReg, 0, DPU::sub_32bit_hi) .addImm(DPU::sub_32bit_hi); - BuildMI(*endMBB, endMBB->begin(), dl, TII.get(DPU::PHI), Dest) + MachineInstrBuilder cpt8 = BuildMI(*endMBB, endMBB->begin(), DL, TII.get(DPU::PHI), Dest) .addReg(FastResultReg) - .addMBB(BB) + .addMBB(MBB) .addReg(SlowResultReg) .addMBB(msbAreZerosMBB); + if (MI.getOperand(1).isKill()) { + // cpt1->getOperand(1).setIsKill(); + cpt3->getOperand(1).setIsKill(); + cpt4->getOperand(1).setIsKill(); + + cpt6->getOperand(1).setIsKill(); + cpt6->getOperand(2).setIsKill(); + cpt7->getOperand(1).setIsKill(); + } + MI.eraseFromParent(); // The pseudo instruction is gone now. 
return endMBB; } @@ -3119,34 +3217,34 @@ DPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case DPU::MRAM_LOAD_DOUBLEmr: return EmitMramLoadDoubleWithCustomInserter(MI, BB); case DPU::LSL64rr: - return EmitLsl64RegisterWithCustomInserter(MI, BB); + return emitLsl64RegisterWithCustomInserter(MI, BB); case DPU::LSL64ri: return EmitLsl64ImmediateWithCustomInserter(MI, BB); case DPU::LSR64rr: - return EmitShiftRight64RegisterWithCustomInserter(MI, BB, DPU::LSRrrr, + return emitShiftRight64RegisterWithCustomInserter(MI, BB, DPU::LSRrrr, DPU::LSR_Urrr); case DPU::LSR64ri: return EmitShiftRight64ImmediateWithCustomInserter( MI, BB, DPU::LSRrri, DPU::LSR_Urri, DPU::MOVE_Urr); case DPU::ASR64rr: - return EmitShiftRight64RegisterWithCustomInserter(MI, BB, DPU::ASRrrr, + return emitShiftRight64RegisterWithCustomInserter(MI, BB, DPU::ASRrrr, DPU::ASR_Srrr); case DPU::ASR64ri: return EmitShiftRight64ImmediateWithCustomInserter( MI, BB, DPU::ASRrri, DPU::ASR_Srri, DPU::MOVE_Srr); case DPU::ROL64rr: - return EmitRot64RegisterWithCustomInserter(MI, BB, DPU::LSLrrr, + return emitRot64RegisterWithCustomInserter(MI, BB, DPU::LSLrrr, DPU::LSLrrrci, DPU::LSLXrrr); case DPU::ROR64rr: - return EmitRot64RegisterWithCustomInserter(MI, BB, DPU::LSRrrr, + return emitRot64RegisterWithCustomInserter(MI, BB, DPU::LSRrrr, DPU::LSRrrrci, DPU::LSRXrrr); case DPU::ROL64ri: - return EmitRot64ImmediateWithCustomInserter(MI, BB, DPU::LSLXrri, + return emitRot64ImmediateWithCustomInserter(MI, BB, DPU::LSLXrri, DPU::LSL_ADDrrri); case DPU::ROR64ri: - return EmitRot64ImmediateWithCustomInserter(MI, BB, DPU::LSRXrri, + return emitRot64ImmediateWithCustomInserter(MI, BB, DPU::LSRXrri, DPU::LSR_ADDrrri); case DPU::CLZ64r: - return EmitClz64WithCustomInserter(MI, BB); + return emitClz64WithCustomInserter(MI, BB); } } diff --git a/llvm/lib/Target/DPU/DPUTargetMachine.cpp b/llvm/lib/Target/DPU/DPUTargetMachine.cpp index 5815b161c6ce9..e42cab004d1d3 100644 --- 
a/llvm/lib/Target/DPU/DPUTargetMachine.cpp +++ b/llvm/lib/Target/DPU/DPUTargetMachine.cpp @@ -7,12 +7,13 @@ // //===----------------------------------------------------------------------===// -#include "DPUTargetMachine.h" #include "DPU.h" #include "DPUISelDAGToDAG.h" #include "DPUMacroFusion.h" +#include "DPUTargetMachine.h" #include "DPUTargetTransformInfo.h" #include "MCTargetDesc/DPUMCAsmInfo.h" + #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" @@ -84,6 +85,8 @@ class DPUPassConfig : public TargetPassConfig { bool addInstSelector() override; + void addPostRegAlloc() override; + void addPreEmitPass() override; void addPreEmitPass2() override; }; @@ -103,6 +106,15 @@ bool DPUPassConfig::addInstSelector() { return false; } +void DPUPassConfig::addPostRegAlloc() { + // TODO: add CFGOptimizer + // if (addPass(&TailDuplicateID)) + // printAndVerify("After Post-RegAlloc TailDuplicate"); + + DPUTargetMachine &TM = getDPUTargetMachine(); + addPass(createDPUPostRAFusionPass(TM)); +} + void DPUPassConfig::addPreEmitPass() { DPUTargetMachine &TM = getDPUTargetMachine(); addPass(createDPUMergeComboInstrPass(TM)); diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index ef9f18a2289e9..90675143a8c84 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -87,7 +87,7 @@ static void appendToUsedList(Module &M, StringRef Name, ArrayRef Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext()); for (auto *V : Values) { - Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy); + Constant *C = ConstantExpr::getPointerBitCastOrAddrSpaceCast(V, Int8PtrTy); if (InitAsSet.insert(C).second) Init.push_back(C); }