diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 048a6a49e4cb9..27fa0b43d74f6 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 #include "llvm/Transforms/Utils/LowerVectorIntrinsics.h"
 
@@ -233,6 +234,60 @@ static bool canEmitLibcall(const TargetMachine *TM, Function *F,
   return TLI->getLibcallName(LC) != nullptr;
 }
 
+// Return a value appropriate for use with the memset_pattern16 libcall, if
+// possible and if we know how. (Adapted from the equivalent helper in
+// LoopIdiomRecognize.)
+static Constant *getMemSetPattern16Value(MemSetPatternInst *Inst,
+                                         const TargetLibraryInfo &TLI) {
+  // TODO: This could check for UndefValue because it can be merged into any
+  // other valid pattern.
+
+  // Don't emit libcalls if a non-default address space is being used.
+  if (Inst->getRawDest()->getType()->getPointerAddressSpace() != 0)
+    return nullptr;
+
+  Value *V = Inst->getValue();
+  Type *VTy = V->getType();
+  const DataLayout &DL = Inst->getDataLayout();
+  Module *M = Inst->getModule();
+
+  if (!isLibFuncEmittable(M, &TLI, LibFunc_memset_pattern16))
+    return nullptr;
+
+  // If the value isn't a constant, we can't promote it to being in a constant
+  // array. We could theoretically do a store to an alloca or something, but
+  // that doesn't seem worthwhile.
+  Constant *C = dyn_cast<Constant>(V);
+  if (!C || isa<ConstantExpr>(C))
+    return nullptr;
+
+  // Only handle simple values that are a power of two bytes in size.
+  uint64_t Size = DL.getTypeSizeInBits(VTy);
+  if (!DL.typeSizeEqualsStoreSize(VTy) || !isPowerOf2_64(Size))
+    return nullptr;
+
+  // Don't care enough about darwin/ppc to implement this.
+  if (DL.isBigEndian())
+    return nullptr;
+
+  // Convert to size in bytes.
+  Size /= 8;
+
+  // TODO: If C is larger than 16 bytes, we can try slicing it in half to see
+  // if the top and bottom are the same (e.g. for vectors and large integers).
+  if (Size > 16)
+    return nullptr;
+
+  // If the constant is exactly 16 bytes, just use it.
+  if (Size == 16)
+    return C;
+
+  // Otherwise, we'll use an array of the constants.
+  uint64_t ArraySize = 16 / Size;
+  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
+}
+
 // TODO: Handle atomic memcpy and memcpy.inline
 // TODO: Pass ScalarEvolution
 bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
@@ -323,7 +378,56 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
     }
     case Intrinsic::experimental_memset_pattern: {
      auto *Memset = cast<MemSetPatternInst>(Inst);
-      expandMemSetPatternAsLoop(Memset);
+      const TargetLibraryInfo &TLI = LookupTLI(*Memset->getFunction());
+      Constant *PatternValue = getMemSetPattern16Value(Memset, TLI);
+      if (!PatternValue) {
+        // If it isn't possible to emit a memset_pattern16 libcall, expand to
+        // a loop instead.
+        expandMemSetPatternAsLoop(Memset);
+        Changed = true;
+        Memset->eraseFromParent();
+        break;
+      }
+      // FIXME: There is currently no profitability calculation for emitting
+      // the libcall vs expanding the memset.pattern directly.
+      IRBuilder<> Builder(Inst);
+      Module *M = Memset->getModule();
+      const DataLayout &DL = Memset->getDataLayout();
+
+      StringRef FuncName = "memset_pattern16";
+      FunctionCallee MSP = getOrInsertLibFunc(
+          M, TLI, LibFunc_memset_pattern16, Builder.getVoidTy(),
+          Memset->getRawDest()->getType(), Builder.getPtrTy(),
+          Memset->getLength()->getType());
+      inferNonMandatoryLibFuncAttrs(M, FuncName, TLI);
+
+      // Otherwise we should form a memset_pattern16. PatternValue is known
+      // to be a constant array of 16 bytes. Put the value into a mergeable
+      // global.
+      assert(Memset->getRawDest()->getType()->getPointerAddressSpace() == 0 &&
+             "Should have skipped if non-zero AS");
+      GlobalVariable *GV = new GlobalVariable(
+          *M, PatternValue->getType(), /*isConstant=*/true,
+          GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern");
+      GV->setUnnamedAddr(
+          GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+      // TODO: Consider relaxing alignment requirement.
+      GV->setAlignment(Align(16));
+      Value *PatternPtr = GV;
+      Value *NumBytes = Builder.CreateMul(
+          Builder.getInt64(DL.getTypeSizeInBits(Memset->getValue()->getType()) /
+                           8),
+          Memset->getLength());
+      CallInst *MemsetPattern16Call =
+          Builder.CreateCall(MSP, {Memset->getRawDest(), PatternPtr, NumBytes});
+      MemsetPattern16Call->setAAMetadata(Memset->getAAMetadata());
+      // Preserve any call site attributes on the destination pointer
+      // argument (e.g. alignment).
+      AttrBuilder ArgAttrs(Memset->getContext(),
+                           Memset->getAttributes().getParamAttrs(0));
+      MemsetPattern16Call->setAttributes(
+          MemsetPattern16Call->getAttributes().addParamAttributes(
+              Memset->getContext(), 0, ArgAttrs));
       Changed = true;
       Memset->eraseFromParent();
       break;
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
new file mode 100644
index 0000000000000..7cfdcb8578809
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -mtriple=x86_64-apple-darwin10.0.0 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+
+;.
+; CHECK: @.memset_pattern = private unnamed_addr constant [2 x i64] [i64 -6148895925951734307, i64 -6148895925951734307], align 16
+; CHECK: @.memset_pattern.1 = private unnamed_addr constant [2 x i64] [i64 4614256656552045848, i64 4614256656552045848], align 16
+; CHECK: @.memset_pattern.2 = private unnamed_addr constant [8 x i16] [i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555], align 16
+; CHECK: @.memset_pattern.3 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
+; CHECK: @.memset_pattern.4 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
+; CHECK: @.memset_pattern.5 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
+; CHECK: @.memset_pattern.6 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
+;.
+define void @memset_pattern_i128_1_dynvalue(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1_dynvalue(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK: [[LOADSTORELOOP]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK: [[SPLIT]]:
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.3, i64 16)
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_1_nz_as(ptr addrspace(1) %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1_nz_as(
+; CHECK-SAME: ptr addrspace(1) [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK: [[LOADSTORELOOP]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr addrspace(1) [[A]], i64 [[TMP1]]
+; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr addrspace(1) [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK: [[SPLIT]]:
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr addrspace(1) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_1_align_attr(ptr align(16) %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1_align_attr(
+; CHECK-SAME: ptr align 16 [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @memset_pattern16(ptr align 16 [[A]], ptr @.memset_pattern.4, i64 16)
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr align(16) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_16(ptr %a) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_16(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.5, i64 256)
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 16, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_x(ptr %a, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_x(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]]
+; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.6, i64 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %x, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_x_nonzero_as(ptr addrspace(10) %a, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_x_nonzero_as(
+; CHECK-SAME: ptr addrspace(10) [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK: [[LOADSTORELOOP]]:
+; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i128, ptr addrspace(10) [[A]], i64 [[TMP2]]
+; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr addrspace(10) [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4]] = add i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK: [[SPLIT]]:
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr addrspace(10) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %x, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i16_x(ptr %a, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i16_x(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 2, [[X]]
+; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.2, i64 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i16 u0xabcd, i64 %x, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i64_x(ptr %a, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i64_x(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 8, [[X]]
+; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern, i64 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0xaaaabbbbccccdddd, i64 %x, i1 false)
+  ret void
+}
+
+; Demonstrate that TBAA metadata is preserved.
+define void @memset_pattern_i64_128_tbaa(ptr %a) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i64_128_tbaa(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.1, i64 1024), !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0x400921fb54442d18, i64 128, i1 false), !tbaa !5
+  ret void
+}
+
+!5 = !{!6, !6, i64 0}
+!6 = !{!"double", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C++ TBAA"}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nofree nounwind willreturn memory(argmem: readwrite) }
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"double", [[META2:![0-9]+]], i64 0}
+; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
+;.