llvm · asb · Jan 30, 2025 · Dec 18, 2024 · Jan 13, 2025 · Jan 15, 2025
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 #include "llvm/Transforms/Utils/LowerVectorIntrinsics.h"
 
@@ -233,6 +234,60 @@ static bool canEmitLibcall(const TargetMachine *TM, Function *F,
   return TLI->getLibcallName(LC) != nullptr;
 }
 
+// Return a 16-byte constant suitable as the pattern operand of a
+// memset_pattern16 libcall replacing Inst, or nullptr if the libcall cannot
+// be used. (Adapted from the equivalent helper in LoopIdiomRecognize.)
+static Constant *getMemSetPattern16Value(MemSetPatternInst *Inst,
+                                         const TargetLibraryInfo &TLI) {
+  // TODO: This could check for UndefValue because it can be merged into any
+  // other valid pattern.
+
+  // Don't emit libcalls if a non-default address space is being used.
+  if (Inst->getRawDest()->getType()->getPointerAddressSpace() != 0)
+    return nullptr;
+
+  Value *V = Inst->getValue();
+  Type *VTy = V->getType();
+  const DataLayout &DL = Inst->getDataLayout();
+  Module *M = Inst->getModule();
+
+  if (!isLibFuncEmittable(M, &TLI, LibFunc_memset_pattern16))
+    return nullptr;
+
+  // If the value isn't a constant, we can't promote it to being in a constant
+  // array. We could theoretically do a store to an alloca or something, but
+  // that doesn't seem worthwhile. Constant expressions are rejected as well.
+  Constant *C = dyn_cast<Constant>(V);
+  if (!C || isa<ConstantExpr>(C))
+    return nullptr;
+
+  // Only handle simple values that are a power of two bytes in size.
+  uint64_t Size = DL.getTypeSizeInBits(VTy);
+  if (!DL.typeSizeEqualsStoreSize(VTy) || !isPowerOf2_64(Size))
+    return nullptr;
+
+  // Don't care enough about darwin/ppc to implement this.
+  if (DL.isBigEndian())
+    return nullptr;
+
+  // Convert to size in bytes.
+  Size /= 8;
+
+  // TODO: If C is larger than 16-bytes, we can try slicing it in half to see
+  // if the top and bottom are the same (e.g. for vectors and large integers).
+  if (Size > 16)
+    return nullptr;
+
+  // If the constant is exactly 16 bytes, just use it.
+  if (Size == 16)
+    return C;
+
+  // Otherwise, repeat the constant to fill a 16-byte array.
+  uint64_t ArraySize = 16 / Size;
+  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
+}
+
 // TODO: Handle atomic memcpy and memcpy.inline
 // TODO: Pass ScalarEvolution
 bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
@@ -323,7 +378,56 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
     }
     case Intrinsic::experimental_memset_pattern: {
       auto *Memset = cast<MemSetPatternInst>(Inst);
-      expandMemSetPatternAsLoop(Memset);
+      const TargetLibraryInfo &TLI = LookupTLI(*Memset->getFunction());
+      Constant *PatternValue = getMemSetPattern16Value(Memset, TLI);
+      if (!PatternValue) {
+        // If it isn't possible to emit a memset_pattern16 libcall, expand to
+        // a loop instead.
+        expandMemSetPatternAsLoop(Memset);
+        Changed = true;
+        Memset->eraseFromParent();
+        break;
+      }
+      // FIXME: There is currently no profitability calculation for emitting
+      // the libcall vs expanding the memset.pattern directly.
+      IRBuilder<> Builder(Inst);
+      Module *M = Memset->getModule();
+      const DataLayout &DL = Memset->getDataLayout();
+
+      StringRef FuncName = "memset_pattern16";
+      FunctionCallee MSP = getOrInsertLibFunc(
+          M, TLI, LibFunc_memset_pattern16, Builder.getVoidTy(),
+          Memset->getRawDest()->getType(), Builder.getPtrTy(),
+          Memset->getLength()->getType());
+      inferNonMandatoryLibFuncAttrs(M, FuncName, TLI);
+
+      // Otherwise we should form a memset_pattern16.  PatternValue is known
+      // to be a 16-byte constant. Put the value into a mergeable
+      // global.
+      assert(Memset->getRawDest()->getType()->getPointerAddressSpace() == 0 &&
+             "Should have skipped if non-zero AS");
+      GlobalVariable *GV = new GlobalVariable(
+          *M, PatternValue->getType(), /*isConstant=*/true,
+          GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern");
+      GV->setUnnamedAddr(
+          GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+      // TODO: Consider relaxing alignment requirement.
+      GV->setAlignment(Align(16));
+      Value *PatternPtr = GV;
+      Value *NumBytes = Builder.CreateMul(
+          Builder.getInt64(DL.getTypeSizeInBits(Memset->getValue()->getType()) /
+                           8),
+          Memset->getLength());
+      CallInst *MemsetPattern16Call =
+          Builder.CreateCall(MSP, {Memset->getRawDest(), PatternPtr, NumBytes});
+      MemsetPattern16Call->setAAMetadata(Memset->getAAMetadata());
+      // Preserve any call site attributes on the destination pointer
+      // argument (e.g. alignment).
+      AttrBuilder ArgAttrs(Memset->getContext(),
+                           Memset->getAttributes().getParamAttrs(0));
+      MemsetPattern16Call->setAttributes(
+          MemsetPattern16Call->getAttributes().addParamAttributes(
+              Memset->getContext(), 0, ArgAttrs));
       Changed = true;
       Memset->eraseFromParent();
       break;

diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -mtriple=x86_64-apple-darwin10.0.0 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+
+;.
+; CHECK: @.memset_pattern = private unnamed_addr constant [2 x i64] [i64 -6148895925951734307, i64 -6148895925951734307], align 16
+; CHECK: @.memset_pattern.1 = private unnamed_addr constant [2 x i64] [i64 4614256656552045848, i64 4614256656552045848], align 16
+; CHECK: @.memset_pattern.2 = private unnamed_addr constant [8 x i16] [i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555], align 16
+; CHECK: @.memset_pattern.3 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
+; CHECK: @.memset_pattern.4 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
+; CHECK: @.memset_pattern.5 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
+; CHECK: @.memset_pattern.6 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
+;.
+define void @memset_pattern_i128_1_dynvalue(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1_dynvalue(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1(
+; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.3, i64 16)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_1_nz_as(ptr addrspace(1) %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1_nz_as(
+; CHECK-SAME: ptr addrspace(1) [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr addrspace(1) [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    store i128 -113427455635030943652277463699152839203, ptr addrspace(1) [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr addrspace(1) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_1_align_attr(ptr align(16) %a, i128 %value) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_1_align_attr(
+; CHECK-SAME: ptr align 16 [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @memset_pattern16(ptr align 16 [[A]], ptr @.memset_pattern.4, i64 16)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr align(16) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_16(ptr %a) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_16(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.5, i64 256)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 16, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_x(ptr %a, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_x(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 16, [[X]]
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.6, i64 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %x, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_x_nonzero_as(ptr addrspace(10) %a, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i128_x_nonzero_as(
+; CHECK-SAME: ptr addrspace(10) [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i128, ptr addrspace(10) [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    store i128 -113427455635030943652277463699152839203, ptr addrspace(10) [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP4]] = add i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr addrspace(10) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %x, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i16_x(ptr %a, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i16_x(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 2, [[X]]
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.2, i64 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i16 u0xabcd, i64 %x, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i64_x(ptr %a, i64 %x) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i64_x(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 8, [[X]]
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern, i64 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0xaaaabbbbccccdddd, i64 %x, i1 false)
+  ret void
+}
+
+; Demonstrate that TBAA metadata is preserved.
+define void @memset_pattern_i64_128_tbaa(ptr %a) nounwind {
+; CHECK-LABEL: define void @memset_pattern_i64_128_tbaa(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.1, i64 1024), !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0x400921fb54442d18, i64 128, i1 false), !tbaa !5
+  ret void
+}
+
+!5 = !{!6, !6, i64 0}
+!6 = !{!"double", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C++ TBAA"}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nofree nounwind willreturn memory(argmem: readwrite) }
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"double", [[META2:![0-9]+]], i64 0}
+; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
+;.