From 5ca5b930ff16bf6e91abfc978239391594b616af Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 13 Sep 2024 12:13:57 +0100 Subject: [PATCH 1/4] MemCpyOpt: replace an AA query with MSSA query An AA query in processStoreOfLoad misses certain cases, and has been marked as a TODO. Replace it with an MSSA query to increase MemCpyOpt's power, fixing the long-standing TODO. --- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 25 +++++++-------- llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll | 31 +++++++++---------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 1d67773585d59..0b7e58d5b961a 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -647,19 +647,18 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, (EnableMemCpyOptWithoutLibcalls || (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) { MemoryLocation LoadLoc = MemoryLocation::get(LI); - - // We use alias analysis to check if an instruction may store to - // the memory we load from in between the load and the store. If - // such an instruction is found, we try to promote there instead - // of at the store position. - // TODO: Can use MSSA for this. - Instruction *P = SI; - for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) { - if (isModSet(AA->getModRefInfo(&I, LoadLoc))) { - P = &I; - break; - } - } + MemoryUseOrDef *LoadAccess = MSSA->getMemoryAccess(LI), + *StoreAccess = MSSA->getMemoryAccess(SI); + + // We use MSSA to check if an instruction may store to the memory we load + // from in between the load and the store. If such an instruction is found, + // we try to promote there instead of at the store position. + BatchAAResults BAA(*AA); + auto *Clobber = + cast(MSSA->getWalker()->getClobberingMemoryAccess( + StoreAccess, LoadLoc, BAA)); + Instruction *P = + MSSA->dominates(LoadAccess, Clobber) ? Clobber->getMemoryInst() : SI; // If we found an instruction that may write to the loaded memory, // we can try to promote at this position instead of the store diff --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll index 51fad82050939..00cffb9b1c7f1 100644 --- a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll @@ -38,9 +38,8 @@ define void @noaliasdst(ptr %src, ptr noalias %dst) { define void @destroysrc(ptr %src, ptr %dst) { ; CHECK-LABEL: @destroysrc( -; CHECK-NEXT: [[TMP1:%.*]] = load [[S:%.*]], ptr [[SRC:%.*]], align 8 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 0, i64 16, i1 false) -; CHECK-NEXT: store [[S]] [[TMP1]], ptr [[DST:%.*]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC:%.*]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC]], i64 16, i1 false) ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -51,8 +50,8 @@ define void @destroysrc(ptr %src, ptr %dst) { define void @destroynoaliassrc(ptr noalias %src, ptr %dst) { ; CHECK-LABEL: @destroynoaliassrc( -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC]], i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC:%.*]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 0, i64 16, i1 false) ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -63,9 +62,8 @@ define void @destroynoaliassrc(ptr noalias %src, ptr %dst) { define void @copyalias(ptr %src, ptr %dst) { ; CHECK-LABEL: @copyalias( -; CHECK-NEXT: [[TMP1:%.*]] = load [[S:%.*]], ptr [[SRC:%.*]], align 8 -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC]], i64 16, i1 false) -; CHECK-NEXT: store [[S]] [[TMP1]], ptr [[DST]], align 8 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST]], ptr align 8 [[SRC]], i64 16, i1 false) ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -79,9 +77,9 @@ define void @copyalias(ptr %src, ptr %dst) { ; sure we lift the computation as well if needed and possible. define void @addrproducer(ptr %src, ptr %dst) { ; CHECK-LABEL: @addrproducer( +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST:%.*]], i8 undef, i64 16, i1 false) ; CHECK-NEXT: [[DST2:%.*]] = getelementptr [[S:%.*]], ptr [[DST]], i64 1 ; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST:%.*]], i8 undef, i64 16, i1 false) ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -93,11 +91,10 @@ define void @addrproducer(ptr %src, ptr %dst) { define void @aliasaddrproducer(ptr %src, ptr %dst, ptr %dstidptr) { ; CHECK-LABEL: @aliasaddrproducer( -; CHECK-NEXT: [[TMP1:%.*]] = load [[S:%.*]], ptr [[SRC:%.*]], align 8 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST:%.*]], i8 undef, i64 16, i1 false) ; CHECK-NEXT: [[DSTINDEX:%.*]] = load i32, ptr [[DSTIDPTR:%.*]], align 4 -; CHECK-NEXT: [[DST2:%.*]] = getelementptr [[S]], ptr [[DST]], i32 [[DSTINDEX]] -; CHECK-NEXT: store [[S]] [[TMP1]], ptr [[DST2]], align 8 +; CHECK-NEXT: [[DST2:%.*]] = getelementptr [[S:%.*]], ptr [[DST]], i32 [[DSTINDEX]] +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -110,11 +107,11 @@ define void @aliasaddrproducer(ptr %src, ptr %dst, ptr %dstidptr) { define void @noaliasaddrproducer(ptr %src, ptr noalias %dst, ptr noalias %dstidptr) { ; CHECK-LABEL: @noaliasaddrproducer( -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DSTIDPTR:%.*]], align 4 -; CHECK-NEXT: [[DSTINDEX:%.*]] = or i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DSTIDPTR:%.*]], align 4 +; CHECK-NEXT: [[DSTINDEX:%.*]] = or i32 [[TMP1]], 1 ; CHECK-NEXT: [[DST2:%.*]] = getelementptr [[S:%.*]], ptr [[DST:%.*]], i32 [[DSTINDEX]] -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC]], i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC:%.*]], i8 undef, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 undef, i64 16, i1 false) ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -130,7 +127,7 @@ define void @throwing_call(ptr noalias %src, ptr %dst) { ; CHECK-LABEL: @throwing_call( ; CHECK-NEXT: [[TMP1:%.*]] = load [[S:%.*]], ptr [[SRC:%.*]], align 8 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 0, i64 16, i1 false) -; CHECK-NEXT: call void @call() [[ATTR2:#.*]] +; CHECK-NEXT: call void @call() #[[ATTR2:[0-9]+]] ; CHECK-NEXT: store [[S]] [[TMP1]], ptr [[DST:%.*]], align 8 ; CHECK-NEXT: ret void ; From c865654dd39158ba783cae8b869b8b6a24f74b87 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sat, 14 Sep 2024 10:25:56 +0100 Subject: [PATCH 2/4] MemCpyOpt: fix bug --- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 2 +- llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll | 31 ++++++++++--------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 0b7e58d5b961a..bcdbb8d27659c 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -656,7 +656,7 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, BatchAAResults BAA(*AA); auto *Clobber = cast(MSSA->getWalker()->getClobberingMemoryAccess( - StoreAccess, LoadLoc, BAA)); + StoreAccess->getDefiningAccess(), LoadLoc, BAA)); Instruction *P = MSSA->dominates(LoadAccess, Clobber) ? Clobber->getMemoryInst() : SI; diff --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll index 00cffb9b1c7f1..51fad82050939 100644 --- a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll @@ -38,8 +38,9 @@ define void @noaliasdst(ptr %src, ptr noalias %dst) { define void @destroysrc(ptr %src, ptr %dst) { ; CHECK-LABEL: @destroysrc( -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC:%.*]], i8 0, i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC]], i64 16, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = load [[S:%.*]], ptr [[SRC:%.*]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 0, i64 16, i1 false) +; CHECK-NEXT: store [[S]] [[TMP1]], ptr [[DST:%.*]], align 8 ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -50,8 +51,8 @@ define void @destroysrc(ptr %src, ptr %dst) { define void @destroynoaliassrc(ptr noalias %src, ptr %dst) { ; CHECK-LABEL: @destroynoaliassrc( -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC:%.*]], i8 0, i64 16, i1 false) ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -62,8 +63,9 @@ define void @destroynoaliassrc(ptr noalias %src, ptr %dst) { define void @copyalias(ptr %src, ptr %dst) { ; CHECK-LABEL: @copyalias( -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST]], ptr align 8 [[SRC]], i64 16, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = load [[S:%.*]], ptr [[SRC:%.*]], align 8 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST:%.*]], ptr align 8 [[SRC]], i64 16, i1 false) +; CHECK-NEXT: store [[S]] [[TMP1]], ptr [[DST]], align 8 ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -77,9 +79,9 @@ define void @copyalias(ptr %src, ptr %dst) { ; sure we lift the computation as well if needed and possible. define void @addrproducer(ptr %src, ptr %dst) { ; CHECK-LABEL: @addrproducer( -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST:%.*]], i8 undef, i64 16, i1 false) ; CHECK-NEXT: [[DST2:%.*]] = getelementptr [[S:%.*]], ptr [[DST]], i64 1 ; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST:%.*]], i8 undef, i64 16, i1 false) ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -91,10 +93,11 @@ define void @addrproducer(ptr %src, ptr %dst) { define void @aliasaddrproducer(ptr %src, ptr %dst, ptr %dstidptr) { ; CHECK-LABEL: @aliasaddrproducer( +; CHECK-NEXT: [[TMP1:%.*]] = load [[S:%.*]], ptr [[SRC:%.*]], align 8 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST:%.*]], i8 undef, i64 16, i1 false) ; CHECK-NEXT: [[DSTINDEX:%.*]] = load i32, ptr [[DSTIDPTR:%.*]], align 4 -; CHECK-NEXT: [[DST2:%.*]] = getelementptr [[S:%.*]], ptr [[DST]], i32 [[DSTINDEX]] -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) +; CHECK-NEXT: [[DST2:%.*]] = getelementptr [[S]], ptr [[DST]], i32 [[DSTINDEX]] +; CHECK-NEXT: store [[S]] [[TMP1]], ptr [[DST2]], align 8 ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -107,11 +110,11 @@ define void @aliasaddrproducer(ptr %src, ptr %dst, ptr %dstidptr) { define void @noaliasaddrproducer(ptr %src, ptr noalias %dst, ptr noalias %dstidptr) { ; CHECK-LABEL: @noaliasaddrproducer( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DSTIDPTR:%.*]], align 4 -; CHECK-NEXT: [[DSTINDEX:%.*]] = or i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DSTIDPTR:%.*]], align 4 +; CHECK-NEXT: [[DSTINDEX:%.*]] = or i32 [[TMP2]], 1 ; CHECK-NEXT: [[DST2:%.*]] = getelementptr [[S:%.*]], ptr [[DST:%.*]], i32 [[DSTINDEX]] -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC:%.*]], i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 undef, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[SRC]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC:%.*]], i8 undef, i64 16, i1 false) ; CHECK-NEXT: ret void ; %1 = load %S, ptr %src @@ -127,7 +130,7 @@ define void @throwing_call(ptr noalias %src, ptr %dst) { ; CHECK-LABEL: @throwing_call( ; CHECK-NEXT: [[TMP1:%.*]] = load [[S:%.*]], ptr [[SRC:%.*]], align 8 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SRC]], i8 0, i64 16, i1 false) -; CHECK-NEXT: call void @call() #[[ATTR2:[0-9]+]] +; CHECK-NEXT: call void @call() [[ATTR2:#.*]] ; CHECK-NEXT: store [[S]] [[TMP1]], ptr [[DST:%.*]], align 8 ; CHECK-NEXT: ret void ; From 3300903202ff26f47778bef19d3c05f8be81198f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 23 Sep 2024 12:11:15 +0100 Subject: [PATCH 3/4] MemCpyOpt: address review --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index bcdbb8d27659c..2f88b19a8d390 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -638,6 +638,7 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent()) return false; + BatchAAResults BAA(*AA); auto *T = LI->getType(); // Don't introduce calls to memcpy/memmove intrinsics out of thin air if // the corresponding libcalls are not available. @@ -653,12 +654,11 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, // We use MSSA to check if an instruction may store to the memory we load // from in between the load and the store. If such an instruction is found, // we try to promote there instead of at the store position. - BatchAAResults BAA(*AA); - auto *Clobber = - cast(MSSA->getWalker()->getClobberingMemoryAccess( - StoreAccess->getDefiningAccess(), LoadLoc, BAA)); - Instruction *P = - MSSA->dominates(LoadAccess, Clobber) ? Clobber->getMemoryInst() : SI; + auto *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( + StoreAccess->getDefiningAccess(), LoadLoc, BAA); + Instruction *P = MSSA->dominates(LoadAccess, Clobber) + ? cast(Clobber)->getMemoryInst() + : SI; // If we found an instruction that may write to the loaded memory, // we can try to promote at this position instead of the store @@ -706,7 +706,6 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. - BatchAAResults BAA(*AA); auto GetCall = [&]() -> CallInst * { // We defer this expensive clobber walk until the cheap checks // have been done on the source inside performCallSlotOptzn. From db578329a4d800ca00a180667618ee513bd5d6f1 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 23 Sep 2024 14:50:26 +0100 Subject: [PATCH 4/4] MemCpyOpt: add test --- llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll index 51fad82050939..61e349e01ed91 100644 --- a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll @@ -141,4 +141,19 @@ define void @throwing_call(ptr noalias %src, ptr %dst) { ret void } +define void @loop_memoryphi(ptr %a, ptr %b) { +; CHECK-LABEL: @loop_memoryphi( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[B:%.*]], ptr align 8 [[A:%.*]], i64 16, i1 false) +; CHECK-NEXT: br label [[LOOP]] +; + br label %loop + +loop: + %v = load { i64, i64 }, ptr %a + store { i64, i64 } %v, ptr %b + br label %loop +} + declare void @call()