diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index d2814f07530d8..b38db412f786a 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -262,6 +262,9 @@ void llvm::createMemCpyLoopUnknownSize(
     assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) &&
            "Store size is expected to match type size");
 
+    Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize));
+    Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize));
+
     Value *RuntimeResidual = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
                                                      CILoopOpSize, LoopOpSize);
     Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
@@ -303,7 +306,7 @@ void llvm::createMemCpyLoopUnknownSize(
     Value *SrcGEP =
         ResBuilder.CreateInBoundsGEP(ResLoopOpType, SrcAddr, FullOffset);
     LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP,
-                                                  PartSrcAlign, SrcIsVolatile);
+                                                  ResSrcAlign, SrcIsVolatile);
     if (!CanOverlap) {
       // Set alias scope for loads.
       Load->setMetadata(LLVMContext::MD_alias_scope,
@@ -311,8 +314,8 @@ void llvm::createMemCpyLoopUnknownSize(
     }
     Value *DstGEP =
         ResBuilder.CreateInBoundsGEP(ResLoopOpType, DstAddr, FullOffset);
-    StoreInst *Store = ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign,
-                                                     DstIsVolatile);
+    StoreInst *Store =
+        ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
     if (!CanOverlap) {
       // Indicate that stores don't overlap loads.
       Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index d53db69f9f2e0..5cb57ee112b3a 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -930,9 +930,9 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
 ; OPT-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
-; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 4
+; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
-; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
 ; OPT-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -966,9 +966,9 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
 ; OPT-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
-; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 2
+; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
-; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 2
+; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
 ; OPT-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1038,9 +1038,9 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
 ; OPT-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
-; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 4
+; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
-; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 4
+; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
 ; OPT-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1074,9 +1074,9 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
 ; OPT-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
-; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 2
+; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
-; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 2
+; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
 ; OPT-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1146,9 +1146,9 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
 ; OPT-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[TMP10]]
-; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 4
+; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
-; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 4
+; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
 ; OPT-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1182,9 +1182,9 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
 ; OPT-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
-; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 4
+; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[TMP10]]
-; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
 ; OPT-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
 ; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
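
Note on the alignment computation (commentary, not part of the patch): the residual
loop copies the tail with ResLoopOpSize-wide accesses (1 byte in the non-atomic case),
starting at a runtime offset that is only known to be a multiple of the main loop's op
size, so it may only claim the alignment that survives stepping in ResLoopOpSize
increments. llvm::commonAlignment(A, Offset) returns the greatest power of two dividing
both, which is why the residual accesses in the updated tests drop from align 4 or
align 2 to align 1. A minimal standalone sketch of that arithmetic, using a local
stand-in for llvm::commonAlignment instead of the LLVM headers:

#include <cassert>
#include <cstdint>

// Local stand-in mirroring the semantics of llvm::commonAlignment(Align, uint64_t):
// the greatest power of two dividing both the alignment and the offset
// (equivalently, the lowest set bit of A | Offset; an offset of 0 keeps A).
static uint64_t commonAlignment(uint64_t A, uint64_t Offset) {
  uint64_t Bits = A | Offset;
  return Bits & (~Bits + 1); // isolate the lowest set bit
}

int main() {
  // Main loop: 4-byte ops, so PartSrcAlign/PartDstAlign are 4; the byte-wise
  // residual loop (ResLoopOpSize == 1) can only prove 1-byte alignment,
  // matching the align 4 -> align 1 and align 2 -> align 1 test updates.
  assert(commonAlignment(4, 1) == 1);
  assert(commonAlignment(2, 1) == 1);
  // For atomic-element expansion with 4-byte elements, align 4 survives.
  assert(commonAlignment(4, 4) == 4);
  return 0;
}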