diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp index acfd3407681a7..e5fbcca1e7d16 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp @@ -108,28 +108,23 @@ void addInstToMergeableList( if (IIList.front()->getType() != II->getType()) continue; - // Check DMask. - Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex); - Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex); - if (DMaskList != DMask) - continue; - - // Check VAddr (except FragId). - int I = ImageDimIntr->VAddrStart; - for (; I < ImageDimIntr->VAddrEnd - 1; ++I) { - if (IIList.front()->getArgOperand(I) != II->getArgOperand(I)) - break; + // Check all arguments (DMask, VAddr, RSrc etc). + bool AllEqual = true; + assert(IIList.front()->arg_size() == II->arg_size()); + for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) { + Value *ArgList = IIList.front()->getArgOperand(I); + Value *Arg = II->getArgOperand(I); + if (I == ImageDimIntr->VAddrEnd - 1) { + // Check FragId group. + auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I)); + auto FragId = cast<ConstantInt>(II->getArgOperand(I)); + AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4); + } else { + // Check all arguments except FragId. + AllEqual = ArgList == Arg; + } } - - if (I != ImageDimIntr->VAddrEnd - 1) - continue; - - // Check FragId group. - const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1; - Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex); - auto IIListFragId = cast<ConstantInt>(FragIdList); - auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex)); - if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4)) + if (!AllEqual) continue; // Add to the list. 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll index 853ca53767be8..5ffdbb0f8c5b0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll @@ -1184,6 +1184,47 @@ merge: ret [4 x float] %i25 } +define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(<8 x i32> inreg %rsrc1, <8 x i32> inreg %rsrc2, i32 %s, i32 %t) { +; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc( +; NO-MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] { +; NO-MSAA-NEXT: main_body: +; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0) +; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC1]], i32 0, i32 0) +; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0) +; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC2]], i32 0, i32 0) +; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0 +; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1 +; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2 +; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3 +; NO-MSAA-NEXT: ret [4 x float] [[I7]] +; +; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc( +; MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] { +; MSAA-NEXT: main_body: +; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> 
@llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0) +; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0 +; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1 +; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0) +; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP1]], i64 0 +; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP1]], i64 1 +; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0 +; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1 +; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2 +; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3 +; MSAA-NEXT: ret [4 x float] [[I7]] +; +main_body: + %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc1, i32 0, i32 0) + %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc1, i32 0, i32 0) + %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc2, i32 0, i32 0) + %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc2, i32 0, i32 0) + %i4 = insertvalue [4 x float] undef, float %i, 0 + %i5 = insertvalue [4 x float] %i4, float %i1, 1 + %i6 = insertvalue [4 x float] %i5, float %i2, 2 + %i7 = insertvalue [4 x float] %i6, float %i3, 3 + ret [4 x float] %i7 +} + declare float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0 declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0