
Commit ca372df

hstk30tru authored and committed

[AArch64] Fix arm neon vstx lane memVT size

The StN lane memVT was set too large, which led alias analysis to wrong conclusions. Fixes #64696

Differential Revision: https://reviews.llvm.org/D158611
(cherry picked from commit db8f6c0)

1 parent 466677b commit ca372df
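
Why the oversized memVT mattered: basic-aa compares an access's claimed size against the size of the underlying object, so a store whose claimed size exceeds the object it writes (here, a 2-byte alloca in the regression test below) can be judged unable to target that object at all, producing a wrong no-alias answer. Below is a minimal sketch of the before/after size arithmetic for st2lane on two <4 x float> vectors; plain C++, and the variable names are illustrative, not LLVM API:

    // Illustrative recomputation of the memVT size for st2lane with two
    // <4 x float> arguments. No LLVM headers; all names are made up.
    #include <cstdio>

    int main() {
      const unsigned NumVecs = 2;   // st2lane stores one lane from each of 2 vectors
      const unsigned VecBits = 128; // a <4 x float> vector is 128 bits
      const unsigned EltBits = 32;  // one f32 lane is 32 bits

      // Old rule: one i64 per 64 bits of every whole vector -> v4i64.
      unsigned OldBits = (NumVecs * VecBits / 64) * 64; // 256 bits
      // New rule: one element per vector actually touched -> v2f32.
      unsigned NewBits = NumVecs * EltBits;             // 64 bits

      printf("old memVT: s%u, new memVT: s%u\n", OldBits, NewBits);
      return 0;
    }

This matches the test update below, where the ST2i32 memory operand shrinks from (store (s256)) to (store (s64)).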

File tree

4 files changed: +185 -17 lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 42 additions & 8 deletions

@@ -13840,17 +13840,31 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_ld4:
   case Intrinsic::aarch64_neon_ld1x2:
   case Intrinsic::aarch64_neon_ld1x3:
-  case Intrinsic::aarch64_neon_ld1x4:
+  case Intrinsic::aarch64_neon_ld1x4: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+    Info.offset = 0;
+    Info.align.reset();
+    // volatile loads with NEON intrinsics not supported
+    Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
   case Intrinsic::aarch64_neon_ld2lane:
   case Intrinsic::aarch64_neon_ld3lane:
   case Intrinsic::aarch64_neon_ld4lane:
   case Intrinsic::aarch64_neon_ld2r:
   case Intrinsic::aarch64_neon_ld3r:
   case Intrinsic::aarch64_neon_ld4r: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
-    // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    // The ldN intrinsics return a struct of vectors of the same type.
+    Type *RetTy = I.getType();
+    auto *StructTy = cast<StructType>(RetTy);
+    unsigned NumElts = StructTy->getNumElements();
+    Type *VecTy = StructTy->getElementType(0);
+    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
     Info.offset = 0;
     Info.align.reset();
@@ -13863,20 +13877,40 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_st4:
   case Intrinsic::aarch64_neon_st1x2:
   case Intrinsic::aarch64_neon_st1x3:
-  case Intrinsic::aarch64_neon_st1x4:
+  case Intrinsic::aarch64_neon_st1x4: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    unsigned NumElts = 0;
+    for (const Value *Arg : I.args()) {
+      Type *ArgTy = Arg->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+    Info.offset = 0;
+    Info.align.reset();
+    // volatile stores with NEON intrinsics not supported
+    Info.flags = MachineMemOperand::MOStore;
+    return true;
+  }
   case Intrinsic::aarch64_neon_st2lane:
   case Intrinsic::aarch64_neon_st3lane:
   case Intrinsic::aarch64_neon_st4lane: {
     Info.opc = ISD::INTRINSIC_VOID;
-    // Conservatively set memVT to the entire set of vectors stored.
     unsigned NumElts = 0;
+    // All vector arguments have the same type.
+    Type *VecTy = I.getArgOperand(0)->getType();
+    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+
     for (const Value *Arg : I.args()) {
       Type *ArgTy = Arg->getType();
       if (!ArgTy->isVectorTy())
         break;
-      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+      NumElts += 1;
     }
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
     Info.offset = 0;
     Info.align.reset();
Lines changed: 34 additions & 0 deletions

; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon -O2 | FileCheck %s

; The st2 must come before the two ldrb loads.
; Previously one ldrb was hoisted above the st2: the conservative (oversized)
; memVT set for st2lane misled basic-aa into a wrong no-alias result.

define dso_local i32 @test_vst2_lane_u8([2 x <8 x i8>] %vectors.coerce) local_unnamed_addr {
; CHECK-LABEL: test_vst2_lane_u8:
; CHECK: st2 { v[[V1:[0-9]+]].b, v[[V2:[0-9]+]].b }[6], [x8]
; CHECK-NEXT: umov w[[W1:[0-9]+]], v[[V12:[0-9]+]].b[6]
; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #12]
; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #13]
entry:
  %temp = alloca [2 x i8], align 4
  %vectors.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 0
  %vectors.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 1
  call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %temp) #4
  call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> %vectors.coerce.fca.0.extract, <8 x i8> %vectors.coerce.fca.1.extract, i64 6, ptr nonnull %temp)
  %0 = load i8, ptr %temp, align 4
  %vget_lane = extractelement <8 x i8> %vectors.coerce.fca.0.extract, i64 6
  %cmp8.not = icmp ne i8 %0, %vget_lane
  %arrayidx3.1 = getelementptr inbounds [2 x i8], ptr %temp, i64 0, i64 1
  %1 = load i8, ptr %arrayidx3.1, align 1
  %vget_lane.1 = extractelement <8 x i8> %vectors.coerce.fca.1.extract, i64 6
  %cmp8.not.1 = icmp ne i8 %1, %vget_lane.1
  %or.cond = select i1 %cmp8.not, i1 true, i1 %cmp8.not.1
  %cmp.lcssa = zext i1 %or.cond to i32
  call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %temp) #4
  ret i32 %cmp.lcssa
}

declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
declare void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8>, <8 x i8>, i64, ptr nocapture) #2
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
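
A C-level reproducer in the spirit of the issue might look like the following. This is a hypothetical reconstruction, not the test case from #64696; it assumes an AArch64 target and arm_neon.h:

    // Hypothetical reproducer: the result must observe the st2 performed by
    // vst2_lane_u8. With the oversized memVT, basic-aa concluded the byte
    // loads could not alias the store, and one ldrb was scheduled before it.
    #include <arm_neon.h>
    #include <stdint.h>

    int test_vst2_lane_u8(uint8x8x2_t vectors) {
      uint8_t temp[2];
      vst2_lane_u8(temp, vectors, 6); // st2 { vA.b, vB.b }[6], [temp]
      return temp[0] != vget_lane_u8(vectors.val[0], 6) ||
             temp[1] != vget_lane_u8(vectors.val[1], 6);
    }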
Lines changed: 106 additions & 0 deletions

; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=instruction-select < %s | FileCheck %s

%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float*)
declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float*)
declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float*)

declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*)
declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*)
declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*)

declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*)
declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*)
declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*)

declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*)
declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*)
declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)

define %struct.__neon_float32x2x2_t @test_ld2(float* %addr) {
; CHECK-LABEL: name: test_ld2
; CHECK: LD2Twov2s {{.*}} :: (load (s128) {{.*}})
  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %addr)
  ret %struct.__neon_float32x2x2_t %val
}

define %struct.__neon_float32x2x3_t @test_ld3(float* %addr) {
; CHECK-LABEL: name: test_ld3
; CHECK: LD3Threev2s {{.*}} :: (load (s192) {{.*}})
  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %addr)
  ret %struct.__neon_float32x2x3_t %val
}

define %struct.__neon_float32x2x4_t @test_ld4(float* %addr) {
; CHECK-LABEL: name: test_ld4
; CHECK: LD4Fourv2s {{.*}} :: (load (s256) {{.*}})
  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %addr)
  ret %struct.__neon_float32x2x4_t %val
}

define %struct.__neon_float32x2x2_t @test_ld1x2(float* %addr) {
; CHECK-LABEL: name: test_ld1x2
; CHECK: LD1Twov2s {{.*}} :: (load (s128) {{.*}})
  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr)
  ret %struct.__neon_float32x2x2_t %val
}

define %struct.__neon_float32x2x3_t @test_ld1x3(float* %addr) {
; CHECK-LABEL: name: test_ld1x3
; CHECK: LD1Threev2s {{.*}} :: (load (s192) {{.*}})
  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr)
  ret %struct.__neon_float32x2x3_t %val
}

define %struct.__neon_float32x2x4_t @test_ld1x4(float* %addr) {
; CHECK-LABEL: name: test_ld1x4
; CHECK: LD1Fourv2s {{.*}} :: (load (s256) {{.*}})
  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr)
  ret %struct.__neon_float32x2x4_t %val
}

define %struct.__neon_float32x2x2_t @test_ld2r(float* %addr) {
; CHECK-LABEL: name: test_ld2r
; CHECK: LD2Rv2s {{.*}} :: (load (s64) {{.*}})
  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %addr)
  ret %struct.__neon_float32x2x2_t %val
}

define %struct.__neon_float32x2x3_t @test_ld3r(float* %addr) {
; CHECK-LABEL: name: test_ld3r
; CHECK: LD3Rv2s {{.*}} :: (load (s96) {{.*}})
  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %addr)
  ret %struct.__neon_float32x2x3_t %val
}

define %struct.__neon_float32x2x4_t @test_ld4r(float* %addr) {
; CHECK-LABEL: name: test_ld4r
; CHECK: LD4Rv2s {{.*}} :: (load (s128) {{.*}})
  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %addr)
  ret %struct.__neon_float32x2x4_t %val
}

define %struct.__neon_float32x2x2_t @test_ld2lane(<2 x float> %a, <2 x float> %b, float* %addr) {
; CHECK-LABEL: name: test_ld2lane
; CHECK: {{.*}} LD2i32 {{.*}}
  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, i64 1, float* %addr)
  ret %struct.__neon_float32x2x2_t %val
}

define %struct.__neon_float32x2x3_t @test_ld3lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, float* %addr) {
; CHECK-LABEL: name: test_ld3lane
; CHECK: {{.*}} LD3i32 {{.*}}
  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, i64 1, float* %addr)
  ret %struct.__neon_float32x2x3_t %val
}

define %struct.__neon_float32x2x4_t @test_ld4lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, float* %addr) {
; CHECK-LABEL: name: test_ld4lane
; CHECK: {{.*}} LD4i32 {{.*}}
  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, i64 1, float* %addr)
  ret %struct.__neon_float32x2x4_t %val
}
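
The sizes the CHECK lines above expect follow directly from the new rule: ldN and ld1xN still read N whole <2 x float> vectors (N x 64 bits), while ldNr and ldNlane now claim only one f32 element per vector (N x 32 bits). A tiny illustrative check, plain C++ with made-up names:

    // Prints the memory sizes expected by the CHECK lines above.
    #include <cstdio>

    int main() {
      const unsigned VecBits = 64; // <2 x float>
      const unsigned EltBits = 32; // f32
      for (unsigned N = 2; N <= 4; ++N)
        printf("ld%u/ld1x%u -> s%u, ld%ur -> s%u\n",
               N, N, N * VecBits, N, N * EltBits);
      return 0;
    }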

llvm/test/CodeGen/AArch64/multi-vector-store-size.ll

Lines changed: 3 additions & 9 deletions

@@ -23,8 +23,6 @@ define void @addstx(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) {
   %cr = fadd <4 x float> %cl, %dl
   %dr = fadd <4 x float> %dl, %al

-; The sizes below are conservative. AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
   tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res)
 ; CHECK: ST2Twov4s {{.*}} :: (store (s256) {{.*}})
   tail call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res)
@@ -46,8 +44,6 @@ define void @addst1x(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) {
   %cr = fadd <4 x float> %cl, %dl
   %dr = fadd <4 x float> %dl, %al

-; The sizes below are conservative. AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
   tail call void @llvm.aarch64.neon.st1x2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res)
 ; CHECK: ST1Twov4s {{.*}} :: (store (s256) {{.*}})
   tail call void @llvm.aarch64.neon.st1x3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res)
@@ -69,14 +65,12 @@ define void @addstxlane(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) {
   %cr = fadd <4 x float> %cl, %dl
   %dr = fadd <4 x float> %dl, %al

-; The sizes below are conservative. AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
   tail call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, i64 1, ptr %res)
-; CHECK: ST2i32 {{.*}} :: (store (s256) {{.*}})
+; CHECK: ST2i32 {{.*}} :: (store (s64) {{.*}})
   tail call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, ptr %res)
-; CHECK: ST3i32 {{.*}} :: (store (s384) {{.*}})
+; CHECK: ST3i32 {{.*}} :: (store (s96) {{.*}})
   tail call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, ptr %res)
-; CHECK: ST4i32 {{.*}} :: (store (s512) {{.*}})
+; CHECK: ST4i32 {{.*}} :: (store (s128) {{.*}})

   ret void
 }
