Commit 4cb110a

[RFC] IR: Support atomicrmw FP ops with vector types (llvm#86796)
Allow using atomicrmw fadd, fsub, fmin, and fmax with fixed vectors of floating-point type. AMDGPU supports atomic fadd for <2 x half> and <2 x bfloat> on some targets and address spaces.

Note this only adds the proper floating-point operations; xchg with a floating-point vector type is still not supported. cmpxchg still only supports integers, so the compare-exchange loop expansion inserts bitcasts.

I have support for fp-vector xchg and for vectors of int/ptr implemented separately, but I don't have an immediate need for those beyond feature consistency.
1 parent bd589f5 commit 4cb110a
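
For reference, a minimal sketch of the IR that becomes valid with this change (function and value names here are illustrative, mirroring the new assembler test below):

define void @fp_vector_rmw_example(ptr %p, <2 x half> %x) {
  %old.fadd = atomicrmw fadd ptr %p, <2 x half> %x seq_cst
  %old.fsub = atomicrmw fsub ptr %p, <2 x half> %x seq_cst
  %old.fmax = atomicrmw fmax ptr %p, <2 x half> %x seq_cst
  %old.fmin = atomicrmw fmin ptr %p, <2 x half> %x seq_cst
  ret void
}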

File tree

11 files changed: +1510 -11 lines

llvm/docs/LangRef.rst

Lines changed: 6 additions & 5 deletions
@@ -11112,11 +11112,12 @@ For most of these operations, the type of '<value>' must be an integer
 type whose bit width is a power of two greater than or equal to eight
 and less than or equal to a target-specific size limit. For xchg, this
 may also be a floating point or a pointer type with the same size constraints
-as integers. For fadd/fsub/fmax/fmin, this must be a floating point type. The
-type of the '``<pointer>``' operand must be a pointer to that type. If
-the ``atomicrmw`` is marked as ``volatile``, then the optimizer is not
-allowed to modify the number or order of execution of this
-``atomicrmw`` with other :ref:`volatile operations <volatile>`.
+as integers. For fadd/fsub/fmax/fmin, this must be a floating-point
+or fixed vector of floating-point type. The type of the '``<pointer>``'
+operand must be a pointer to that type. If the ``atomicrmw`` is marked
+as ``volatile``, then the optimizer is not allowed to modify the
+number or order of execution of this ``atomicrmw`` with other
+:ref:`volatile operations <volatile>`.
 
 Note: if the alignment is not greater or equal to the size of the `<value>`
 type, the atomic operation is likely to require a lock and have poor
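
To make the boundaries of the new LangRef wording concrete, a small hedged example (names invented; the rejected forms correspond to the new assembler error tests added further down): an fp operation on a fixed floating-point vector is accepted, while xchg on a floating-point vector and any scalable vector operand are still rejected.

define <2 x half> @type_rules_example(ptr %p, <2 x half> %x) {
  ; Accepted: fadd on a fixed vector of floating-point type.
  %old = atomicrmw fadd ptr %p, <2 x half> %x seq_cst
  ; Still rejected (shown commented out):
  ;   atomicrmw xchg ptr %p, <2 x half> %x seq_cst            ; fp-vector xchg unsupported
  ;   atomicrmw fadd ptr %p, <vscale x 2 x half> %sv seq_cst  ; scalable operands unsupported
  ret <2 x half> %old
}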

llvm/lib/AsmParser/LLParser.cpp

Lines changed: 3 additions & 1 deletion
@@ -8240,6 +8240,8 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
     return tokError("atomicrmw cannot be unordered");
   if (!Ptr->getType()->isPointerTy())
     return error(PtrLoc, "atomicrmw operand must be a pointer");
+  if (Val->getType()->isScalableTy())
+    return error(ValLoc, "atomicrmw operand may not be scalable");
 
   if (Operation == AtomicRMWInst::Xchg) {
     if (!Val->getType()->isIntegerTy() &&
@@ -8251,7 +8253,7 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
                            " operand must be an integer, floating point, or pointer type");
     }
   } else if (IsFP) {
-    if (!Val->getType()->isFloatingPointTy()) {
+    if (!Val->getType()->isFPOrFPVectorTy()) {
       return error(ValLoc, "atomicrmw " +
                                AtomicRMWInst::getOperationName(Operation) +
                                " operand must be a floating point type");

llvm/lib/CodeGen/AtomicExpandPass.cpp

Lines changed: 3 additions & 3 deletions
@@ -562,9 +562,9 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
                                  Value *&Success, Value *&NewLoaded) {
   Type *OrigTy = NewVal->getType();
 
-  // This code can go away when cmpxchg supports FP types.
+  // This code can go away when cmpxchg supports FP and vector types.
   assert(!OrigTy->isPointerTy());
-  bool NeedBitcast = OrigTy->isFloatingPointTy();
+  bool NeedBitcast = OrigTy->isFloatingPointTy() || OrigTy->isVectorTy();
   if (NeedBitcast) {
     IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
     NewVal = Builder.CreateBitCast(NewVal, IntTy);
@@ -731,7 +731,7 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
   unsigned ValueSize = DL.getTypeStoreSize(ValueType);
 
   PMV.ValueType = PMV.IntValueType = ValueType;
-  if (PMV.ValueType->isFloatingPointTy())
+  if (PMV.ValueType->isFloatingPointTy() || PMV.ValueType->isVectorTy())
     PMV.IntValueType =
         Type::getIntNTy(Ctx, ValueType->getPrimitiveSizeInBits());
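
With createCmpXchgInstFun handling vectors, the generic compare-exchange loop expansion bitcasts the vector to a same-width integer around the cmpxchg, since cmpxchg itself still only takes integers. Roughly, and only as a sketch (value names are invented here; the block names loosely follow the %atomicrmw.start / %atomicrmw.end labels visible in the llc tests below), the expansion of an fadd on <2 x float> looks like:

define <2 x float> @expanded_fadd_sketch(ptr %p, <2 x float> %val) {
entry:
  %init = load <2 x float>, ptr %p, align 8
  br label %atomicrmw.start

atomicrmw.start:
  %loaded = phi <2 x float> [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
  %new = fadd <2 x float> %loaded, %val
  ; cmpxchg still only operates on integers, so both values are bitcast to i64.
  %new.i = bitcast <2 x float> %new to i64
  %loaded.i = bitcast <2 x float> %loaded to i64
  %pair = cmpxchg ptr %p, i64 %loaded.i, i64 %new.i seq_cst seq_cst
  %success = extractvalue { i64, i1 } %pair, 1
  %observed.i = extractvalue { i64, i1 } %pair, 0
  %newloaded = bitcast i64 %observed.i to <2 x float>
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ret <2 x float> %newloaded
}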

llvm/lib/IR/Verifier.cpp

Lines changed: 3 additions & 2 deletions
@@ -4268,9 +4268,10 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
           " operand must have integer or floating point type!",
           &RMWI, ElTy);
   } else if (AtomicRMWInst::isFPOperation(Op)) {
-    Check(ElTy->isFloatingPointTy(),
+    Check(ElTy->isFPOrFPVectorTy() && !isa<ScalableVectorType>(ElTy),
           "atomicrmw " + AtomicRMWInst::getOperationName(Op) +
-              " operand must have floating point type!",
+              " operand must have floating-point or fixed vector of floating-point "
+              "type!",
           &RMWI, ElTy);
   } else {
     Check(ElTy->isIntegerTy(),

llvm/test/Assembler/atomic.ll

Lines changed: 16 additions & 0 deletions
@@ -72,3 +72,19 @@ define void @fp_atomics(ptr %x) {
 
   ret void
 }
+
+define void @fp_vector_atomicrmw(ptr %x, <2 x half> %val) {
+; CHECK: %atomic.fadd = atomicrmw fadd ptr %x, <2 x half> %val seq_cst
+  %atomic.fadd = atomicrmw fadd ptr %x, <2 x half> %val seq_cst
+
+; CHECK: %atomic.fsub = atomicrmw fsub ptr %x, <2 x half> %val seq_cst
+  %atomic.fsub = atomicrmw fsub ptr %x, <2 x half> %val seq_cst
+
+; CHECK: %atomic.fmax = atomicrmw fmax ptr %x, <2 x half> %val seq_cst
+  %atomic.fmax = atomicrmw fmax ptr %x, <2 x half> %val seq_cst
+
+; CHECK: %atomic.fmin = atomicrmw fmin ptr %x, <2 x half> %val seq_cst
+  %atomic.fmin = atomicrmw fmin ptr %x, <2 x half> %val seq_cst
+
+  ret void
+}
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+; RUN: split-file %s %t --leading-lines
+; RUN: not llvm-as < %t/scalable_fp_vector_atomicrmw_xchg.ll 2>&1 | FileCheck -check-prefix=ERR0 %s
+; RUN: not llvm-as < %t/scalable_int_vector_atomicrmw_xchg.ll 2>&1 | FileCheck -check-prefix=ERR1 %s
+; RUN: not llvm-as < %t/scalable_ptr_vector_atomicrmw_xchg.ll 2>&1 | FileCheck -check-prefix=ERR2 %s
+; RUN: not llvm-as < %t/scalable_fp_vector_atomicrmw_fadd.ll 2>&1 | FileCheck -check-prefix=ERR3 %s
+; RUN: not llvm-as < %t/scalable_int_vector_atomicrmw_add.ll 2>&1 | FileCheck -check-prefix=ERR4 %s
+
+;--- scalable_fp_vector_atomicrmw_xchg.ll
+define <vscale x 2 x half> @scalable_fp_vector_atomicrmw_xchg(ptr %x, <vscale x 2 x half> %val) {
+; ERR0: :41: error: atomicrmw operand may not be scalable
+  %atomic.xchg = atomicrmw xchg ptr %x, <vscale x 2 x half> %val seq_cst
+  ret <vscale x 2 x half> %atomic.xchg
+}
+
+;--- scalable_int_vector_atomicrmw_xchg.ll
+define <vscale x 2 x i16> @scalable_int_vector_atomicrmw_xchg(ptr %x, <vscale x 2 x i16> %val) {
+; ERR1: :41: error: atomicrmw operand may not be scalable
+  %atomic.xchg = atomicrmw xchg ptr %x, <vscale x 2 x i16> %val seq_cst
+  ret <vscale x 2 x i16> %atomic.xchg
+}
+
+;--- scalable_ptr_vector_atomicrmw_xchg.ll
+define <vscale x 2 x ptr> @scalable_ptr_vector_atomicrmw_xchg(ptr %x, <vscale x 2 x ptr> %val) {
+; ERR2: :41: error: atomicrmw operand may not be scalable
+  %atomic.xchg = atomicrmw xchg ptr %x, <vscale x 2 x ptr> %val seq_cst
+  ret <vscale x 2 x ptr> %atomic.xchg
+}
+
+;--- scalable_fp_vector_atomicrmw_fadd.ll
+define <vscale x 2 x half> @scalable_fp_vector_atomicrmw_fadd(ptr %x, <vscale x 2 x half> %val) {
+; ERR3: :41: error: atomicrmw operand may not be scalable
+  %atomic.fadd = atomicrmw fadd ptr %x, <vscale x 2 x half> %val seq_cst
+  ret <vscale x 2 x half> %atomic.fadd
+}
+
+;--- scalable_int_vector_atomicrmw_add.ll
+define <vscale x 2 x i16> @scalable_int_vector_atomicrmw_add(ptr %x, <vscale x 2 x i16> %val) {
+; ERR4: :39: error: atomicrmw operand may not be scalable
+  %atomic.add = atomicrmw add ptr %x, <vscale x 2 x i16> %val seq_cst
+  ret <vscale x 2 x i16> %atomic.add
+}
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: error: atomicrmw xchg operand must be an integer, floating point, or pointer type
+define <2 x half> @fp_vector_atomicrmw(ptr %x, <2 x half> %val) {
+  %atomic.xchg = atomicrmw xchg ptr %x, <2 x half> %val seq_cst
+  ret <2 x half> %atomic.xchg
+}
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-- -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,NOLSE %s
+; RUN: llc -mtriple=aarch64-- -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,LSE %s
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_align4:
+; NOLSE: // %bb.0:
+; NOLSE-NEXT: fcvtl v1.4s, v0.4h
+; NOLSE-NEXT: ldr s0, [x0]
+; NOLSE-NEXT: b .LBB0_2
+; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1
+; NOLSE-NEXT: fmov s0, w10
+; NOLSE-NEXT: cmp w10, w9
+; NOLSE-NEXT: b.eq .LBB0_5
+; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start
+; NOLSE-NEXT: // =>This Loop Header: Depth=1
+; NOLSE-NEXT: // Child Loop BB0_3 Depth 2
+; NOLSE-NEXT: fcvtl v2.4s, v0.4h
+; NOLSE-NEXT: fmov w9, s0
+; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s
+; NOLSE-NEXT: fcvtn v2.4h, v2.4s
+; NOLSE-NEXT: fmov w8, s2
+; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start
+; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1
+; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT: ldaxr w10, [x0]
+; NOLSE-NEXT: cmp w10, w9
+; NOLSE-NEXT: b.ne .LBB0_1
+; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2
+; NOLSE-NEXT: stlxr wzr, w8, [x0]
+; NOLSE-NEXT: cbnz wzr, .LBB0_3
+; NOLSE-NEXT: b .LBB0_1
+; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end
+; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOLSE-NEXT: ret
+;
+; LSE-LABEL: test_atomicrmw_fadd_v2f16_align4:
+; LSE: // %bb.0:
+; LSE-NEXT: fcvtl v1.4s, v0.4h
+; LSE-NEXT: ldr s0, [x0]
+; LSE-NEXT: .LBB0_1: // %atomicrmw.start
+; LSE-NEXT: // =>This Inner Loop Header: Depth=1
+; LSE-NEXT: fcvtl v2.4s, v0.4h
+; LSE-NEXT: fmov w8, s0
+; LSE-NEXT: mov w10, w8
+; LSE-NEXT: fadd v2.4s, v2.4s, v1.4s
+; LSE-NEXT: fcvtn v2.4h, v2.4s
+; LSE-NEXT: fmov w9, s2
+; LSE-NEXT: casal w10, w9, [x0]
+; LSE-NEXT: fmov s0, w10
+; LSE-NEXT: cmp w10, w8
+; LSE-NEXT: b.ne .LBB0_1
+; LSE-NEXT: // %bb.2: // %atomicrmw.end
+; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; LSE-NEXT: ret
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4
+  ret <2 x half> %res
+}
+
+define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 {
+; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_align8:
+; NOLSE: // %bb.0:
+; NOLSE-NEXT: ldr d1, [x0]
+; NOLSE-NEXT: b .LBB1_2
+; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1
+; NOLSE-NEXT: fmov d1, x10
+; NOLSE-NEXT: cmp x10, x9
+; NOLSE-NEXT: b.eq .LBB1_5
+; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start
+; NOLSE-NEXT: // =>This Loop Header: Depth=1
+; NOLSE-NEXT: // Child Loop BB1_3 Depth 2
+; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s
+; NOLSE-NEXT: fmov x9, d1
+; NOLSE-NEXT: fmov x8, d2
+; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start
+; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1
+; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT: ldaxr x10, [x0]
+; NOLSE-NEXT: cmp x10, x9
+; NOLSE-NEXT: b.ne .LBB1_1
+; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2
+; NOLSE-NEXT: stlxr wzr, x8, [x0]
+; NOLSE-NEXT: cbnz wzr, .LBB1_3
+; NOLSE-NEXT: b .LBB1_1
+; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end
+; NOLSE-NEXT: fmov d0, d1
+; NOLSE-NEXT: ret
+;
+; LSE-LABEL: test_atomicrmw_fadd_v2f32_align8:
+; LSE: // %bb.0:
+; LSE-NEXT: ldr d1, [x0]
+; LSE-NEXT: .LBB1_1: // %atomicrmw.start
+; LSE-NEXT: // =>This Inner Loop Header: Depth=1
+; LSE-NEXT: fadd v2.2s, v1.2s, v0.2s
+; LSE-NEXT: fmov x8, d1
+; LSE-NEXT: mov x10, x8
+; LSE-NEXT: fmov x9, d2
+; LSE-NEXT: casal x10, x9, [x0]
+; LSE-NEXT: fmov d1, x10
+; LSE-NEXT: cmp x10, x8
+; LSE-NEXT: b.ne .LBB1_1
+; LSE-NEXT: // %bb.2: // %atomicrmw.end
+; LSE-NEXT: fmov d0, d1
+; LSE-NEXT: ret
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8
+  ret <2 x float> %res
+}
+
+attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck %s
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; CHECK-LABEL: test_atomicrmw_fadd_v2f16_align4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $88, %rsp
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: psrld $16, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: pinsrw $0, 2(%rdi), %xmm1
+; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %atomicrmw.start
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __extendhfsf2@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __extendhfsf2@PLT
+; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movzwl %ax, %ebp
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __extendhfsf2@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __extendhfsf2@PLT
+; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: pextrw $0, %xmm0, %ecx
+; CHECK-NEXT: shll $16, %ecx
+; CHECK-NEXT: orl %ebp, %ecx
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: pextrw $0, %xmm0, %edx
+; CHECK-NEXT: shll $16, %edx
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movzwl %ax, %eax
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx)
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # %bb.2: # %atomicrmw.end
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4
+  ret <2 x half> %res
+}
+
+define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 {
+; CHECK-LABEL: test_atomicrmw_fadd_v2f32_align8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_1: # %atomicrmw.start
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: addps %xmm0, %xmm1
+; CHECK-NEXT: movq %xmm1, %rcx
+; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi)
+; CHECK-NEXT: movq %rax, %xmm1
+; CHECK-NEXT: jne .LBB1_1
+; CHECK-NEXT: # %bb.2: # %atomicrmw.end
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8
+  ret <2 x float> %res
+}
+
+attributes #0 = { nounwind }
