Skip to content

Commit 11c8188

Browse files
[AArch64] Improve index selection for histograms (#111150)
Removes unnecessary zero- and sign-extends on the indices passed to histogram instructions, and removes the histogram instruction entirely when its mask is known to be all-zero.
1 parent e7f1dae commit 11c8188

File tree

4 files changed

+280
-17
lines changed

4 files changed

+280
-17
lines changed

llvm/include/llvm/CodeGen/SelectionDAGNodes.h

+5-7
Original file line numberDiff line numberDiff line change
@@ -2938,8 +2938,8 @@ class MaskedGatherScatterSDNode : public MemSDNode {
29382938
const SDValue &getScale() const { return getOperand(5); }
29392939

29402940
static bool classof(const SDNode *N) {
2941-
return N->getOpcode() == ISD::MGATHER ||
2942-
N->getOpcode() == ISD::MSCATTER;
2941+
return N->getOpcode() == ISD::MGATHER || N->getOpcode() == ISD::MSCATTER ||
2942+
N->getOpcode() == ISD::EXPERIMENTAL_VECTOR_HISTOGRAM;
29432943
}
29442944
};
29452945

@@ -2994,17 +2994,15 @@ class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
29942994
}
29952995
};
29962996

2997-
class MaskedHistogramSDNode : public MemSDNode {
2997+
class MaskedHistogramSDNode : public MaskedGatherScatterSDNode {
29982998
public:
29992999
friend class SelectionDAG;
30003000

30013001
MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
30023002
EVT MemVT, MachineMemOperand *MMO,
30033003
ISD::MemIndexType IndexType)
3004-
: MemSDNode(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, Order, DL, VTs, MemVT,
3005-
MMO) {
3006-
LSBaseSDNodeBits.AddressingMode = IndexType;
3007-
}
3004+
: MaskedGatherScatterSDNode(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, Order, DL,
3005+
VTs, MemVT, MMO, IndexType) {}
30083006

30093007
ISD::MemIndexType getIndexType() const {
30103008
return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+31
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,7 @@ namespace {
549549
SDValue visitMSTORE(SDNode *N);
550550
SDValue visitMGATHER(SDNode *N);
551551
SDValue visitMSCATTER(SDNode *N);
552+
SDValue visitMHISTOGRAM(SDNode *N);
552553
SDValue visitVPGATHER(SDNode *N);
553554
SDValue visitVPSCATTER(SDNode *N);
554555
SDValue visitVP_STRIDED_LOAD(SDNode *N);
@@ -1972,6 +1973,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
19721973
case ISD::MLOAD: return visitMLOAD(N);
19731974
case ISD::MSCATTER: return visitMSCATTER(N);
19741975
case ISD::MSTORE: return visitMSTORE(N);
1976+
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return visitMHISTOGRAM(N);
19751977
case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N);
19761978
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
19771979
case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
@@ -12357,6 +12359,35 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
1235712359
return SDValue();
1235812360
}
1235912361

12362+
/// Combine a masked histogram node:
///  - fold it away entirely when the mask is a known all-zero splat (the
///    operation updates no buckets, so only the chain is needed), and
///  - refine a uniform (splat-of-pointer) base and/or narrow an
///    unnecessarily wide index type so the target can select better
///    addressing modes.
SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
  SDValue Chain = HG->getChain();
  SDValue Inc = HG->getInc();
  SDValue Mask = HG->getMask();
  SDValue BasePtr = HG->getBasePtr();
  SDValue Index = HG->getIndex();
  SDLoc DL(HG);

  EVT MemVT = HG->getMemoryVT();
  MachineMemOperand *MMO = HG->getMemOperand();
  ISD::MemIndexType IndexType = HG->getIndexType();

  // An all-zero mask makes the histogram a no-op; replace it with its chain.
  if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
    return Chain;

  EVT DataVT = Index.getValueType();
  // Both helpers update BasePtr/Index/IndexType in place, so the operand
  // list must be built *after* a successful refinement. Building Ops first
  // (as the original code did) re-emitted the node with the stale,
  // unrefined operands, silently dropping the combine.
  if (refineUniformBase(BasePtr, Index, HG->isIndexScaled(), DAG, DL) ||
      refineIndexType(Index, IndexType, DataVT, DAG)) {
    SDValue Ops[] = {Chain, Inc,           Mask,          BasePtr,
                     Index, HG->getScale(), HG->getIntID()};
    return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, Ops,
                                  MMO, IndexType);
  }
  return SDValue();
}
12390+
1236012391
SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) {
1236112392
auto *SLD = cast<VPStridedLoadSDNode>(N);
1236212393
EVT EltVT = SLD->getValueType(0).getVectorElementType();

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+16-10
Original file line numberDiff line numberDiff line change
@@ -1122,7 +1122,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
11221122
ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
11231123
ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
11241124

1125-
setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});
1125+
setTargetDAGCombine(
1126+
{ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
11261127

11271128
setTargetDAGCombine(ISD::FP_EXTEND);
11281129

@@ -23821,11 +23822,9 @@ static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
2382123822

2382223823
static SDValue performMaskedGatherScatterCombine(
2382323824
SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
23824-
MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
23825-
assert(MGS && "Can only combine gather load or scatter store nodes");
23826-
2382723825
if (!DCI.isBeforeLegalize())
2382823826
return SDValue();
23827+
MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
2382923828

2383023829
SDLoc DL(MGS);
2383123830
SDValue Chain = MGS->getChain();
@@ -23847,12 +23846,18 @@ static SDValue performMaskedGatherScatterCombine(
2384723846
DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
2384823847
Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
2384923848
}
23850-
auto *MSC = cast<MaskedScatterSDNode>(MGS);
23851-
SDValue Data = MSC->getValue();
23852-
SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
23853-
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
23854-
Ops, MSC->getMemOperand(), IndexType,
23855-
MSC->isTruncatingStore());
23849+
if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
23850+
SDValue Data = MSC->getValue();
23851+
SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
23852+
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
23853+
DL, Ops, MSC->getMemOperand(), IndexType,
23854+
MSC->isTruncatingStore());
23855+
}
23856+
auto *HG = cast<MaskedHistogramSDNode>(MGS);
23857+
SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
23858+
Index, Scale, HG->getIntID()};
23859+
return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
23860+
DL, Ops, HG->getMemOperand(), IndexType);
2385623861
}
2385723862

2385823863
/// Target-specific DAG combine function for NEON load/store intrinsics
@@ -26019,6 +26024,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
2601926024
return performMSTORECombine(N, DCI, DAG, Subtarget);
2602026025
case ISD::MGATHER:
2602126026
case ISD::MSCATTER:
26027+
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
2602226028
return performMaskedGatherScatterCombine(N, DCI, DAG);
2602326029
case ISD::FP_EXTEND:
2602426030
return performFPExtendCombine(N, DAG, DCI, Subtarget);

llvm/test/CodeGen/AArch64/sve2-histcnt.ll

+228
Original file line numberDiff line numberDiff line change
@@ -267,5 +267,233 @@ define void @histogram_i16_8_lane(ptr %base, <vscale x 8 x i32> %indices, i16 %i
267267
ret void
268268
}
269269

270+
define void @histogram_i8_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i8 %inc) #0{
271+
; CHECK-LABEL: histogram_i8_zext:
272+
; CHECK: // %bb.0:
273+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
274+
; CHECK-NEXT: mov z3.s, w1
275+
; CHECK-NEXT: ld1b { z2.s }, p0/z, [x0, z0.s, uxtw]
276+
; CHECK-NEXT: ptrue p1.s
277+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
278+
; CHECK-NEXT: st1b { z1.s }, p0, [x0, z0.s, uxtw]
279+
; CHECK-NEXT: ret
280+
%extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
281+
%buckets = getelementptr i8, ptr %base, <vscale x 4 x i64> %extended
282+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> %buckets, i8 %inc, <vscale x 4 x i1> %mask)
283+
ret void
284+
}
285+
286+
define void @histogram_i16_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i16 %inc) #0{
287+
; CHECK-LABEL: histogram_i16_zext:
288+
; CHECK: // %bb.0:
289+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
290+
; CHECK-NEXT: mov z3.s, w1
291+
; CHECK-NEXT: ld1h { z2.s }, p0/z, [x0, z0.s, uxtw #1]
292+
; CHECK-NEXT: ptrue p1.s
293+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
294+
; CHECK-NEXT: st1h { z1.s }, p0, [x0, z0.s, uxtw #1]
295+
; CHECK-NEXT: ret
296+
%extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
297+
%buckets = getelementptr i16, ptr %base, <vscale x 4 x i64> %extended
298+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 %inc, <vscale x 4 x i1> %mask)
299+
ret void
300+
}
301+
302+
define void @histogram_i32_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
303+
; CHECK-LABEL: histogram_i32_zext:
304+
; CHECK: // %bb.0:
305+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
306+
; CHECK-NEXT: mov z3.s, #1 // =0x1
307+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
308+
; CHECK-NEXT: ptrue p1.s
309+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
310+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
311+
; CHECK-NEXT: ret
312+
%extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
313+
%buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
314+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
315+
ret void
316+
}
317+
318+
define void @histogram_i32_sext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
319+
; CHECK-LABEL: histogram_i32_sext:
320+
; CHECK: // %bb.0:
321+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
322+
; CHECK-NEXT: mov z3.s, #1 // =0x1
323+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
324+
; CHECK-NEXT: ptrue p1.s
325+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
326+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
327+
; CHECK-NEXT: ret
328+
%extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
329+
%buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
330+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
331+
ret void
332+
}
333+
334+
define void @histogram_zext_from_i8_to_i64(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0{
335+
; CHECK-LABEL: histogram_zext_from_i8_to_i64:
336+
; CHECK: // %bb.0:
337+
; CHECK-NEXT: and z0.s, z0.s, #0xff
338+
; CHECK-NEXT: mov z3.s, #1 // =0x1
339+
; CHECK-NEXT: ptrue p1.s
340+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
341+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
342+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
343+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
344+
; CHECK-NEXT: ret
345+
%extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i64>
346+
%buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
347+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
348+
ret void
349+
}
350+
351+
define void @histogram_zext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0{
352+
; CHECK-LABEL: histogram_zext_from_i16_to_i64:
353+
; CHECK: // %bb.0:
354+
; CHECK-NEXT: and z0.s, z0.s, #0xffff
355+
; CHECK-NEXT: mov z3.s, #1 // =0x1
356+
; CHECK-NEXT: ptrue p1.s
357+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
358+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
359+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
360+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
361+
; CHECK-NEXT: ret
362+
%extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
363+
%buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
364+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
365+
ret void
366+
}
367+
368+
define void @histogram_sext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0{
369+
; CHECK-LABEL: histogram_sext_from_i16_to_i64:
370+
; CHECK: // %bb.0:
371+
; CHECK-NEXT: ptrue p1.s
372+
; CHECK-NEXT: mov z3.s, #1 // =0x1
373+
; CHECK-NEXT: sxth z0.s, p1/m, z0.s
374+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
375+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
376+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
377+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
378+
; CHECK-NEXT: ret
379+
%extended = sext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
380+
%buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
381+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
382+
ret void
383+
}
384+
385+
define void @histogram_zext_from_i8_to_i32(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0{
386+
; CHECK-LABEL: histogram_zext_from_i8_to_i32:
387+
; CHECK: // %bb.0:
388+
; CHECK-NEXT: and z0.s, z0.s, #0xff
389+
; CHECK-NEXT: mov z3.s, #1 // =0x1
390+
; CHECK-NEXT: ptrue p1.s
391+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
392+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
393+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
394+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
395+
; CHECK-NEXT: ret
396+
%extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i32>
397+
%buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
398+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
399+
ret void
400+
}
401+
402+
define void @histogram_zext_from_i16_to_i32(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0 {
403+
; CHECK-LABEL: histogram_zext_from_i16_to_i32:
404+
; CHECK: // %bb.0:
405+
; CHECK-NEXT: and z0.s, z0.s, #0xffff
406+
; CHECK-NEXT: mov z3.s, #1 // =0x1
407+
; CHECK-NEXT: ptrue p1.s
408+
; CHECK-NEXT: histcnt z1.s, p0/z, z0.s, z0.s
409+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
410+
; CHECK-NEXT: mad z1.s, p1/m, z3.s, z2.s
411+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
412+
; CHECK-NEXT: ret
413+
%extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i32>
414+
%buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
415+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
416+
ret void
417+
}
418+
419+
define void @histogram_2_lane_zext(ptr %base, <vscale x 2 x i32> %indices, <vscale x 2 x i1> %mask) #0 {
420+
; CHECK-LABEL: histogram_2_lane_zext:
421+
; CHECK: // %bb.0:
422+
; CHECK-NEXT: mov z1.d, z0.d
423+
; CHECK-NEXT: mov z3.d, #1 // =0x1
424+
; CHECK-NEXT: ptrue p1.d
425+
; CHECK-NEXT: ld1w { z2.d }, p0/z, [x0, z0.d, uxtw #2]
426+
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
427+
; CHECK-NEXT: histcnt z1.d, p0/z, z1.d, z1.d
428+
; CHECK-NEXT: mad z1.d, p1/m, z3.d, z2.d
429+
; CHECK-NEXT: st1w { z1.d }, p0, [x0, z0.d, uxtw #2]
430+
; CHECK-NEXT: ret
431+
%extended = zext <vscale x 2 x i32> %indices to <vscale x 2 x i64>
432+
%buckets = getelementptr i32, ptr %base, <vscale x 2 x i64> %extended
433+
call void @llvm.experimental.vector.histogram.add.nxv2p0.i32(<vscale x 2 x ptr> %buckets, i32 1, <vscale x 2 x i1> %mask)
434+
ret void
435+
}
436+
437+
define void @histogram_8_lane_zext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0{
438+
; CHECK-LABEL: histogram_8_lane_zext:
439+
; CHECK: // %bb.0:
440+
; CHECK-NEXT: punpklo p1.h, p0.b
441+
; CHECK-NEXT: mov z4.s, #1 // =0x1
442+
; CHECK-NEXT: ptrue p2.s
443+
; CHECK-NEXT: histcnt z2.s, p1/z, z0.s, z0.s
444+
; CHECK-NEXT: ld1w { z3.s }, p1/z, [x0, z0.s, uxtw #2]
445+
; CHECK-NEXT: punpkhi p0.h, p0.b
446+
; CHECK-NEXT: mad z2.s, p2/m, z4.s, z3.s
447+
; CHECK-NEXT: st1w { z2.s }, p1, [x0, z0.s, uxtw #2]
448+
; CHECK-NEXT: histcnt z0.s, p0/z, z1.s, z1.s
449+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z1.s, uxtw #2]
450+
; CHECK-NEXT: mad z0.s, p2/m, z4.s, z2.s
451+
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw #2]
452+
; CHECK-NEXT: ret
453+
%extended = zext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
454+
%buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
455+
call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
456+
ret void
457+
}
458+
459+
define void @histogram_8_lane_sext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0{
460+
; CHECK-LABEL: histogram_8_lane_sext:
461+
; CHECK: // %bb.0:
462+
; CHECK-NEXT: punpklo p1.h, p0.b
463+
; CHECK-NEXT: mov z4.s, #1 // =0x1
464+
; CHECK-NEXT: ptrue p2.s
465+
; CHECK-NEXT: histcnt z2.s, p1/z, z0.s, z0.s
466+
; CHECK-NEXT: ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
467+
; CHECK-NEXT: punpkhi p0.h, p0.b
468+
; CHECK-NEXT: mad z2.s, p2/m, z4.s, z3.s
469+
; CHECK-NEXT: st1w { z2.s }, p1, [x0, z0.s, sxtw #2]
470+
; CHECK-NEXT: histcnt z0.s, p0/z, z1.s, z1.s
471+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z1.s, sxtw #2]
472+
; CHECK-NEXT: mad z0.s, p2/m, z4.s, z2.s
473+
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
474+
; CHECK-NEXT: ret
475+
%extended = sext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
476+
%buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
477+
call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
478+
ret void
479+
}
480+
481+
define void @histogram_zero_mask(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0{
482+
; CHECK-LABEL: histogram_zero_mask:
483+
; CHECK: // %bb.0:
484+
; CHECK-NEXT: ret
485+
call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> zeroinitializer)
486+
ret void
487+
}
488+
489+
define void @histogram_sext_zero_mask(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0{
490+
; CHECK-LABEL: histogram_sext_zero_mask:
491+
; CHECK: // %bb.0:
492+
; CHECK-NEXT: ret
493+
%extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
494+
%buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
495+
call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> zeroinitializer)
496+
ret void
497+
}
270498

271499
attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }

0 commit comments

Comments
 (0)