Skip to content

Commit 076d0ed

Browse files
tlively authored and arichardson committed
[WebAssembly] Add wasm-specific vector shuffle builtin and intrinsic
Summary: Although `__builtin_shufflevector` and the `shufflevector` instruction work fine, they are not opaque to the optimizer. As a result, DAGCombine can potentially reduce the number of shuffles and change the shuffle masks. This is unexpected behavior for users of the WebAssembly SIMD intrinsics who have crafted their shuffles to optimize the code generated by engines. This patch solves the problem by adding a new shuffle intrinsic that is opaque to the optimizers, in line with the decision of the WebAssembly SIMD contributors at WebAssembly/simd#196 (comment). In the future we may implement custom DAG combines to properly optimize shuffles and replace this solution.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, hiraditya, sunfish, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D66983
2 parents 1879266 + 8e3e56f commit 076d0ed

File tree

7 files changed

+97
-12
lines changed

7 files changed

+97
-12
lines changed

clang/include/clang/Basic/BuiltinsWebAssembly.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ TARGET_BUILTIN(__builtin_wasm_avgr_u_i8x16, "V16cV16cV16c", "nc", "simd128")
119119
TARGET_BUILTIN(__builtin_wasm_avgr_u_i16x8, "V8sV8sV8s", "nc", "simd128")
120120

121121
TARGET_BUILTIN(__builtin_wasm_bitselect, "V4iV4iV4iV4i", "nc", "simd128")
122+
TARGET_BUILTIN(__builtin_wasm_shuffle_v8x16, "V16cV16cV16cIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIi", "nc", "simd128")
122123

123124
TARGET_BUILTIN(__builtin_wasm_any_true_i8x16, "iV16c", "nc", "simd128")
124125
TARGET_BUILTIN(__builtin_wasm_any_true_i16x8, "iV8s", "nc", "simd128")

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16374,6 +16374,20 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
1637416374
CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Vec->getType()});
1637516375
return Builder.CreateCall(Callee, Vec);
1637616376
}
16377+
case WebAssembly::BI__builtin_wasm_shuffle_v8x16: {
16378+
Value *Ops[18];
16379+
size_t OpIdx = 0;
16380+
Ops[OpIdx++] = EmitScalarExpr(E->getArg(0));
16381+
Ops[OpIdx++] = EmitScalarExpr(E->getArg(1));
16382+
while (OpIdx < 18) {
16383+
llvm::APSInt LaneConst;
16384+
if (!E->getArg(OpIdx)->isIntegerConstantExpr(LaneConst, getContext()))
16385+
llvm_unreachable("Constant arg isn't actually constant?");
16386+
Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), LaneConst);
16387+
}
16388+
Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
16389+
return Builder.CreateCall(Callee, Ops);
16390+
}
1637716391
default:
1637816392
return nullptr;
1637916393
}

clang/lib/Headers/wasm_simd128.h

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,23 +1020,31 @@ wasm_f32x4_convert_u32x4(v128_t __a) {
10201020
#define wasm_v8x16_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
10211021
__c7, __c8, __c9, __c10, __c11, __c12, __c13, \
10221022
__c14, __c15) \
1023-
((v128_t)(__builtin_shufflevector( \
1024-
(__u8x16)(__a), (__u8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5, \
1025-
__c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15)))
1023+
((v128_t)__builtin_wasm_shuffle_v8x16( \
1024+
(__i8x16)(__a), (__i8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5, \
1025+
__c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15))
10261026

10271027
#define wasm_v16x8_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
10281028
__c7) \
1029-
((v128_t)(__builtin_shufflevector((__u16x8)(__a), (__u16x8)(__b), __c0, \
1030-
__c1, __c2, __c3, __c4, __c5, __c6, \
1031-
__c7)))
1029+
((v128_t)__builtin_wasm_shuffle_v8x16( \
1030+
(__i8x16)(__a), (__i8x16)(__b), __c0 * 2, __c0 * 2 + 1, __c1 * 2, \
1031+
__c1 * 2 + 1, __c2 * 2, __c2 * 2 + 1, __c3 * 2, __c3 * 2 + 1, __c4 * 2, \
1032+
__c4 * 2 + 1, __c5 * 2, __c5 * 2 + 1, __c6 * 2, __c6 * 2 + 1, __c7 * 2, \
1033+
__c7 * 2 + 1))
10321034

10331035
#define wasm_v32x4_shuffle(__a, __b, __c0, __c1, __c2, __c3) \
1034-
((v128_t)(__builtin_shufflevector((__u32x4)(__a), (__u32x4)(__b), __c0, \
1035-
__c1, __c2, __c3)))
1036+
((v128_t)__builtin_wasm_shuffle_v8x16( \
1037+
(__i8x16)(__a), (__i8x16)(__b), __c0 * 4, __c0 * 4 + 1, __c0 * 4 + 2, \
1038+
__c0 * 4 + 3, __c1 * 4, __c1 * 4 + 1, __c1 * 4 + 2, __c1 * 4 + 3, \
1039+
__c2 * 4, __c2 * 4 + 1, __c2 * 4 + 2, __c2 * 4 + 3, __c3 * 4, \
1040+
__c3 * 4 + 1, __c3 * 4 + 2, __c3 * 4 + 3))
10361041

10371042
#define wasm_v64x2_shuffle(__a, __b, __c0, __c1) \
1038-
((v128_t)( \
1039-
__builtin_shufflevector((__u64x2)(__a), (__u64x2)(__b), __c0, __c1)))
1043+
((v128_t)__builtin_wasm_shuffle_v8x16( \
1044+
(__i8x16)(__a), (__i8x16)(__b), __c0 * 8, __c0 * 8 + 1, __c0 * 8 + 2, \
1045+
__c0 * 8 + 3, __c0 * 8 + 4, __c0 * 8 + 5, __c0 * 8 + 6, __c0 * 8 + 7, \
1046+
__c1 * 8, __c1 * 8 + 1, __c1 * 8 + 2, __c1 * 8 + 3, __c1 * 8 + 4, \
1047+
__c1 * 8 + 5, __c1 * 8 + 6, __c1 * 8 + 7))
10401048

10411049
#ifdef __wasm_unimplemented_simd128__
10421050

clang/test/CodeGen/builtins-wasm.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -724,5 +724,14 @@ i32x4 widen_high_u_i32x4_i16x8(i16x8 v) {
724724
i8x16 swizzle_v8x16(i8x16 x, i8x16 y) {
725725
return __builtin_wasm_swizzle_v8x16(x, y);
726726
// WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)
727+
}
728+
729+
i8x16 shuffle(i8x16 x, i8x16 y) {
730+
return __builtin_wasm_shuffle_v8x16(x, y, 0, 1, 2, 3, 4, 5, 6, 7,
731+
8, 9, 10, 11, 12, 13, 14, 15);
732+
// WEBASSEMBLY: call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
733+
// WEBASSEMBLY-SAME: i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
734+
// WEBASSEMBLY-SAME: i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14,
735+
// WEBASSEMBLY-SAME: i32 15
727736
// WEBASSEMBLY-NEXT: ret
728737
}

llvm/include/llvm/IR/IntrinsicsWebAssembly.td

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,13 @@ def int_wasm_swizzle :
104104
Intrinsic<[llvm_v16i8_ty],
105105
[llvm_v16i8_ty, llvm_v16i8_ty],
106106
[IntrNoMem, IntrSpeculatable]>;
107+
def int_wasm_shuffle :
108+
Intrinsic<[llvm_v16i8_ty],
109+
[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty,
110+
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
111+
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
112+
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
113+
[IntrNoMem, IntrSpeculatable]>;
107114
def int_wasm_sub_saturate_signed :
108115
Intrinsic<[llvm_anyvector_ty],
109116
[LLVMMatchType<0>, LLVMMatchType<0>],
@@ -116,7 +123,6 @@ def int_wasm_avgr_unsigned :
116123
Intrinsic<[llvm_anyvector_ty],
117124
[LLVMMatchType<0>, LLVMMatchType<0>],
118125
[IntrNoMem, IntrSpeculatable]>;
119-
120126
def int_wasm_bitselect :
121127
Intrinsic<[llvm_anyvector_ty],
122128
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
@@ -170,7 +176,6 @@ def int_wasm_widen_high_unsigned :
170176
[llvm_anyvector_ty],
171177
[IntrNoMem, IntrSpeculatable]>;
172178

173-
174179
//===----------------------------------------------------------------------===//
175180
// Bulk memory intrinsics
176181
//===----------------------------------------------------------------------===//

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1354,6 +1354,24 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
13541354
Op.getOperand(3) // thrown value
13551355
});
13561356
}
1357+
1358+
case Intrinsic::wasm_shuffle: {
1359+
// Drop in-chain and replace undef or out-of-range (>= 32) lane indices with 0, but otherwise pass through unchanged
1360+
SDValue Ops[18];
1361+
size_t OpIdx = 0;
1362+
Ops[OpIdx++] = Op.getOperand(1);
1363+
Ops[OpIdx++] = Op.getOperand(2);
1364+
while (OpIdx < 18) {
1365+
const SDValue &MaskIdx = Op.getOperand(OpIdx + 1);
1366+
if (MaskIdx.isUndef() ||
1367+
cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() >= 32) {
1368+
Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32);
1369+
} else {
1370+
Ops[OpIdx++] = MaskIdx;
1371+
}
1372+
}
1373+
return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
1374+
}
13571375
}
13581376
}
13591377

llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,36 @@ define <16 x i8> @narrow_unsigned_v16i8(<8 x i16> %low, <8 x i16> %high) {
141141
ret <16 x i8> %a
142142
}
143143

144+
; CHECK-LABEL: shuffle_v16i8:
145+
; NO-SIMD128-NOT: v8x16
146+
; SIMD128-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
147+
; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
148+
; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0{{$}}
149+
; SIMD128-NEXT: return $pop[[R]]{{$}}
150+
declare <16 x i8> @llvm.wasm.shuffle(
151+
<16 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
152+
i32, i32, i32, i32, i32)
153+
define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
154+
%res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
155+
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
156+
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 35)
157+
ret <16 x i8> %res
158+
}
159+
160+
; CHECK-LABEL: shuffle_undef_v16i8:
161+
; NO-SIMD128-NOT: v8x16
162+
; SIMD128-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
163+
; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
164+
; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2{{$}}
165+
; SIMD128-NEXT: return $pop[[R]]{{$}}
166+
define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
167+
%res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
168+
i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
169+
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
170+
i32 undef, i32 undef, i32 undef, i32 2)
171+
ret <16 x i8> %res
172+
}
173+
144174
; ==============================================================================
145175
; 8 x i16
146176
; ==============================================================================

0 commit comments

Comments
 (0)