-
Notifications
You must be signed in to change notification settings - Fork 14.9k
Closed
Labels
Description
In my real code, I have a situation where I use `vpermb` but am ultimately only interested in the first 16 bytes of the result. I then want to blend this vector with another vector, based on a mask I compute from that other vector.
Here is what is emitted for Zen 5:
; Current codegen: compare, unmasked 512-bit permute, then a separate masked 128-bit move.
foo:
; k1 = per-byte unsigned "xmm0 > xmm1" (not-less-or-equal) comparison mask
vpcmpnleub k1, xmm0, xmm1
; full-width byte permute written to zmm1 with no write mask
vpermb zmm1, zmm3, zmm2
; merge the low 16 permuted bytes into xmm0 only where k1 is set
vmovdqu8 xmm0 {k1}, xmm1
I think it would be better if we folded the `vmovdqu8` into the `vpermb`:
; Proposed codegen: the blend is folded into the permute as a merge-masked write.
foo:
; k1 = per-byte unsigned "xmm0 > xmm1" comparison mask (unchanged from the current output)
vpcmpnleub k1, xmm0, xmm1
; permute writes directly into zmm0 under k1, so no separate vmovdqu8 is needed
vpermb zmm0 {k1}, zmm3, zmm2
; just use xmm0 from this point on
Here is a minimal repro in Zig that produces the original assembly: (Godbolt)
const std = @import("std");
/// Repro entry point: performs a masked 64-byte permute of `table` by `indices`
/// and returns only the first 16 bytes of the result.
/// The mask is the lane-wise comparison `a > b` bit-cast to a `u16`, and `a`
/// (padded to 64 lanes with undefined bytes) is passed as the fallback vector.
export fn foo(a: @Vector(16, u8), b: @Vector(16, u8), table: @Vector(64, u8), indices: @Vector(64, u8)) @Vector(16, u8) {
// Only lanes 0..15 of the 64-lane permute result are extracted; the upper 48 lanes are discarded.
return std.simd.extract(vperm(table, indices, padWithUndefineds(@Vector(64, u8), a), @as(u16, @bitCast(a > b))), 0, 16);
}
/// Thin wrapper around the LLVM intrinsic `llvm.x86.avx512.mask.permvar.qi.512`.
/// Forwards `table`, `indices`, `fallback` and `mask` unchanged and returns the
/// intrinsic's 64-byte result.
fn vperm(table: @Vector(64, u8), indices: @Vector(64, u8), fallback: @Vector(64, u8), mask: u64) @Vector(64, u8) {
// The extern declaration is scoped inside an anonymous struct and called immediately.
return struct {
extern fn @"llvm.x86.avx512.mask.permvar.qi.512"(@Vector(64, u8), @Vector(64, u8), @Vector(64, u8), u64) @Vector(64, u8);
}.@"llvm.x86.avx512.mask.permvar.qi.512"(table, indices, fallback, mask);
}
/// Widens the vector `value` to the vector type `T` by appending undefined lanes.
/// Returns `value` unchanged when its length already equals the length of `T`.
fn padWithUndefineds(T: type, value: anytype) T {
// Number of lanes that must be appended to reach the length of `T`.
const padding_len = @typeInfo(T).vector.len - @typeInfo(@TypeOf(value)).vector.len;
// NOTE(review): the padding element type is hard-coded to u8, so this presumably only
// works for u8 vectors — confirm before reusing with other element types.
return if (padding_len == 0) value else std.simd.join(value, @as(@Vector(padding_len, u8), @splat(undefined)));
}
LLVM IR version: (Godbolt)
; Computes a 64-byte permvar on %2 and %3, keeps its low 16 lanes, and blends them
; with %0 per lane using the unsigned comparison %0 > %1 as the select mask.
define dso_local <16 x i8> @foo(<16 x i8> %0, <16 x i8> %1, <64 x i8> %2, <64 x i8> %3) local_unnamed_addr {
Entry:
; %4: lane-wise unsigned greater-than mask (16 x i1)
%4 = icmp ugt <16 x i8> %0, %1
; %5: unmasked 512-bit permvar intrinsic call; the mask is applied later by the select
%5 = tail call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %2, <64 x i8> %3)
; %6: extract lanes 0..15 of the permute result
%6 = shufflevector <64 x i8> %5, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; %7: take the permuted byte where the mask is set, otherwise keep the byte from %0
%7 = select <16 x i1> %4, <16 x i8> %6, <16 x i8> %0
ret <16 x i8> %7
}
; Declaration of the unmasked 512-bit byte permvar intrinsic called by @foo.
; NOTE(review): attribute group #1 is not defined in this excerpt.
declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) #1