Skip to content

[AVX-512] Merge vmovdq/vpblendm on xmm registers into previous maskable instruction operating on zmm registers  #113400

@Validark

Description

@Validark

In my real code, I have a situation where I use vpermb but am ultimately only interested in the first 16 bytes of the result. I then want to blend those 16 bytes with another vector, using a mask I compute from that other vector.

Here is what is emitted for Zen 5:

foo:
        vpcmpnleub      k1, xmm0, xmm1
        vpermb  zmm1, zmm3, zmm2
        vmovdqu8        xmm0 {k1}, xmm1

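I think it would be better if we folded the vmovdqu8 into the vpermb as a merge-masked write. Only the low 16 bytes of the result are used afterwards, so whatever ends up in the upper lanes of the zmm destination does not matter: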

foo:
        vpcmpnleub      k1, xmm0, xmm1
        vpermb  zmm0 {k1}, zmm3, zmm2
        ; just use xmm0 from this point on

Here is a minimal repro in Zig that produces the original assembly: (Godbolt)

const std = @import("std");

/// Minimal repro entry point: permutes `table` by `indices` across all 64 byte
/// lanes, but only the first 16 lanes of the result are returned.
///
/// The mask passed to `vperm` is the lane-wise comparison `a > b` bit-cast to a
/// `u16`, and the fallback operand is `a` padded out to 64 lanes with undefined
/// values.
export fn foo(a: @Vector(16, u8), b: @Vector(16, u8), table: @Vector(64, u8), indices: @Vector(64, u8)) @Vector(16, u8) {
    // The u16 mask is passed to vperm's u64 `mask` parameter, so only the low
    // 16 mask bits can be set; `extract(..., 0, 16)` then keeps lanes 0..15.
    return std.simd.extract(vperm(table, indices, padWithUndefineds(@Vector(64, u8), a), @as(u16, @bitCast(a > b))), 0, 16);
}

/// Thin wrapper that calls the LLVM intrinsic
/// `llvm.x86.avx512.mask.permvar.qi.512` directly, forwarding
/// `(table, indices, fallback, mask)` in that order.
///
/// NOTE(review): presumably this is the masked 64-byte permute (vpermb), where
/// lanes whose mask bit is clear take the corresponding `fallback` byte.
/// Confirm against the LLVM intrinsic definition; the semantics are not
/// defined in this file.
fn vperm(table: @Vector(64, u8), indices: @Vector(64, u8), fallback: @Vector(64, u8), mask: u64) @Vector(64, u8) {
    // Declare the intrinsic as an extern function inside an anonymous struct so
    // it can be referenced by its dotted name and called immediately.
    return struct {
        extern fn @"llvm.x86.avx512.mask.permvar.qi.512"(@Vector(64, u8), @Vector(64, u8), @Vector(64, u8), u64) @Vector(64, u8);
    }.@"llvm.x86.avx512.mask.permvar.qi.512"(table, indices, fallback, mask);
}

/// Widens the vector `value` to the vector type `T` by appending undefined
/// lanes.
///
/// If `value` already has as many lanes as `T`, it is returned unchanged.
/// Otherwise it is joined with a vector of `padding_len` undefined elements.
/// The padding element type is hard-coded to `u8`.
/// NOTE(review): assumes `T` has at least as many lanes as `value` and that
/// both are `u8` vectors; neither is checked here.
fn padWithUndefineds(T: type, value: anytype) T {
    // Number of extra lanes needed to reach T's length.
    const padding_len = @typeInfo(T).vector.len - @typeInfo(@TypeOf(value)).vector.len;
    return if (padding_len == 0) value else std.simd.join(value, @as(@Vector(padding_len, u8), @splat(undefined)));
}

LLVM version: (Godbolt)

define dso_local <16 x i8> @foo(<16 x i8> %0, <16 x i8> %1, <64 x i8> %2, <64 x i8> %3) local_unnamed_addr {
Entry:
  %4 = icmp ugt <16 x i8> %0, %1
  %5 = tail call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %2, <64 x i8> %3)
  %6 = shufflevector <64 x i8> %5, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %7 = select <16 x i1> %4, <16 x i8> %6, <16 x i8> %0
  ret <16 x i8> %7
}

declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) #1

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions