[AVX-512] Merge `vmovdq/vpblendm` on `xmm` registers into previous maskable instruction operating on `zmm` registers 

In my real code, I have a situation where I use `vpermb` but am ultimately only interested in the first 16 bytes of the result. I then want to blend that result with another vector, using a mask I compute from that other vector.

Here is what is emitted for Zen 5:

```asm
foo:
        ; k1 = per-byte unsigned compare of xmm0 against xmm1 ("nle" = not less-or-equal, i.e. xmm0 > xmm1)
        vpcmpnleub      k1, xmm0, xmm1
        ; full 512-bit byte permute, written unmasked into zmm1
        vpermb  zmm1, zmm3, zmm2
        ; separate masked move of the low 16 bytes of the permute result into xmm0 under k1
        vmovdqu8        xmm0 {k1}, xmm1
```

I think it would be better if we folded the `vmovdqu8` into the `vpermb`.

```asm
foo:
        ; same compare as before: k1 holds the per-byte mask
        vpcmpnleub      k1, xmm0, xmm1
        ; proposed: apply k1 directly on the permute with zmm0 as destination,
        ; so the separate masked vmovdqu8 is no longer needed
        vpermb  zmm0 {k1}, zmm3, zmm2
        ; just use xmm0 from this point on
```

Here is a minimal repro in Zig that produces the original assembly: ([Godbolt](https://zig.godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(filename:'1',fontScale:14,fontUsePx:'0',j:2,lang:zig,selection:(endColumn:1,endLineNumber:18,positionColumn:1,positionLineNumber:18,selectionStartColumn:1,selectionStartLineNumber:18,startColumn:1,startLineNumber:18),source:'const+std+%3D+@import(%22std%22)%3B%0A%0Aexport+fn+foo(a:+@Vector(16,+u8),+b:+@Vector(16,+u8),+table:+@Vector(64,+u8),+indices:+@Vector(64,+u8))+@Vector(16,+u8)+%7B%0A++++return+std.simd.extract(vperm(table,+indices,+padWithUndefineds(@Vector(64,+u8),+a),+@as(u16,+@bitCast(a+%3E+b))),+0,+16)%3B%0A%7D%0A%0Afn+vperm(table:+@Vector(64,+u8),+indices:+@Vector(64,+u8),+fallback:+@Vector(64,+u8),+mask:+u64)+@Vector(64,+u8)+%7B%0A++++return+struct+%7B%0A++++++++extern+fn+@%22llvm.x86.avx512.mask.permvar.qi.512%22(@Vector(64,+u8),+@Vector(64,+u8),+@Vector(64,+u8),+u64)+@Vector(64,+u8)%3B%0A++++%7D.@%22llvm.x86.avx512.mask.permvar.qi.512%22(table,+indices,+fallback,+mask)%3B%0A%7D%0A%0Afn+padWithUndefineds(T:+type,+value:+anytype)+T+%7B%0A++++const+padding_len+%3D+@typeInfo(T).vector.len+-+@typeInfo(@TypeOf(value)).vector.len%3B%0A++++return+if+(padding_len+%3D%3D+0)+value+else+std.simd.join(value,+@as(@Vector(padding_len,+u8),+@splat(undefined)))%3B%0A%7D%0A%0A'),l:'5',n:'1',o:'Zig+source+%232',t:'0')),header:(),k:67.58480989856056,l:'4',m:100,n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:ztrunk,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1',verboseDemangling:'0'),flagsViewOpen:'1',fontScale:20,fontUsePx:'0',j:3,lang:zig,libs:!(),options:'-O+ReleaseFast+-target+x86_64-linux+-mcpu%3Dznver5',overrides:!(),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:2),l:'5',n:'0',o:'+zig+trunk+(Editor+%232)',t:'0')
),header:(),k:32.336604639749844,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',m:100,n:'0',o:'',t:'0')),version:4))

```zig
const std = @import("std");

/// Minimal repro: runs a masked 64-byte permute of `table` by `indices`, with
/// `a` (padded to 64 lanes) as the fallback and the 16-bit result of `a > b`
/// as the mask, then returns only the first 16 bytes of the result.
export fn foo(a: @Vector(16, u8), b: @Vector(16, u8), table: @Vector(64, u8), indices: @Vector(64, u8)) @Vector(16, u8) {
    // The compare result is bit-cast to u16 and passed as the u64 mask argument;
    // lanes 16..63 of the fallback are undefined padding and are dropped by the extract.
    return std.simd.extract(vperm(table, indices, padWithUndefineds(@Vector(64, u8), a), @as(u16, @bitCast(a > b))), 0, 16);
}

/// Thin wrapper around the LLVM intrinsic `llvm.x86.avx512.mask.permvar.qi.512`.
/// Forwards `table`, `indices`, `fallback` and `mask` unchanged, in that order,
/// and returns the intrinsic's 64-byte result.
fn vperm(table: @Vector(64, u8), indices: @Vector(64, u8), fallback: @Vector(64, u8), mask: u64) @Vector(64, u8) {
    // The intrinsic is declared as an extern function inside an anonymous struct
    // so that it can be referenced by its quoted name and called immediately.
    return struct {
        extern fn @"llvm.x86.avx512.mask.permvar.qi.512"(@Vector(64, u8), @Vector(64, u8), @Vector(64, u8), u64) @Vector(64, u8);
    }.@"llvm.x86.avx512.mask.permvar.qi.512"(table, indices, fallback, mask);
}

/// Widens `value` to the vector type `T` by appending undefined lanes.
/// Returns `value` as-is when it already has as many lanes as `T`.
/// The padding element type is hard-coded to `u8`.
fn padWithUndefineds(T: type, value: anytype) T {
    // Number of lanes that must be appended to reach the length of `T`.
    const padding_len = @typeInfo(T).vector.len - @typeInfo(@TypeOf(value)).vector.len;
    return if (padding_len == 0) value else std.simd.join(value, @as(@Vector(padding_len, u8), @splat(undefined)));
}
```

LLVM version: ([Godbolt](https://llvm.godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(filename:'1',fontScale:14,fontUsePx:'0',j:1,lang:llvm,selection:(endColumn:1,endLineNumber:11,positionColumn:1,positionLineNumber:11,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:'define+dso_local+%3C16+x+i8%3E+@foo(%3C16+x+i8%3E+%250,+%3C16+x+i8%3E+%251,+%3C64+x+i8%3E+%252,+%3C64+x+i8%3E+%253)+local_unnamed_addr+%7B%0AEntry:%0A++%254+%3D+icmp+ugt+%3C16+x+i8%3E+%250,+%251%0A++%255+%3D+tail+call+%3C64+x+i8%3E+@llvm.x86.avx512.permvar.qi.512(%3C64+x+i8%3E+%252,+%3C64+x+i8%3E+%253)%0A++%256+%3D+shufflevector+%3C64+x+i8%3E+%255,+%3C64+x+i8%3E+poison,+%3C16+x+i32%3E+%3Ci32+0,+i32+1,+i32+2,+i32+3,+i32+4,+i32+5,+i32+6,+i32+7,+i32+8,+i32+9,+i32+10,+i32+11,+i32+12,+i32+13,+i32+14,+i32+15%3E%0A++%257+%3D+select+%3C16+x+i1%3E+%254,+%3C16+x+i8%3E+%256,+%3C16+x+i8%3E+%250%0A++ret+%3C16+x+i8%3E+%257%0A%7D%0A%0Adeclare+%3C64+x+i8%3E+@llvm.x86.avx512.permvar.qi.512(%3C64+x+i8%3E,+%3C64+x+i8%3E)+%231%0A'),l:'5',n:'1',o:'LLVM+IR+source+%231',t:'0')),k:49.75646879756469,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:irclangtrunk,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1',verboseDemangling:'0'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:llvm,libs:!(),options:'-O3+-march%3Dznver5+-target+x86_64-linux',overrides:!(),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+clang+(trunk)+(Editor+%231)',t:'0')),k:50.24353120243532,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4))

```llvm
; Repro: an unmasked 512-bit byte permute whose result is truncated to its
; first 16 bytes and then blended with %0 under the (%0 > %1) compare mask.
define dso_local <16 x i8> @foo(<16 x i8> %0, <16 x i8> %1, <64 x i8> %2, <64 x i8> %3) local_unnamed_addr {
Entry:
  ; 16-lane mask: unsigned %0 > %1
  %4 = icmp ugt <16 x i8> %0, %1
  ; full-width permute through the unmasked intrinsic
  %5 = tail call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %2, <64 x i8> %3)
  ; keep only elements 0..15 of the permute result
  %6 = shufflevector <64 x i8> %5, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; take the permuted byte where the mask is set, otherwise the byte from %0
  %7 = select <16 x i1> %4, <16 x i8> %6, <16 x i8> %0
  ret <16 x i8> %7
}

declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) #1
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AVX-512] Merge `vmovdq/vpblendm` on `xmm` registers into previous maskable instruction operating on `zmm` registers #113400

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[AVX-512] Merge vmovdq/vpblendm on xmm registers into previous maskable instruction operating on zmm registers #113400

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

[AVX-512] Merge `vmovdq/vpblendm` on `xmm` registers into previous maskable instruction operating on `zmm` registers #113400