@@ -593,104 +593,100 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-LABEL: load_i16_stride5_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
- ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
- ; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
+ ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
- ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
- ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
+ ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
- ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+ ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
- ; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
+ ; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
- ; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
+ ; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
- ; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
- ; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+ ; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
+ ; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
; AVX512BW-NEXT: vmovq %xmm4, (%r8)
- ; AVX512BW-NEXT: vmovq %xmm2, (%r9)
+ ; AVX512BW-NEXT: vmovq %xmm1, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
- ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
- ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+ ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+ ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
- ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
- ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+ ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
- ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+ ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
- ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
+ ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
- ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
+ ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
- ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
- ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi)
+ ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
+ ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
- ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
+ ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
- ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
- ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+ ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
+ ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
- ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
- ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
+ ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
- ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+ ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
- ; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
+ ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
- ; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
+ ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
- ; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
- ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
+ ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
+ ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
- ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
+ ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
- ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
- ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
- ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
- ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
- ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+ ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi)
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9)
+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <20 x i16>, ptr %in.vec, align 64