diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 357f526d5e308..50abab7b11def 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -818,6 +818,11 @@ def FeatureUseFixedOverScalableIfEqualCost : SubtargetFeature<"use-fixed-over-sc def FeatureAvoidLDAPUR : SubtargetFeature<"avoid-ldapur", "AvoidLDAPUR", "true", "Prefer add+ldapr to offset ldapur">; +// Some INC/DEC forms have better latency and throughput than ADDVL. +def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl", + "HasDisableFastIncVL", "true", + "Do not prefer INC/DEC (all pattern, mul #1/#2/#4) over ADDVL">; + //===----------------------------------------------------------------------===// // Architectures. // diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f291589e04c6b..8eeca5bdb87c5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -385,6 +385,8 @@ def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">; def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">; +def HasFastIncVL : Predicate<"!Subtarget->hasDisableFastIncVL()">; + def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index a2f326c994c2f..3e03bb261bf68 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2677,6 +2677,29 @@ let Predicates = [HasSVE_or_SME] in { (DECD_ZPiI ZPR:$op, 31, $imm)>; } + // Some INCB/DECB forms have better latency and throughput than ADDVL, so we + // prefer using them here. + // We could extend this to other INC/DEC (scalar) instructions.
+ let Predicates = [HasSVE_or_SME, UseScalarIncVL, HasFastIncVL], AddedComplexity = 6 in { + foreach imm = [ 1, 2, 4 ] in { + def : Pat<(add GPR64:$op, (vscale !mul(imm, 16))), + (INCB_XPiI GPR64:$op, 31, imm)>; + + def : Pat<(add GPR32:$op, (i32 (trunc (vscale !mul(imm, 16))))), + (EXTRACT_SUBREG (INCB_XPiI (INSERT_SUBREG (IMPLICIT_DEF), + GPR32:$op, sub_32), 31, imm), + sub_32)>; + + def : Pat<(add GPR64:$op, (vscale !mul(imm, -16))), + (DECB_XPiI GPR64:$op, 31, imm)>; + + def : Pat<(add GPR32:$op, (i32 (trunc (vscale !mul(imm, -16))))), + (EXTRACT_SUBREG (DECB_XPiI (INSERT_SUBREG (IMPLICIT_DEF), + GPR32:$op, sub_32), 31, imm), + sub_32)>; + } + } + let Predicates = [HasSVE_or_SME, UseScalarIncVL], AddedComplexity = 5 in { def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))), (ADDVL_XXI GPR64:$op, $imm)>; diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll index f49bb910b5bd1..99c65b090adb0 100644 --- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll +++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll @@ -65,7 +65,8 @@ define void @quux() #1 { ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x10, x29, #104 ; CHECK-NEXT: stur x9, [x10, #-256] // 8-byte Folded Spill -; CHECK-NEXT: addvl x9, x8, #1 +; CHECK-NEXT: mov x9, x8 +; CHECK-NEXT: incb x9 ; CHECK-NEXT: mov w0, w9 ; CHECK-NEXT: // implicit-def: $x9 ; CHECK-NEXT: mov w9, w0 @@ -160,7 +161,8 @@ define void @quux() #1 { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: subs x9, x9, #16 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: addvl x9, x8, #2 +; CHECK-NEXT: mov x9, x8 +; CHECK-NEXT: incb x9, all, mul #2 ; CHECK-NEXT: mov w0, w9 ; CHECK-NEXT: // implicit-def: $x9 ; CHECK-NEXT: mov w9, w0 diff --git a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll index 78f93f1ecbb26..d94fa6433bb7f 100644 --- a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll +++ b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll @@ -85,7 +85,7 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float ; CHECK-NEXT: ldr z5, [x4, #3, mul vl] ; CHECK-NEXT: fmla z4.h, p0/m, z5.h, z3.h ; CHECK-NEXT: str z4, [x16, #3, mul vl] -; CHECK-NEXT: addvl x16, x16, #4 +; CHECK-NEXT: incb x16, all, mul #4 ; CHECK-NEXT: cmp x16, x11 ; CHECK-NEXT: b.lo .LBB0_4 ; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll index a6c0e5aa70583..0a9f4948a5f77 100644 --- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_SCALAR_INC ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -sve-use-scalar-inc-vl=true -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,disable-fast-inc-vl -sve-use-scalar-inc-vl=true -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_FAST_INC ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -sve-use-scalar-inc-vl=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_SCALAR_INC +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2,disable-fast-inc-vl -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_FAST_INC define <vscale x 8 x i16> 
@inch_vec(<vscale x 8 x i16> %a) { ; NO_SCALAR_INC-LABEL: inch_vec: @@ -14,6 +16,11 @@ define <vscale x 8 x i16> @inch_vec(<vscale x 8 x i16> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: inch z0.h ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: inch_vec: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: inch z0.h +; NO_FAST_INC-NEXT: ret %vscale = call i16 @llvm.vscale.i16() %mul = mul i16 %vscale, 8 %vl = insertelement <vscale x 8 x i16> poison, i16 %mul, i32 0 @@ -32,6 +39,11 @@ define <vscale x 4 x i32> @incw_vec(<vscale x 4 x i32> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: incw z0.s ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: incw_vec: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: incw z0.s +; NO_FAST_INC-NEXT: ret %vscale = call i32 @llvm.vscale.i32() %mul = mul i32 %vscale, 4 %vl = insertelement <vscale x 4 x i32> poison, i32 %mul, i32 0 @@ -50,6 +62,11 @@ define <vscale x 2 x i64> @incd_vec(<vscale x 2 x i64> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: incd z0.d ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: incd_vec: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: incd z0.d +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 2 %vl = insertelement <vscale x 2 x i64> poison, i64 %mul, i32 0 @@ -68,6 +85,11 @@ define <vscale x 8 x i16> @dech_vec(<vscale x 8 x i16> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: dech z0.h, all, mul #2 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: dech_vec: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: dech z0.h, all, mul #2 +; NO_FAST_INC-NEXT: ret %vscale = call i16 @llvm.vscale.i16() %mul = mul i16 %vscale, 16 %vl = insertelement <vscale x 8 x i16> poison, i16 %mul, i32 0 @@ -86,6 +108,11 @@ define <vscale x 4 x i32> @decw_vec(<vscale x 4 x i32> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: decw z0.s, all, mul #4 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: decw_vec: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: decw z0.s, all, mul #4 +; NO_FAST_INC-NEXT: ret %vscale = call i32 @llvm.vscale.i32() %mul = mul i32 %vscale, 16 %vl = insertelement <vscale x 4 x i32> poison, i32 %mul, i32 0 @@ -104,6 +131,11 @@ define <vscale x 2 x i64> @decd_vec(<vscale x 2 x i64> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: decd z0.d, all, mul #8 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: decd_vec: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: decd z0.d, all, mul #8 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 16 %vl = insertelement <vscale x 2 x i64> poison, i64 %mul, i32 0 @@ -123,8 +155,13 @@ define i64 @incb_scalar_i64(i64 %a) { ; ; CHECK-LABEL: incb_scalar_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x0, x0, #1 +; CHECK-NEXT: incb x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: incb_scalar_i64: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: addvl x0, x0, #1 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 16 %add = add i64 %a, %mul @@ -142,6 +179,11 @@ define i64 @inch_scalar_i64(i64 %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: inch x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: inch_scalar_i64: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: inch x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 8 %add = add i64 %a, %mul @@ -159,6 +201,11 @@ define i64 @incw_scalar_i64(i64 %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: incw x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: incw_scalar_i64: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: incw x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 4 %add = add i64 %a, %mul @@ -176,6 +223,11 @@ define i64 
@incd_scalar_i64(i64 %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: incd x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: incd_scalar_i64: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: incd x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 2 %add = add i64 %a, %mul @@ -193,8 +245,13 @@ define i64 @decb_scalar_i64(i64 %a) { ; ; CHECK-LABEL: decb_scalar_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x0, x0, #-2 +; CHECK-NEXT: decb x0, all, mul #2 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: decb_scalar_i64: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: addvl x0, x0, #-2 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 32 %sub = sub i64 %a, %mul @@ -212,6 +269,11 @@ define i64 @dech_scalar_i64(i64 %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: dech x0, all, mul #3 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: dech_scalar_i64: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: dech x0, all, mul #3 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 24 %sub = sub i64 %a, %mul @@ -229,6 +291,11 @@ define i64 @decw_scalar_i64(i64 %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: decw x0, all, mul #3 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: decw_scalar_i64: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: decw x0, all, mul #3 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 12 %sub = sub i64 %a, %mul @@ -246,6 +313,11 @@ define i64 @decd_scalar_i64(i64 %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: decd x0, all, mul #3 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: decd_scalar_i64: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: decd x0, all, mul #3 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 6 %sub = sub i64 %a, %mul @@ -267,6 +339,13 @@ define i32 @incb_scalar_i32(i32 %a) { ; CHECK-NEXT: addvl x0, x0, #3 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: incb_scalar_i32: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0 +; NO_FAST_INC-NEXT: addvl x0, x0, #3 +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 48 @@ -288,6 +367,13 @@ define i32 @inch_scalar_i32(i32 %a) { ; CHECK-NEXT: inch x0, all, mul #7 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: inch_scalar_i32: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0 +; NO_FAST_INC-NEXT: inch x0, all, mul #7 +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 56 @@ -309,6 +395,13 @@ define i32 @incw_scalar_i32(i32 %a) { ; CHECK-NEXT: incw x0, all, mul #7 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: incw_scalar_i32: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0 +; NO_FAST_INC-NEXT: incw x0, all, mul #7 +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 28 @@ -330,6 +423,13 @@ define i32 @incd_scalar_i32(i32 %a) { ; CHECK-NEXT: incd x0, all, mul #7 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: incd_scalar_i32: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0 +; NO_FAST_INC-NEXT: incd x0, all, mul #7 +; 
NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 14 @@ -350,9 +450,16 @@ define i32 @decb_scalar_i32(i32 %a) { ; CHECK-LABEL: decb_scalar_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: addvl x0, x0, #-4 +; CHECK-NEXT: decb x0, all, mul #4 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: decb_scalar_i32: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0 +; NO_FAST_INC-NEXT: addvl x0, x0, #-4 +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 64 @@ -374,6 +481,13 @@ define i32 @dech_scalar_i32(i32 %a) { ; CHECK-NEXT: dech x0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: dech_scalar_i32: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0 +; NO_FAST_INC-NEXT: dech x0 +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 8 @@ -395,6 +509,13 @@ define i32 @decw_scalar_i32(i32 %a) { ; CHECK-NEXT: decw x0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: decw_scalar_i32: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0 +; NO_FAST_INC-NEXT: decw x0 +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 4 @@ -416,6 +537,13 @@ define i32 @decd_scalar_i32(i32 %a) { ; CHECK-NEXT: decd x0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret +; +; NO_FAST_INC-LABEL: decd_scalar_i32: +; NO_FAST_INC: // %bb.0: +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 def $x0 +; NO_FAST_INC-NEXT: decd x0 +; NO_FAST_INC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; NO_FAST_INC-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 2 %vl = trunc i64 %mul to i32 diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll index 39ee4510d51b4..abb5f2aa07fc8 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ld1-single.ll @@ -33,8 +33,8 @@ define <vscale x 4 x i32> @test_svld1uwq_i32_si(<vscale x 1 x i1> %pred, ptr %ba define <vscale x 4 x i32> @test_svld1uwq_i32_out_of_bound(<vscale x 1 x i1> %pred, ptr %base) { ; CHECK-LABEL: test_svld1uwq_i32_out_of_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #2 -; CHECK-NEXT: ld1w { z0.q }, p0/z, [x8] +; CHECK-NEXT: incb x0, all, mul #2 +; CHECK-NEXT: ld1w { z0.q }, p0/z, [x0] ; CHECK-NEXT: ret %gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8 %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> %pred, ptr %gep) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll index 4ffc0b42d0711..d4c77328be478 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-st1-single.ll @@ -30,8 +30,8 @@ define void @test_svst1wq_i32_si(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred define void @test_svst1wq_i32_out_of_bound(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %base) { 
; CHECK-LABEL: test_svst1wq_i32_out_of_bound: ; CHECK: // %bb.0: -; CHECK-NEXT: addvl x8, x0, #2 -; CHECK-NEXT: st1w { z0.q }, p0, [x8] +; CHECK-NEXT: incb x0, all, mul #2 +; CHECK-NEXT: st1w { z0.q }, p0, [x0] ; CHECK-NEXT: ret %gep = getelementptr inbounds <vscale x 1 x i32>, ptr %base, i64 8 call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> %zt, <vscale x 1 x i1> %pred, ptr %gep) diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll index bae69ef590f52..aa954aeb0ad07 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll @@ -23,7 +23,7 @@ define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 { ; COMMON-NEXT: umax z1.b, p0/m, z1.b, z3.b ; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b ; COMMON-NEXT: st1b { z0.b }, p0, [x1, x8] -; COMMON-NEXT: addvl x8, x8, #1 +; COMMON-NEXT: incb x8 ; COMMON-NEXT: cmp x8, x2 ; COMMON-NEXT: b.lo .LBB0_1 ; COMMON-NEXT: // %bb.2: // %for.exit @@ -71,13 +71,13 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i ; COMMON-NEXT: ld1b { z1.b }, p0/z, [x0, x2] ; COMMON-NEXT: ldr z2, [x0, #1, mul vl] ; COMMON-NEXT: ldr z3, [x8, #1, mul vl] +; COMMON-NEXT: incb x0, all, mul #2 ; COMMON-NEXT: subs x3, x3, #1 -; COMMON-NEXT: addvl x0, x0, #2 ; COMMON-NEXT: add z0.b, z0.b, z1.b ; COMMON-NEXT: add z1.b, z2.b, z3.b ; COMMON-NEXT: st1b { z0.h }, p1, [x1] ; COMMON-NEXT: st1b { z1.h }, p1, [x1, #1, mul vl] -; COMMON-NEXT: addvl x1, x1, #2 +; COMMON-NEXT: incb x1, all, mul #2 ; COMMON-NEXT: b.ne .LBB1_1 ; COMMON-NEXT: // %bb.2: // %for.exit ; COMMON-NEXT: ret @@ -156,56 +156,56 @@ for.exit: define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) #0 { ; BASE-LABEL: mixed_offsets_scalable_then_fixed: ; BASE: // %bb.0: // %entry +; BASE-NEXT: incb x0, all, mul #4 ; BASE-NEXT: ptrue p0.s -; BASE-NEXT: addvl x8, x0, #4 -; BASE-NEXT: mov x9, #8 // =0x8 +; BASE-NEXT: mov x8, #8 // =0x8 ; BASE-NEXT: .LBB3_1: // %for.body ; BASE-NEXT: // =>This Inner Loop Header: Depth=1 -; BASE-NEXT: ldr z0, [x8, #-4, mul vl] -; BASE-NEXT: ldr z1, [x8] +; BASE-NEXT: ldr z0, [x0, #-4, mul vl] +; BASE-NEXT: ldr z1, [x0] ; BASE-NEXT: decw x2 -; BASE-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2] -; BASE-NEXT: addvl x8, x8, #1 +; BASE-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; BASE-NEXT: incb x0 ; BASE-NEXT: add z0.s, z0.s, z1.s ; BASE-NEXT: add z0.s, z0.s, z2.s ; BASE-NEXT: str z0, [x1] -; BASE-NEXT: addvl x1, x1, #1 +; BASE-NEXT: incb x1 ; BASE-NEXT: cbnz x2, .LBB3_1 ; BASE-NEXT: // %bb.2: // %for.exit ; BASE-NEXT: ret ; ; PREINDEX-LABEL: mixed_offsets_scalable_then_fixed: ; PREINDEX: // %bb.0: // %entry +; PREINDEX-NEXT: incb x0, all, mul #4 ; PREINDEX-NEXT: ptrue p0.s -; PREINDEX-NEXT: addvl x8, x0, #4 -; PREINDEX-NEXT: mov x9, #8 // =0x8 +; PREINDEX-NEXT: mov x8, #8 // =0x8 ; PREINDEX-NEXT: .LBB3_1: // %for.body ; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1 -; PREINDEX-NEXT: ldr z0, [x8, #-4, mul vl] -; PREINDEX-NEXT: ldr z1, [x8] +; PREINDEX-NEXT: ldr z0, [x0, #-4, mul vl] +; PREINDEX-NEXT: ldr z1, [x0] ; PREINDEX-NEXT: decw x2 -; PREINDEX-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2] -; PREINDEX-NEXT: addvl x8, x8, #1 +; PREINDEX-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; PREINDEX-NEXT: incb x0 ; PREINDEX-NEXT: add z0.s, z0.s, z1.s ; PREINDEX-NEXT: add z0.s, z0.s, z2.s ; PREINDEX-NEXT: str z0, [x1] -; PREINDEX-NEXT: addvl x1, x1, 
#1 +; PREINDEX-NEXT: incb x1 ; PREINDEX-NEXT: cbnz x2, .LBB3_1 ; PREINDEX-NEXT: // %bb.2: // %for.exit ; PREINDEX-NEXT: ret ; ; POSTINDEX-LABEL: mixed_offsets_scalable_then_fixed: ; POSTINDEX: // %bb.0: // %entry +; POSTINDEX-NEXT: incb x0, all, mul #4 ; POSTINDEX-NEXT: ptrue p0.s ; POSTINDEX-NEXT: mov x8, xzr -; POSTINDEX-NEXT: addvl x9, x0, #4 -; POSTINDEX-NEXT: mov x10, #8 // =0x8 +; POSTINDEX-NEXT: mov x9, #8 // =0x8 ; POSTINDEX-NEXT: .LBB3_1: // %for.body ; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1 -; POSTINDEX-NEXT: ldr z0, [x9, #-4, mul vl] -; POSTINDEX-NEXT: ldr z1, [x9] -; POSTINDEX-NEXT: ld1w { z2.s }, p0/z, [x9, x10, lsl #2] -; POSTINDEX-NEXT: addvl x9, x9, #1 +; POSTINDEX-NEXT: ldr z0, [x0, #-4, mul vl] +; POSTINDEX-NEXT: ldr z1, [x0] +; POSTINDEX-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; POSTINDEX-NEXT: incb x0 ; POSTINDEX-NEXT: add z0.s, z0.s, z1.s ; POSTINDEX-NEXT: add z0.s, z0.s, z2.s ; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] @@ -243,11 +243,12 @@ for.exit: define void @mixed_offsets_fixed_then_scalable(ptr %src, ptr %dst, i64 %count) #0 { ; COMMON-LABEL: mixed_offsets_fixed_then_scalable: ; COMMON: // %bb.0: // %entry -; COMMON-NEXT: addvl x9, x0, #4 +; COMMON-NEXT: mov x9, x0 ; COMMON-NEXT: ptrue p0.s ; COMMON-NEXT: mov x8, xzr -; COMMON-NEXT: add x9, x9, #32 +; COMMON-NEXT: incb x9, all, mul #4 ; COMMON-NEXT: mov x10, #8 // =0x8 +; COMMON-NEXT: add x9, x9, #32 ; COMMON-NEXT: .LBB4_1: // %for.body ; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 ; COMMON-NEXT: add x11, x0, x8, lsl #2 @@ -304,11 +305,11 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 { ; BASE-NEXT: ldr z1, [x0, #4, mul vl] ; BASE-NEXT: decw x2 ; BASE-NEXT: ldr z2, [x0, #8, mul vl] -; BASE-NEXT: addvl x0, x0, #1 +; BASE-NEXT: incb x0 ; BASE-NEXT: add z0.s, z0.s, z1.s ; BASE-NEXT: add z0.s, z0.s, z2.s ; BASE-NEXT: str z0, [x1] -; BASE-NEXT: addvl x1, x1, #1 +; BASE-NEXT: incb x1 ; BASE-NEXT: cbnz x2, .LBB5_1 ; BASE-NEXT: // %bb.2: // %for.exit ; BASE-NEXT: ret @@ -321,11 +322,11 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 { ; PREINDEX-NEXT: ldr z1, [x0, #4, mul vl] ; PREINDEX-NEXT: decw x2 ; PREINDEX-NEXT: ldr z2, [x0, #8, mul vl] -; PREINDEX-NEXT: addvl x0, x0, #1 +; PREINDEX-NEXT: incb x0 ; PREINDEX-NEXT: add z0.s, z0.s, z1.s ; PREINDEX-NEXT: add z0.s, z0.s, z2.s ; PREINDEX-NEXT: str z0, [x1] -; PREINDEX-NEXT: addvl x1, x1, #1 +; PREINDEX-NEXT: incb x1 ; PREINDEX-NEXT: cbnz x2, .LBB5_1 ; PREINDEX-NEXT: // %bb.2: // %for.exit ; PREINDEX-NEXT: ret @@ -339,7 +340,7 @@ define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 { ; POSTINDEX-NEXT: ldr z0, [x0] ; POSTINDEX-NEXT: ldr z1, [x0, #4, mul vl] ; POSTINDEX-NEXT: ldr z2, [x0, #8, mul vl] -; POSTINDEX-NEXT: addvl x0, x0, #1 +; POSTINDEX-NEXT: incb x0 ; POSTINDEX-NEXT: add z0.s, z0.s, z1.s ; POSTINDEX-NEXT: add z0.s, z0.s, z2.s ; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] @@ -396,9 +397,9 @@ define void @vscale_squared_offset(ptr %alloc) #0 { ; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 ; COMMON-NEXT: add x11, x0, x9 ; COMMON-NEXT: st1w { z0.s }, p0, [x0] -; COMMON-NEXT: add x8, x8, #1 +; COMMON-NEXT: incb x0 ; COMMON-NEXT: st1w { z1.s }, p0, [x11] -; COMMON-NEXT: addvl x0, x0, #1 +; COMMON-NEXT: add x8, x8, #1 ; COMMON-NEXT: cmp x8, x10 ; COMMON-NEXT: b.lt .LBB6_1 ; COMMON-NEXT: .LBB6_2: // %for.exit
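Reviewer note: a minimal sketch (not part of the patch itself) of the behavioural difference the new patterns introduce, mirroring the existing incb_scalar_i64 coverage in sve-vl-arith.ll. The function name below is illustrative only; the llc invocations reuse the flag spellings from the RUN lines above.

; Adds one full SVE vector length (vscale * 16 bytes) to a scalar value.
declare i64 @llvm.vscale.i64()

define i64 @bump_by_one_vl(i64 %a) {
  %vscale = call i64 @llvm.vscale.i64()
  %mul = mul i64 %vscale, 16   ; 16 bytes per vscale unit = one vector length
  %add = add i64 %a, %mul
  ret i64 %add
}

; Default (INC/DEC assumed fast), matching the CHECK lines above:
;   llc -mtriple=aarch64-linux-gnu -mattr=+sve -sve-use-scalar-inc-vl=true
;     => incb x0
; With the opt-out feature added by this patch, matching the NO_FAST_INC lines:
;   llc -mtriple=aarch64-linux-gnu -mattr=+sve,disable-fast-inc-vl -sve-use-scalar-inc-vl=true
;     => addvl x0, x0, #1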