diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index 8d848696bab87..9e543b844ad76 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -413,6 +413,12 @@ class TargetLibraryInfo { case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinl: case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl: case LibFunc_tan: case LibFunc_tanf: case LibFunc_tanl: + case LibFunc_asin: case LibFunc_asinf: case LibFunc_asinl: + case LibFunc_acos: case LibFunc_acosf: case LibFunc_acosl: + case LibFunc_atan: case LibFunc_atanf: case LibFunc_atanl: + case LibFunc_sinh: case LibFunc_sinhf: case LibFunc_sinhl: + case LibFunc_cosh: case LibFunc_coshf: case LibFunc_coshl: + case LibFunc_tanh: case LibFunc_tanhf: case LibFunc_tanhl: case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl: case LibFunc_sqrt_finite: case LibFunc_sqrtf_finite: case LibFunc_sqrtl_finite: diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 11b07ac0b7fc4..90eef93a2a54d 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -160,12 +160,18 @@ class TargetTransformInfoImplBase { // These will all likely lower to a single selection DAG node. // clang-format off if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" || - Name == "fabs" || Name == "fabsf" || Name == "fabsl" || - Name == "fmin" || Name == "fminf" || Name == "fminl" || - Name == "fmax" || Name == "fmaxf" || Name == "fmaxl" || - Name == "sin" || Name == "sinf" || Name == "sinl" || - Name == "cos" || Name == "cosf" || Name == "cosl" || - Name == "tan" || Name == "tanf" || Name == "tanl" || + Name == "fabs" || Name == "fabsf" || Name == "fabsl" || + Name == "fmin" || Name == "fminf" || Name == "fminl" || + Name == "fmax" || Name == "fmaxf" || Name == "fmaxl" || + Name == "sin" || Name == "sinf" || Name == "sinl" || + Name == "cos" || Name == "cosf" || Name == "cosl" || + Name == "tan" || Name == "tanf" || Name == "tanl" || + Name == "asin" || Name == "asinf" || Name == "asinl" || + Name == "acos" || Name == "acosf" || Name == "acosl" || + Name == "atan" || Name == "atanf" || Name == "atanl" || + Name == "sinh" || Name == "sinhf" || Name == "sinhl" || + Name == "cosh" || Name == "coshf" || Name == "coshl" || + Name == "tanh" || Name == "tanhf" || Name == "tanhl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl") return false; // clang-format on diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 533fe62fb8cdd..8f35fd5eb5268 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4132,6 +4132,30 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB, case LibFunc_tanf: case LibFunc_tanl: return Intrinsic::tan; + case LibFunc_asin: + case LibFunc_asinf: + case LibFunc_asinl: + return Intrinsic::asin; + case LibFunc_acos: + case LibFunc_acosf: + case LibFunc_acosl: + return Intrinsic::acos; + case LibFunc_atan: + case LibFunc_atanf: + case LibFunc_atanl: + return Intrinsic::atan; + case LibFunc_sinh: + case LibFunc_sinhf: + case LibFunc_sinhl: + return Intrinsic::sinh; + case LibFunc_cosh: + case LibFunc_coshf: + case LibFunc_coshl: + return Intrinsic::cosh; + case LibFunc_tanh: + case LibFunc_tanhf: + case LibFunc_tanhl: + return Intrinsic::tanh; case LibFunc_exp: case LibFunc_expf: case LibFunc_expl: diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll index 32a6950e20510..4acc7fe7eaccf 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll @@ -398,10 +398,10 @@ for.end: define void @acos_f64(ptr nocapture %varray) { ; CHECK-LABEL: @acos_f64( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x double> @llvm.acos.v2f64(<2 x double> [[TMP4:%.*]]) -; CHECK-VF4-NOT:[[TMP5:%.*]] = call <4 x double> @llvm.acos.v4f64(<4 x double> [[TMP4:%.*]]) -; CHECK-VF8-NOT:[[TMP5:%.*]] = call <8 x double> @llvm.acos.v8f64(<8 x double> [[TMP4:%.*]]) -; CHECK-VF16-NOT:[[TMP5:%.*]] = call <16 x double> @llvm.acos.v16f64(<16 x double> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @llvm.acos.v2f64(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.acos.v4f64(<4 x double> [[TMP4:%.*]]) +; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.acos.v8f64(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.acos.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -424,7 +424,7 @@ for.end: define void @acos_f32(ptr nocapture %varray) { ; CHECK-LABEL: @acos_f32( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_acosf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_acosf(<16 x float> [[TMP4:%.*]]) @@ -502,10 +502,10 @@ for.end: define void @asin_f64(ptr nocapture %varray) { ; CHECK-LABEL: @asin_f64( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x double> @llvm.asin.v2f64(<2 x double> [[TMP4:%.*]]) -; CHECK-VF4-NOT:[[TMP5:%.*]] = call <4 x double> @llvm.asin.v4f64(<4 x double> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @llvm.asin.v2f64(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.asin.v4f64(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]]) -; CHECK-VF16-NOT:[[TMP5:%.*]] = call <16 x double> @llvm.asin.v16f64(<16 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.asin.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -528,7 +528,7 @@ for.end: define void @asin_f32(ptr nocapture %varray) { ; CHECK-LABEL: @asin_f32( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_asinf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]]) @@ -609,7 +609,7 @@ define void @atan_f64(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]]) -; CHECK-VF16-NOT:[[TMP5:%.*]] = call <16 x double> @llvm.atan.v16f64(<16 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.atan.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -632,7 +632,7 @@ for.end: define void @atan_f32(ptr nocapture %varray) { ; CHECK-LABEL: @atan_f32( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_atanf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]]) @@ -710,10 +710,10 @@ for.end: define void @sinh_f64(ptr nocapture %varray) { ; CHECK-LABEL: @sinh_f64( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x double> @llvm.sinh.v2f64(<2 x double> [[TMP4:%.*]]) -; CHECK-VF4-NOT:[[TMP5:%.*]] = call <4 x double> @llvm.sinh.v4f64(<4 x double> [[TMP4:%.*]]) -; CHECK-VF8-NOT:[[TMP5:%.*]] = call <8 x double> @llvm.sinh.v8f64(<8 x double> [[TMP4:%.*]]) -; CHECK-VF16-NOT:[[TMP5:%.*]] = call <16 x double> @llvm.sinh.v16f64(<16 x double> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @llvm.sinh.v2f64(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.sinh.v4f64(<4 x double> [[TMP4:%.*]]) +; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.sinh.v8f64(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.sinh.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -736,10 +736,10 @@ for.end: define void @sinh_f32(ptr nocapture %varray) { ; CHECK-LABEL: @sinh_f32( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP4:%.*]]) -; CHECK-VF4-NOT:[[TMP5:%.*]] = call <4 x float> @llvm.sinh.v4f32(<4 x float> [[TMP4:%.*]]) -; CHECK-VF8-NOT:[[TMP5:%.*]] = call <8 x float> @llvm.sinh.v8f32(<8 x float> [[TMP4:%.*]]) -; CHECK-VF16-NOT:[[TMP5:%.*]] = call <16 x float> @llvm.sinh.v16f32(<16 x float> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @llvm.sinh.v4f32(<4 x float> [[TMP4:%.*]]) +; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @llvm.sinh.v8f32(<8 x float> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @llvm.sinh.v16f32(<16 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -815,9 +815,9 @@ for.end: define void @cosh_f64(ptr nocapture %varray) { ; CHECK-LABEL: @cosh_f64( ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_cosh(<2 x double> [[TMP4:%.*]]) -; CHECK-VF4-NOT:[[TMP5:%.*]] = call <4 x double> @llvm.cosh.v4f64(<4 x double> [[TMP4:%.*]]) -; CHECK-VF8-NOT:[[TMP5:%.*]] = call <8 x double> @llvm.cosh.v8f64(<8 x double> [[TMP4:%.*]]) -; CHECK-VF16-NOT:[[TMP5:%.*]] = call <16 x double> @llvm.cosh.v16f64(<16 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.cosh.v4f64(<4 x double> [[TMP4:%.*]]) +; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.cosh.v8f64(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.cosh.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -840,10 +840,10 @@ for.end: define void @cosh_f32(ptr nocapture %varray) { ; CHECK-LABEL: @cosh_f32( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_coshf(<8 x float> [[TMP4:%.*]]) -; CHECK-VF16-NOT:[[TMP5:%.*]] = call <16 x float> @llvm.cosh.v16f32(<16 x float> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @llvm.cosh.v16f32(<16 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -918,10 +918,10 @@ for.end: define void @tanh_f64(ptr nocapture %varray) { ; CHECK-LABEL: @tanh_f64( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x double> @llvm.tanh.v2f64(<2 x double> [[TMP4:%.*]]) -; CHECK-VF4-NOT:[[TMP5:%.*]] = call <4 x double> @llvm.tanh.v4f64(<4 x double> [[TMP4:%.*]]) -; CHECK-VF8-NOT:[[TMP5:%.*]] = call <8 x double> @llvm.tanh.v8f64(<8 x double> [[TMP4:%.*]]) -; CHECK-VF16-NOT:[[TMP5:%.*]] = call <16 x double> @llvm.tanh.v16f64(<16 x double> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @llvm.tanh.v2f64(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.tanh.v4f64(<4 x double> [[TMP4:%.*]]) +; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.tanh.v8f64(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.tanh.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -944,7 +944,7 @@ for.end: define void @tanh_f32(ptr nocapture %varray) { ; CHECK-LABEL: @tanh_f32( -; CHECK-VF2-NOT:[[TMP5:%.*]] = call <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP4:%.*]]) +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_tanhf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanhf(<16 x float> [[TMP4:%.*]]) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll index da827a5b674d8..40ce26f8c4ed4 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -587,13 +587,11 @@ define <4 x float> @asin_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @asinf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @asinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -666,13 +664,11 @@ define <4 x float> @acos_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -745,13 +741,11 @@ define <4 x float> @atan_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -824,13 +818,11 @@ define <4 x float> @sinh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -903,13 +895,11 @@ define <4 x float> @cosh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @coshf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @coshf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -982,13 +972,11 @@ define <4 x float> @tanh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll index 62b8c0ce3291a..285b85ed69be6 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -587,13 +587,11 @@ define <4 x float> @asin_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @asinf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @asinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -666,13 +664,11 @@ define <4 x float> @acos_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -745,13 +741,11 @@ define <4 x float> @atan_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -824,13 +818,11 @@ define <4 x float> @sinh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -903,13 +895,11 @@ define <4 x float> @cosh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @coshf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @coshf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -982,13 +972,11 @@ define <4 x float> @tanh_4x(ptr %a) { ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]]) -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 -; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, ptr %a, align 16